In [2]:
#Initializations
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split
import torch

  from pandas.core import (


In [5]:
# Ask PyTorch whether a CUDA device is usable; the boolean is shown as the cell output.
torch.cuda.is_available()

False

In [None]:
from collections import Counter


def read_emotion_file(path):
    """Read a ``sentence;emotion`` file into two parallel lists.

    Args:
        path: Path of a text file holding one ``sentence;emotion`` pair per line.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists of stripped strings.

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator.
    """
    texts, labels = [], []
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line at end of file).
                continue
            # Split on the LAST ';' so a sentence that itself contains ';'
            # cannot break the sentence/label separation.
            sentence, separator, emotion = line.rpartition(';')
            if not separator:
                raise ValueError(
                    f"{path}:{line_number}: expected 'sentence;emotion', got {line!r}"
                )
            texts.append(sentence.strip())
            labels.append(emotion.strip())
    return texts, labels


# Read training data and show its size and class balance
train_texts, train_labels = read_emotion_file('train.txt')

print(len(train_texts))
label_counts = Counter(train_labels)
print(label_counts)

# Read validation data
val_texts, val_labels = read_emotion_file('val.txt')

# Read testing data
testing_texts, testing_labels = read_emotion_file('test.txt')

print(f"Training data size: {len(train_texts)}")
print(f"Validation data size: {len(val_texts)}")
print(f"Testing data size: {len(testing_texts)}")

16000
Counter({'joy': 5362, 'sadness': 4666, 'anger': 2159, 'fear': 1937, 'love': 1304, 'surprise': 572})
Training data size: 16000
Validation data size: 2000
Testing data size: 2000


In [51]:
# Fine-tuning BERT for text classification
import torch
# Use the PyTorch optimizer: the AdamW class that used to live in `transformers`
# is deprecated there. Importing it under the same name keeps `AdamW` available.
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Seed PyTorch's global RNG before the model is created so that the freshly
# initialised classifier head and the batch shuffling are repeatable.
SEED = 42
torch.manual_seed(SEED)

# Convert string labels to integer labels first: the number of distinct
# classes found in the training labels sizes the classification head below.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
num_labels = len(label_encoder.classes_)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Tokenize texts and convert to PyTorch tensors
encoding = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Convert labels to tensor; force int64 because the loss computed by the model
# needs long class indices and the encoder's integer dtype can vary by platform.
labels_tensor = torch.tensor(train_labels_encoded, dtype=torch.long)

# Create a TensorDataset and DataLoader
dataset = TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels_tensor)
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Setup optimizer. eps and weight_decay are set explicitly to match the
# defaults of the transformers optimizer this replaces.
optimizer = AdamW(bert_model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Set the model to training mode
bert_model.train()

# Fine-tuning loop
epochs = 8  # Number of fine-tuning epochs
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Move data to the correct device (GPU or CPU)
        input_ids, attention_mask, label = batch
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)

        # Zero out gradients from previous step
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss as well
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss of this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [40]:
# BERT-derived feature vectors for text classification

def get_bert_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Run texts through ``model`` and return its ``logits`` as a feature matrix.

    Despite the name, the features are the model's ``outputs.logits`` (the
    classification-head outputs), not the hidden state of the [CLS] token.

    Args:
        texts: A sequence of strings. A single string is treated as one text
            instead of being iterated character by character.
        model: Callable model exposing ``eval()`` and returning an object with
            a ``logits`` tensor when called with the tokenizer output.
        tokenizer: Callable that turns a list of strings into a dict of tensors.
        device: Device the tokenized inputs are moved to before the forward pass.
        batch_size: Number of texts tokenized and forwarded together.

    Returns:
        A NumPy array with one row per input text, in input order. For empty
        input an array with zero rows is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    model.eval()  # Set model to evaluation mode
    if not texts:
        # np.vstack cannot stack an empty list; return an empty matrix whose
        # width is model.config.num_labels when the model exposes it.
        width = getattr(getattr(model, "config", None), "num_labels", 0)
        return np.empty((0, width), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # padding=True pads to the longest text of the batch; the tokenizer
            # output is forwarded unchanged to the model via **inputs.
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Move inputs to the selected device
            inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = model(**inputs)
            # Collect the classification-head outputs for this batch
            embeddings.append(outputs.logits.cpu().numpy())

    return np.vstack(embeddings)
    


In [42]:
# Build feature matrices for the training and validation texts
x_train_bert = get_bert_embeddings(train_texts, bert_model, tokenizer, device)
x_val_bert = get_bert_embeddings(val_texts, bert_model, tokenizer, device)

# Train an RBF-kernel SVM on the training features
# (probability=True is what makes predict_proba available further down)
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(x_train_bert, train_labels)

# Hard label predictions for the validation split
val_predictions = svm_model.predict(x_val_bert)

# Per-class precision / recall / F1 on the validation split
print("Classification Report on Validation Data (SVC RBF):")
print(classification_report(val_labels, val_predictions))

# Per-class probabilities; column i belongs to svm_model.classes_[i]
val_probabilities = svm_model.predict_proba(x_val_bert)

# One-vs-rest precision-recall curve per class, all drawn on a single axes
precision = {}
recall = {}
fig, ax = plt.subplots(figsize=(8, 6))

for class_index, class_name in enumerate(svm_model.classes_):
    # 1 where the true label is this class, 0 everywhere else
    is_this_class = [int(y == class_name) for y in val_labels]

    # Curve of this class, scored by the probability the SVM assigns to it
    precision[class_name], recall[class_name], _ = precision_recall_curve(
        is_this_class,
        val_probabilities[:, class_index],
    )

    ax.plot(recall[class_name], precision[class_name], label=f"Class {class_name}")

# Axis labels, title and legend
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve of SVC RBF Model")
ax.legend(loc="best")
plt.show()


KeyboardInterrupt: 

In [49]:
from sklearn.model_selection import GridSearchCV
import seaborn as sns

# Hyperparameter search space for the RBF-kernel SVM
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

# 3-fold cross-validated grid search scored by accuracy;
# refit=True retrains the best configuration on the full training features
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=True,
    verbose=2,
)

# Run the search on the training features
grid.fit(x_train_bert, train_labels)

# Collect the per-candidate results in a DataFrame
results = pd.DataFrame(grid.cv_results_)

# Heatmap of the mean cross-validation score for every (C, gamma) pair
score_table = results.pivot_table(index='param_C', columns='param_gamma', values='mean_test_score')
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(score_table, annot=True, cmap='viridis', ax=ax)
ax.set_title('Grid Search Mean Test Scores')
ax.set_xlabel('Gamma')
ax.set_ylabel('C')
plt.show()

# Keep the refitted best estimator found by the search
best_svm_model = grid.best_estimator_

# Predict the validation split with the best SVM model
val_predictions_best = best_svm_model.predict(x_val_bert)

# Report validation metrics of the best SVM model
print("Classification Report on Validation Data (Best SVM Model):")
print(classification_report(val_labels, val_predictions_best))


Fitting 3 folds for each of 16 candidates, totalling 48 fits


KeyboardInterrupt: 

In [47]:
# Show the five best-ranked parameter combinations recorded in `results`
summary_columns = ['param_C', 'param_gamma', 'mean_test_score', 'rank_test_score']
top_five = results[summary_columns].sort_values('rank_test_score').head(5)
print(top_five)

   param_C param_gamma  mean_test_score  rank_test_score
12     100           1         0.449231                1
8       10           1         0.442479                2
13     100         0.1         0.436581                3
4        1           1         0.436239                4
9       10         0.1         0.431966                5


In [30]:
# Print every validation text next to the label predicted for it.
# The loop variable is named `text` rather than `id`: a top-level `id` would
# shadow the builtin id() for every later cell run in the same kernel.
for text, prediction in zip(val_texts, val_predictions):
    # The "ID:" label is kept for output compatibility; the value shown is the input sentence.
    print(f"ID: {text} \nPrediction: {prediction}")

ID: i was up to my eyes and studying and feeling pretty jaded a href http maturestudenthanginginthere 
Prediction: joy
ID: i feel really petty complaining about panic attacks and such 
Prediction: sadness
ID: i friends its a feeling that runs under everything he is every dumbass word he says and moronic thing he does but its worst when hes with rukia 
Prediction: sadness
ID: i guess so walking around feeling cranky and mad 
Prediction: joy
ID: i continue to cruise along the expressway feeling shitty 
Prediction: sadness
ID: i pay attention it deepens into a feeling of being invaded and helpless 
Prediction: sadness
ID: i feel like i wouldnt have a longing if only we could have a baby and have that new experience together 
Prediction: joy
ID: i can feel the gap it feels like rich people status and poor people status 
Prediction: joy
ID: i started to feel like i was going mad as i was sure i could see stars floating in the water but whenever i went to grab one i came up with nothing 
Pre

In [23]:
def predict_emotion(text, bert_model, tokenizer, svm_model, device):
    """Predict the emotion label of a single text.

    The text is tokenized, passed through ``bert_model``, and the resulting
    ``logits`` are handed to ``svm_model.predict`` as the feature vector.

    Args:
        text: The input string to classify.
        bert_model: Model exposing ``eval()`` and returning an object with a
            ``logits`` tensor when called with the tokenizer output.
        tokenizer: Callable that turns ``text`` into a dict of tensors.
        svm_model: Fitted classifier exposing ``predict``.
        device: Device the tokenized inputs are moved to.

    Returns:
        The first element of ``svm_model.predict(...)``, i.e. the predicted label.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Switch to evaluation mode before inference: if the model is still in
    # training mode, dropout stays active and the same text can yield
    # different features on each call.
    bert_model.eval()

    # Compute the model's logits without tracking gradients
    with torch.no_grad():
        outputs = bert_model(**inputs)
        features = outputs.logits.cpu().numpy()

    # Predict emotion using the SVM model
    prediction = svm_model.predict(features)
    return prediction[0]

# Try the helper on one hand-written sentence
input_text = "I feel very happy today!"
predicted_emotion = predict_emotion(
    text=input_text,
    bert_model=bert_model,
    tokenizer=tokenizer,
    svm_model=svm_model,
    device=device,
)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: joy


In [36]:
# Interactive loop: classify each line the user types until they ask to stop
while True:
    user_input = input("Enter a text (or type 'exit' to stop): ")
    # Normalise before comparing so that "Exit", " exit " or a whitespace-only
    # line ends the loop instead of being sent to the classifier.
    command = user_input.strip().lower()
    if command in ('exit', ''):
        break
    predicted_emotion = predict_emotion(user_input, bert_model, tokenizer, svm_model, device)
    print(f"Text entered: {user_input}")
    print(f"Predicted Emotion: {predicted_emotion}")

Text entered: help
Predicted Emotion: joy
Text entered: p;
Predicted Emotion: joy
Text entered: i feel kinda appalled that she feels like she needs to explain in wide and lenghth her body measures etc pp.
Predicted Emotion: joy
