In [19]:
import re
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from nltk import PorterStemmer, word_tokenize
from nltk.corpus import stopwords
import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
train_data = pd.read_csv('data/wassa_train.csv')
test_data = pd.read_csv('data/wassa_test.csv')
val_data = pd.read_csv('data/wassa_val.csv')

In [3]:
train_data

Unnamed: 0,id,tweet,emotion
0,1929,Kinda wished I watched mischievous kiss before...,joy
1,4049,When you forget to mention you were bought dre...,anger
2,5405,@FreyaLynn @donniewahlberg yep! jimmy buffett ...,neutral
3,5900,"@philcampbell blue skies? where, it's still gr...",neutral
4,3712,Some moving clips on youtube tonight of the vi...,anger
...,...,...,...
5996,5407,@koifusionpdx i was so close to the tacos...on...,neutral
5997,496,@Fatumoriginal there's no sitcom better! If u ...,sadness
5998,499,@Mysteri759 @Ren102e906 @slb42jcb @swoozyqyah ...,sadness
5999,1784,Watch this amazing live.ly broadcast by @ittzd...,joy


In [4]:
def preprocess_text(text):
    result = text.lower()
    result = result.strip()
    result = re.result = re.sub(r"http\S+", "", result)
    result = re.sub('\S*@\S*\s?', '', result)
    result = unidecode.unidecode(result)
    stop_words = stopwords.words("english")
    word_list = word_tokenize(result)
    # # english stemmer
    # ps = SnowballStemmer("english")

    stemmed_sentence = ""
    for word in word_list:
        if word not in stop_words:
            stemmed_sentence += word
            stemmed_sentence += " "

    result = stemmed_sentence
    whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    result = ''.join(filter(whitelist.__contains__, result))
    result = ''.join([i for i in result if not i.isdigit()])
    return result

In [5]:

def preprocess_text_list(text_list):
  for i,text in enumerate(text_list):
    text_list[i] = preprocess_text(text_list[i])

In [6]:
train_texts = train_data["tweet"].tolist()
val_texts = val_data["tweet"].tolist()
test_texts = test_data["tweet"].tolist()


train_labels = train_data["emotion"].tolist()
val_labels = val_data["emotion"].tolist()
test_labels = test_data["emotion"].tolist()

train_labels[:10]
     

['joy',
 'anger',
 'neutral',
 'neutral',
 'anger',
 'joy',
 'neutral',
 'sadness',
 'sadness',
 'neutral']

In [7]:
preprocess_text_list(train_texts)
preprocess_text_list(val_texts)
preprocess_text_list(test_texts)

train_texts[:10]

['kinda wished watched mischievous kiss playful kiss ',
 'forget mention bought dreamboys tickets ',
 'yep  jimmy buffett ftw     jimmy needs tweet words wisdom every morning ',
 'blue skies   s still grey hazy window ',
 'moving clips youtube tonight vigil held tulsa metropolitan baptist church  terencecruther  justice  sadness ',
 'makes successful  happy forever ',
 'morning hon  breakfast  cant decide ',
 'standard candice starting show pout  startasyoumeantogoon  gbbo ',
 '   would frown bit  folding arms  why every time m need assistance someone expects lil   ',
 'someone make cofffeeeeeee  ']

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

In [13]:
X_train = tokenizer.texts_to_sequences(train_texts)
X_val = tokenizer.texts_to_sequences(val_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

X_train = pad_sequences(X_train)
X_val = pad_sequences(X_val, maxlen=X_train.shape[1])
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])


In [14]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['emotion'])
y_val = label_encoder.transform(val_data['emotion'])
y_test = label_encoder.transform(test_data['emotion'])

In [15]:
embedding_dim = 50
vocab_size = len(tokenizer.word_index) + 1

In [16]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=X_train.shape[1]))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

In [36]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2062887d190>

In [38]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 20.00%


In [39]:
new_texts = ["I'm so happy right now!", "This is really frustrating."]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded = pad_sequences(new_sequences, maxlen=X_train.shape[1])

predictions = model.predict(new_padded)
predicted_labels = label_encoder.inverse_transform([1 if prediction > 0.5 else 0 for prediction in predictions.flatten()])

print(f'Predictions: {predicted_labels}')

Predictions: ['fear' 'fear']


In [40]:
new_texts = ["Kinda wished I watched mischievous kiss before."]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded = pad_sequences(new_sequences, maxlen=X_train.shape[1])

predictions = model.predict(new_padded)
predicted_labels = label_encoder.inverse_transform([1 if prediction > 0.5 else 0 for prediction in predictions.flatten()])

print(f'Predictions: {predicted_labels}')

Predictions: ['fear']


# SVM


In [9]:
 vectorizer = TfidfVectorizer()
 vectorizer.fit(train_texts)

In [10]:
vectorized_train_texts = vectorizer.transform(train_texts)
vectorized_val_texts = vectorizer.transform(val_texts)
vectorized_test_texts = vectorizer.transform(test_texts)

In [11]:
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_labels)
encoded_val_labels = label_encoder.transform(val_labels)
encoded_test_labels = label_encoder.transform(test_labels)

In [12]:
svm_model = svm.SVC(probability=True)
svm_model.fit(vectorized_train_texts, encoded_train_labels)

In [13]:
y_pred = svm_model.predict_log_proba(vectorized_val_texts)
y_pred

array([[-5.86042706e+00, -2.62767516e+00, -3.65073746e-01,
        -5.45897634e+00, -1.48500868e+00],
       [-7.48842153e+00, -1.10583506e-02, -8.47690405e+00,
        -4.66847520e+00, -7.07840994e+00],
       [-1.95364877e+01, -6.32350164e-05, -1.71559992e+01,
        -9.72783860e+00, -1.25364272e+01],
       ...,
       [-7.09069420e-01, -2.50180770e+00, -3.07429488e+00,
        -1.29699370e+00, -2.24067394e+00],
       [-2.39121961e+00, -3.69999459e+00, -2.62587918e+00,
        -3.80513238e-01, -2.05672765e+00],
       [-5.07982431e+00, -5.64711392e+00, -3.69333977e+00,
        -1.01053520e-01, -2.78907721e+00]])

In [14]:
acc = accuracy_score(encoded_val_labels, np.argmax(y_pred, axis=1))
precision = precision_score(encoded_val_labels, np.argmax(y_pred, axis=1), average="micro")
recall = recall_score(encoded_val_labels, np.argmax(y_pred, axis=1), average="micro")
f1 = f1_score(encoded_val_labels, np.argmax(y_pred, axis=1), average="micro")
print(f"Model has acc = {acc}\n, precision = {precision}\n, recall = {recall}\n, f1_score = {f1}\n")

Model has acc = 0.8893333333333333
, precision = 0.8893333333333333
, recall = 0.8893333333333333
, f1_score = 0.8893333333333333



In [15]:
y_pred = svm_model.predict(vectorized_test_texts)

In [16]:
emotions = [
    "I hate getting up, it's so annoying.",
    "I love cats and their sweet noses.",
    "I put up with my schoolmates.",
    "This movie made me feel so happy!",
    "The traffic jam today was frustrating.",
    "Spending time with friends always brings joy.",
    "I feel lonely when I'm alone at home.",
    "Excited about the upcoming vacation!",
    "I love spending time with friends and family. It brings me so much joy!",
    "The rude behavior of that person made me really angry.",
    "I'm feeling neutral about the upcoming changes at work.",
    "Today is just another neutral day for me.",
    "The constant delays and issues with the project are making me angry.",
    "Winning the lottery would bring me immense joy and happiness!",
    "I don't have a strong opinion on the matter; I'm feeling quite neutral.",
    "The news about the recent event left me in deep sadness.",
    "Losing a loved one is an experience filled with sadness and grief.",
    "I have a neutral stance on the current political situation.",
    "I'm so happy right now!", "This is really frustrating."
    "Feeling sad and lonely tonight.",
    "I can't contain my excitement!",
    "Angry about the recent events.",
    "Today is a wonderful day!",
    "Dealing with a lot of stress.",
    "The movie made me cry, but it was beautiful.",
    "I'm thrilled about the upcoming project.",
    "Feeling a bit anxious about the presentation."
]

for text in emotions:
    vectorized_text = vectorizer.transform([text])

    text_pred = svm_model.predict(vectorized_text)

    decoded_emotion = label_encoder.inverse_transform(text_pred)

    print(f"'{text}' : {decoded_emotion[0]}")

'I hate getting up, it's so annoying.' : neutral
'I love cats and their sweet noses.' : joy
'I put up with my schoolmates.' : neutral
'This movie made me feel so happy!' : joy
'The traffic jam today was frustrating.' : neutral
'Spending time with friends always brings joy.' : joy
'I feel lonely when I'm alone at home.' : neutral
'Excited about the upcoming vacation!' : fear
'I love spending time with friends and family. It brings me so much joy!' : joy
'The rude behavior of that person made me really angry.' : anger
'I'm feeling neutral about the upcoming changes at work.' : neutral
'Today is just another neutral day for me.' : neutral
'The constant delays and issues with the project are making me angry.' : anger
'Winning the lottery would bring me immense joy and happiness!' : joy
'I don't have a strong opinion on the matter; I'm feeling quite neutral.' : sadness
'The news about the recent event left me in deep sadness.' : sadness
'Losing a loved one is an experience filled with sadne

In [31]:
texts = []
predicted_emotions = []
probabilities = []

for text in emotions:
    vectorized_text = vectorizer.transform([preprocess_text(text)])
    text_pred_prob = svm_model.predict_proba(vectorized_text)
    text_pred = svm_model.predict(vectorized_text)
    decoded_emotion = label_encoder.inverse_transform(text_pred)
    
    texts.append(text)
    predicted_emotions.append(decoded_emotion[0])
    probabilities.append(max(text_pred_prob[0]))  # Assuming you want the probability of the predicted class

emotion_df = pd.DataFrame({'Text': texts, 'Predicted Emotion': predicted_emotions, 'Probability': probabilities})
emotion_df

Unnamed: 0,Text,Predicted Emotion,Probability
0,"I hate getting up, it's so annoying.",neutral,0.521537
1,I love cats and their sweet noses.,joy,0.999738
2,I put up with my schoolmates.,anger,0.621853
3,This movie made me feel so happy!,joy,0.999492
4,The traffic jam today was frustrating.,neutral,0.616242
5,Spending time with friends always brings joy.,joy,0.599712
6,I feel lonely when I'm alone at home.,neutral,0.992052
7,Excited about the upcoming vacation!,neutral,0.489396
8,I love spending time with friends and family. ...,joy,0.89137
9,The rude behavior of that person made me reall...,anger,0.998376


In [21]:
joblib.dump(svm_model, 'svm_model/svm_model.joblib')
joblib.dump(vectorizer, 'svm_model/svm_vectorizer.joblib')
joblib.dump(label_encoder, 'svm_model/svm_label_encoder.joblib')

['svm_model/svm_label_encoder.joblib']

In [22]:
loaded_svm_model = joblib.load('svm_model/svm_model.joblib')
loaded_svm_vectorizer = joblib.load('svm_model/svm_vectorizer.joblib')
loaded_svm_label_encoder = joblib.load('svm_model/svm_label_encoder.joblib')

In [32]:
new_text = "Today is a wonderful excitement day!"

vectorized_new_text = loaded_svm_vectorizer.transform([preprocess_text(new_text)])

predictions_proba_svm = loaded_svm_model.predict_proba(vectorized_new_text)

predicted_label_svm = loaded_svm_label_encoder.inverse_transform(predictions_proba_svm.argmax(axis=1))[0]
probability_svm = predictions_proba_svm.max()

print(f"Text: {new_text}\nPredicted Label (SVM): {predicted_label_svm}\nProbability (SVM): {probability_svm:.2f}")

Text: Today is a wonderful excitement day!
Predicted Label (SVM): joy
Probability (SVM): 0.70


In [34]:
new_text = "I'm afraid of spiders, they're so big!"

vectorized_new_text = loaded_svm_vectorizer.transform([preprocess_text(new_text)])

predictions_proba_svm = loaded_svm_model.predict_proba(vectorized_new_text)

predicted_label_svm = loaded_svm_label_encoder.inverse_transform(predictions_proba_svm.argmax(axis=1))[0]
probability_svm = predictions_proba_svm.max()

print(f"Text: {new_text}\nPredicted Label (SVM): {predicted_label_svm}\nProbability (SVM): {probability_svm:.2f}")

Text: I'm afraid of spiders, they're so big!
Predicted Label (SVM): fear
Probability (SVM): 1.00
