In [11]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import (
    train_test_split,
    cross_validate
)
from sklearn.feature_extraction.text import CountVectorizer

import ast

In [12]:
# Load the training table from disk and preview its first ten rows.
# The path is relative, so the notebook must be run from the project root.
df = pd.read_csv(filepath_or_buffer="data/training_data.csv")
df.head(n=10)

Unnamed: 0,track_id,word_counts,emotion_freqs
0,TRAAAAV128F421A322,"{'like': 2, 'de': 1, 'got': 1, 'would': 1, 'se...","{'angry': 0.05, 'disgust': 0.15, 'fear': 0.15,..."
1,TRAAABD128F429CF47,"{'know': 5, 'time': 3, 'la': 7, 'get': 2, 'got...","{'angry': 0.0, 'disgust': 0.0, 'fear': 0.0, 'h..."
2,TRAAAED128E0783FAB,"{'love': 11, 'like': 1, 'time': 6, 'come': 4, ...","{'angry': 0.0, 'disgust': 0.0, 'fear': 0.02941..."
3,TRAAAEF128F4273421,"{'know': 1, 'got': 3, 'feel': 1, 'let': 1, 'wo...","{'angry': 0.07142857142857142, 'disgust': 0.07..."
4,TRAAAEW128F42930C0,"{'like': 1, 'take': 1, 'would': 1, 'wo': 1, 's...","{'angry': 0.125, 'disgust': 0.5, 'fear': 0.125..."
5,TRAAAFD128F92F423A,"{'one': 1, 'got': 1, 'never': 1, 'feel': 1, 'w...","{'angry': 0.0, 'disgust': 0.0, 'fear': 0.29411..."
6,TRAAAGF12903CEC202,"{'en': 1, 'e': 1, 'end': 1, 'fine': 1, 'n': 1,...","{'angry': 0, 'disgust': 0, 'fear': 0, 'happy':..."
7,TRAAAHJ128F931194C,"{'love': 1, 'time': 1, 'never': 3, 'say': 1, '...","{'angry': 0.0, 'disgust': 0.06666666666666667,..."
8,TRAAAHZ128E0799171,"{'love': 2, 'know': 1, 'like': 1, 'time': 2, '...","{'angry': 0.0297029702970297, 'disgust': 0.069..."
9,TRAAAJG128F9308A25,"{'like': 1, 'one': 1, 'get': 1, 'ca': 4, 'need...","{'angry': 0.13333333333333333, 'disgust': 0.06..."


In [13]:
# Turn the raw frame into model inputs: parse the dict-valued columns, drop
# failed rows, derive a text column and a single label per row, then build the
# bag-of-words matrix `X` and the integer label vector `y`.
# Every step tolerates an already-processed `df`, so the cell can be re-run.


def _parse_dict(value):
    """Return `value` unchanged if it is already a dict, else parse it as a Python literal."""
    # ast.literal_eval raises ValueError when handed a dict, which is what
    # happens on a second run of this cell; already-parsed values pass through.
    return value if isinstance(value, dict) else ast.literal_eval(value)


# errors='ignore' keeps a re-run from raising KeyError once the column is gone.
df = df.drop(columns=['track_id'], errors='ignore')

# Convert columns to dictionaries
df['word_counts'] = df['word_counts'].apply(_parse_dict)
df['emotion_freqs'] = df['emotion_freqs'].apply(_parse_dict)

# Remove rows with {'error': 1}
# The comparison is done explicitly per row instead of relying on how pandas
# broadcasts a dict operand, and the result is copied so the column
# assignments below write to an independent frame (no SettingWithCopyWarning).
keep_row = df['word_counts'].apply(lambda counts: counts != {'error': 1}).astype(bool)
df = df[keep_row].copy()

# Takes just the words from word_counts: each word is repeated `count` times
# so that CountVectorizer can re-count it.
df['text'] = df['word_counts'].apply(lambda x: ' '.join([f"{k} " * v for k, v in x.items()]))

# Take the top emotion from emotion_freqs
# max() returns the first key in iteration order when several emotions tie, so
# a row whose frequencies are all equal is labelled with its first key.
# NOTE(review): check whether such rows exist and should be excluded.
df['top_emotion'] = df['emotion_freqs'].apply(lambda x: max(x, key=x.get))

# NOTE(review): the vocabulary is learned from every row, including rows that
# end up in the held-out test split created later; fitting the vectorizer
# inside a Pipeline on the training rows only would avoid that.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

# Encode top emotion into numerical label
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['top_emotion'])

df.head(10)


In [14]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the label distribution is
# skewed towards one class — consider whether `stratify=y` is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)
df.shape

(13424, 4)

In [15]:
# Summary statistics for every column of the processed frame
# (include="all" asks describe() to report on all columns).
df.describe(include="all")

Unnamed: 0,word_counts,emotion_freqs,text,top_emotion
count,13424,13424,13424,13424
unique,13404,11363,13404,7
top,"{'love': 2, 'know': 7, 'like': 4, 'time': 1, '...","{'angry': 0, 'disgust': 0, 'fear': 0, 'happy':...",love love know know know know know know know ...,neutral
freq,2,454,2,7354


In [16]:
# Baseline: a classifier that ignores the features and samples labels according
# to the training class distribution, scored with 5-fold cross-validation.
dummy = DummyClassifier(strategy="stratified", random_state=123)
dummy_cv = cross_validate(
    dummy,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
pd.DataFrame(dummy_cv)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.0,0.0,0.350532,0.346727
1,0.006034,0.0,0.347525,0.340694
2,0.0,0.0,0.353379,0.346814
3,0.0,0.0,0.346993,0.349075
4,0.0,0.0,0.371474,0.351603


In [17]:
# Sweep the inverse regularisation strength C over a log-spaced grid
# (10**-1.5 ... 10**1.5 in half-decade steps) and record the mean train and
# validation score of a class-weighted logistic regression for each value.
C_vals = np.power(10.0, np.arange(-1.5, 2, 0.5))

train_scores, cv_scores = [], []
for C in C_vals:
    model = LogisticRegression(
        C=C,
        class_weight="balanced",
        max_iter=1000,
        random_state=123,
    )
    cv_results = cross_validate(model, X_train, y_train, return_train_score=True)

    # Average over the folds so each C contributes one row to the table below.
    train_scores.append(cv_results["train_score"].mean())
    cv_scores.append(cv_results["test_score"].mean())

pd.DataFrame({"C": C_vals, "cv": cv_scores, "train": train_scores})

Unnamed: 0,C,cv,train
0,0.031623,0.665391,0.850814
1,0.1,0.702854,0.914591
2,0.316228,0.730205,0.960355
3,1.0,0.744467,0.987814
4,3.162278,0.751171,0.997659
5,10.0,0.753619,0.999521
6,31.622777,0.755535,0.999894


In [18]:
# Cross-validate the chosen model and show the per-fold results.
# NOTE(review): C is hard-coded to (approximately) the 10**0.5 grid point of the
# sweep; presumably picked as a trade-off, since larger C values scored slightly
# higher on validation — confirm the intended selection rule.
lr = LogisticRegression(
    C=3.162278,
    class_weight="balanced",
    max_iter=1000,
    random_state=123,
)
scores = cross_validate(lr, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.680806,0.0,0.744149,0.997871
1,1.445011,0.0,0.761575,0.997339
2,1.592862,0.001336,0.748803,0.997472
3,1.524798,0.0,0.762108,0.997738
4,1.754737,0.0,0.742948,0.997871


In [19]:
# Fold-to-fold standard deviation of the cross-validation timings and scores.
# The table gets its own name: binding it to `df` would overwrite the lyrics
# DataFrame that earlier cells read, so re-running those cells afterwards
# would fail or silently show the wrong data.
scores_df = pd.DataFrame(scores)
scores_df.std()

fit_time       0.122679
score_time     0.000598
test_score     0.009322
train_score    0.000242
dtype: float64

In [20]:
# Refit on the full training split (fit() returns the estimator itself, so the
# held-out accuracy can be taken in the same expression) and report it.
test_score = lr.fit(X_train, y_train).score(X_test, y_test)
print(f"Test Set Accuracy: {test_score}")

Test Set Accuracy: 0.7656405163853028


In [28]:
# Smoke-test the fitted model on a hand-written word-count dictionary.
new_word_counts = {'love': 3, 'happy': 2, 'sad': 1}

# Same preprocessing as the training text: repeat each word `count` times,
# then vectorise with the already-fitted vocabulary.
repeated_words = [f'{word} ' * count for word, count in new_word_counts.items()]
new_text = ' '.join(repeated_words)
new_features = vectorizer.transform([new_text])

# The model predicts an integer class; map it back to the emotion name.
predicted_emotions = lr.predict(new_features)
predicted_emotion_name = label_encoder.inverse_transform(predicted_emotions)

print(f'Predicted Emotion: {predicted_emotion_name[0]}')

Predicted Emotion: happy
