In [70]:
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Open datasets

In [36]:
# Training split: each row carries the essay and its emotion label.
train = pd.read_csv('./ref/messages_train.tsv', sep='\t')

# Dev split: essays and gold annotations are stored in separate files.
# The gold file is read with header=None, so its columns are positional integers.
dev_texts = pd.read_csv('./ref/messages_dev.tsv', sep='\t')
dev_gold = pd.read_csv('./ref/goldstandard_dev_2022.tsv', sep='\t', header=None)

In [37]:
# Preview the first rows of the training frame to check its columns.
train.head()

Unnamed: 0,message_id,response_id,article_id,empathy,distress,empathy_bin,distress_bin,essay,emotion,gender,...,income,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern
0,R_1hGrPtWM4SumG0U_1,R_1hGrPtWM4SumG0U,67,5.667,4.375,1,1,it is really diheartening to read about these ...,sadness,1,...,50000,6.0,5.0,5.0,5.5,5.5,3.571,2.0,3.429,4.0
1,R_1hGrPtWM4SumG0U_2,R_1hGrPtWM4SumG0U,86,4.833,4.875,1,1,the phone lines from the suicide prevention li...,sadness,1,...,50000,6.0,5.0,5.0,5.5,5.5,3.571,2.0,3.429,4.0
2,R_1hGrPtWM4SumG0U_3,R_1hGrPtWM4SumG0U,206,5.333,3.5,1,0,"no matter what your heritage, you should be ab...",neutral,1,...,50000,6.0,5.0,5.0,5.5,5.5,3.571,2.0,3.429,4.0
3,R_1hGrPtWM4SumG0U_4,R_1hGrPtWM4SumG0U,290,4.167,5.25,1,1,it is frightening to learn about all these sha...,fear,1,...,50000,6.0,5.0,5.0,5.5,5.5,3.571,2.0,3.429,4.0
4,R_1hGrPtWM4SumG0U_5,R_1hGrPtWM4SumG0U,342,5.333,4.625,1,1,the eldest generation of russians aren't being...,sadness,1,...,50000,6.0,5.0,5.0,5.5,5.5,3.571,2.0,3.429,4.0


In [38]:
# Preview the header-less gold annotations; columns are numbered by position.
dev_gold.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,7.0,7.0,sadness,5.5,5.5,4.0,5.0,4.5,4.429,2.286,4.143,3.143
1,3.167,3.625,sadness,5.5,5.5,4.0,5.0,4.5,4.429,2.286,4.143,3.143
2,4.5,3.125,fear,5.5,5.5,4.0,5.0,4.5,4.429,2.286,4.143,3.143
3,2.0,2.0,anger,5.5,5.5,4.0,5.0,4.5,4.429,2.286,4.143,3.143
4,4.5,4.625,anger,5.5,5.5,4.0,5.0,4.5,4.429,2.286,4.143,3.143


In [39]:
# Preview the dev essays and their accompanying metadata columns.
dev_texts.head()

Unnamed: 0,message_id,response_id,article_id,essay,gender,education,race,age,income
0,R_3QLVVnAgRBRH41U_1,R_3QLVVnAgRBRH41U,13,The story about the air strikes is very sadden...,1.0,4.0,3.0,20.0,24000.0
1,R_3QLVVnAgRBRH41U_2,R_3QLVVnAgRBRH41U,127,It is clear that climate change is something t...,1.0,4.0,3.0,20.0,24000.0
2,R_3QLVVnAgRBRH41U_3,R_3QLVVnAgRBRH41U,188,I did not know this comedian but thinking abou...,1.0,4.0,3.0,20.0,24000.0
3,R_3QLVVnAgRBRH41U_4,R_3QLVVnAgRBRH41U,307,I am an affirmed believer that your punishment...,1.0,4.0,3.0,20.0,24000.0
4,R_3QLVVnAgRBRH41U_5,R_3QLVVnAgRBRH41U,409,"Okay, I hate hearing about disease outbreaks. ...",1.0,4.0,3.0,20.0,24000.0


### Minor preprocessing

In [61]:
def preprocess(text):
    """Normalize one essay into a space-separated string of alphabetic tokens.

    The text is lowercased, tokenized with NLTK's ``word_tokenize`` and every
    token that is not purely alphabetic (punctuation, numbers, fragments such
    as ``n't``) is dropped.

    Parameters
    ----------
    text : str or missing value
        Raw essay. Missing values (``None``, ``NaN``, ``pd.NA``) are accepted
        because pandas uses them for empty cells.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` for a missing
        essay.
    """
    # An empty cell in the essay column is not a string, so calling .lower()
    # on it would raise AttributeError inside Series.apply. Map it to an
    # empty document instead.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    lowered = text.lower()
    words = [token for token in word_tokenize(lowered) if token.isalpha()]

    return ' '.join(words)

## TF-IDF Vectorization

In [63]:
# Clean the training essays before any vectorization.
train_texts = train['essay'].apply(preprocess).values

# English stop words are removed and terms occurring in fewer than 3 essays are ignored.
tfidf = TfidfVectorizer(stop_words='english', min_df=3)

# Learn the vocabulary and IDF weights on the training essays, then densify the matrix.
X_train = tfidf.fit_transform(train_texts).toarray()
y_train = train['emotion'].values

In [64]:
# Clean the dev essays the same way and encode them with the already-fitted
# vectorizer (transform only, no re-fitting).
test_texts = dev_texts['essay'].apply(preprocess).values
X_test = tfidf.transform(test_texts).toarray()

# The gold file has no header; the column at position 2 holds the emotion label.
y_test = dev_gold.iloc[:, 2]

## SVM on TF-IDF Vectors

In [65]:
# LinearSVC shuffles the data while solving the dual problem, so an unseeded
# model can give slightly different scores on every run. Pin the seed so the
# reported metrics are reproducible after Restart & Run All.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

LinearSVC()

## Results

In [69]:
# Per-class precision, recall and F1 of the SVM predictions against the dev gold labels.
print(classification_report(y_true=y_test, y_pred=svm.predict(X_test)))

              precision    recall  f1-score   support

       anger       0.49      0.38      0.43        76
     disgust       0.09      0.08      0.09        12
        fear       0.68      0.55      0.61        31
         joy       0.20      0.07      0.11        14
     neutral       0.25      0.24      0.24        25
     sadness       0.57      0.79      0.66        98
    surprise       0.20      0.14      0.17        14

    accuracy                           0.49       270
   macro avg       0.35      0.32      0.33       270
weighted avg       0.47      0.49      0.47       270

