In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

## 1. TFIDF

In [2]:
# Load the pre-computed TF-IDF train/dev splits.
# The path is handed straight to pd.read_pickle so pandas opens *and closes*
# the file itself; wrapping it in a bare open(...) left the handle unclosed.
# NOTE: unpickling can execute arbitrary code - only load pickles you trust.
train_data = pd.read_pickle("tfidf/train_tfidf.pkl")
train_X = list(train_data['TFIDF'])   # one feature entry per row
train_y = train_data['Sentiment']     # label column
dev_data = pd.read_pickle("tfidf/dev_tfidf.pkl")
dev_X = list(dev_data['TFIDF'])
dev_y = dev_data['Sentiment']

### AAE

In [3]:
# Keep only the training rows whose 'Demographic' value is 'AAE', then pull
# out the feature entries (as a plain list) and the matching labels.
AAE_train_data = train_data.loc[train_data['Demographic'].eq('AAE')]
AAE_train_X = [features for features in AAE_train_data['TFIDF']]
AAE_train_y = AAE_train_data['Sentiment']

In [4]:
# Same 'AAE' filter applied to the dev split: features as a list, labels as
# the 'Sentiment' column of the filtered frame.
AAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('AAE')]
AAE_dev_X = [features for features in AAE_dev_data['TFIDF']]
AAE_dev_y = AAE_dev_data['Sentiment']

In [5]:
# Train Gaussian naive Bayes on the AAE training subset (fit() returns the
# estimator itself, so construction and fitting can be chained), predict on
# the AAE dev subset, then print the per-class report and confusion matrix.
nb = GaussianNB().fit(AAE_train_X, AAE_train_y)
labels_predict = nb.predict(AAE_dev_X)
print(classification_report(y_true=AAE_dev_y, y_pred=labels_predict))
print(confusion_matrix(y_true=AAE_dev_y, y_pred=labels_predict))

              precision    recall  f1-score   support

    negative       0.67      0.45      0.54      1000
    positive       0.59      0.77      0.67      1000

    accuracy                           0.61      2000
   macro avg       0.63      0.61      0.60      2000
weighted avg       0.63      0.61      0.60      2000

[[454 546]
 [227 773]]


### SAE

In [6]:
# Keep only the training rows whose 'Demographic' value is 'SAE', then pull
# out the feature entries (as a plain list) and the matching labels.
SAE_train_data = train_data.loc[train_data['Demographic'].eq('SAE')]
SAE_train_X = [features for features in SAE_train_data['TFIDF']]
SAE_train_y = SAE_train_data['Sentiment']

In [7]:
# Same 'SAE' filter applied to the dev split: features as a list, labels as
# the 'Sentiment' column of the filtered frame.
SAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('SAE')]
SAE_dev_X = [features for features in SAE_dev_data['TFIDF']]
SAE_dev_y = SAE_dev_data['Sentiment']

In [8]:
# Train Gaussian naive Bayes on the SAE training subset (fit() returns the
# estimator itself, so construction and fitting can be chained), predict on
# the SAE dev subset, then print the per-class report and confusion matrix.
nb = GaussianNB().fit(SAE_train_X, SAE_train_y)
labels_predict = nb.predict(SAE_dev_X)
print(classification_report(y_true=SAE_dev_y, y_pred=labels_predict))
print(confusion_matrix(y_true=SAE_dev_y, y_pred=labels_predict))

              precision    recall  f1-score   support

    negative       0.61      0.78      0.68      1000
    positive       0.69      0.51      0.59      1000

    accuracy                           0.64      2000
   macro avg       0.65      0.64      0.63      2000
weighted avg       0.65      0.64      0.63      2000

[[777 223]
 [494 506]]


## 2. Embedding

In [9]:
# Load the sentence-transformers train/dev splits. These rebind train_data /
# dev_data (and the derived X/y names) that the TF-IDF section used earlier.
# The path is handed straight to pd.read_pickle so pandas opens *and closes*
# the file itself; wrapping it in a bare open(...) left the handle unclosed.
# NOTE: unpickling can execute arbitrary code - only load pickles you trust.
# NOTE(review): the feature column is still read under the key 'TFIDF';
# presumably the embedding vectors were stored under that name - confirm.
train_data = pd.read_pickle("sentence-transformers/train_emb.pkl")
train_X = list(train_data['TFIDF'])   # one feature entry per row
train_y = train_data['Sentiment']     # label column
dev_data = pd.read_pickle("sentence-transformers/dev_emb.pkl")
dev_X = list(dev_data['TFIDF'])
dev_y = dev_data['Sentiment']

### AAE

In [10]:
# Training rows tagged 'AAE' in the currently loaded split; features are
# read from the 'TFIDF' column and materialised as a plain list.
AAE_train_data = train_data.loc[train_data['Demographic'].eq('AAE')]
AAE_train_X = [features for features in AAE_train_data['TFIDF']]
AAE_train_y = AAE_train_data['Sentiment']

In [11]:
# Dev rows tagged 'AAE' in the currently loaded split; features are read
# from the 'TFIDF' column and materialised as a plain list.
AAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('AAE')]
AAE_dev_X = [features for features in AAE_dev_data['TFIDF']]
AAE_dev_y = AAE_dev_data['Sentiment']

In [12]:
# Fit a fresh Gaussian naive Bayes model on the AAE training subset (fit()
# returns the estimator), score the AAE dev subset, and print both the
# classification report and the confusion matrix.
nb = GaussianNB().fit(AAE_train_X, AAE_train_y)
labels_predict = nb.predict(AAE_dev_X)
print(classification_report(y_true=AAE_dev_y, y_pred=labels_predict))
print(confusion_matrix(y_true=AAE_dev_y, y_pred=labels_predict))

              precision    recall  f1-score   support

    negative       0.57      0.67      0.62      1000
    positive       0.60      0.50      0.54      1000

    accuracy                           0.58      2000
   macro avg       0.59      0.58      0.58      2000
weighted avg       0.59      0.58      0.58      2000

[[668 332]
 [502 498]]


### SAE

In [13]:

# Training rows tagged 'SAE' in the currently loaded split; features are
# read from the 'TFIDF' column and materialised as a plain list.
SAE_train_data = train_data.loc[train_data['Demographic'].eq('SAE')]
SAE_train_X = [features for features in SAE_train_data['TFIDF']]
SAE_train_y = SAE_train_data['Sentiment']

In [14]:
# Dev rows tagged 'SAE' in the currently loaded split; features are read
# from the 'TFIDF' column and materialised as a plain list.
SAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('SAE')]
SAE_dev_X = [features for features in SAE_dev_data['TFIDF']]
SAE_dev_y = SAE_dev_data['Sentiment']

In [15]:
# Fit a fresh Gaussian naive Bayes model on the SAE training subset (fit()
# returns the estimator), score the SAE dev subset, and print both the
# classification report and the confusion matrix.
nb = GaussianNB().fit(SAE_train_X, SAE_train_y)
labels_predict = nb.predict(SAE_dev_X)
print(classification_report(y_true=SAE_dev_y, y_pred=labels_predict))
print(confusion_matrix(y_true=SAE_dev_y, y_pred=labels_predict))

              precision    recall  f1-score   support

    negative       0.64      0.76      0.70      1000
    positive       0.70      0.58      0.64      1000

    accuracy                           0.67      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.67      0.67      0.67      2000

[[757 243]
 [421 579]]
