In [48]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

## 1. TFIDF

In [49]:
# Load the pre-computed TF-IDF train/dev frames.
# The path is handed straight to pandas so that pandas opens *and closes* the
# file itself; wrapping it in a bare open(...) left the handle dangling until
# garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load pickles from a
# trusted source.
train_data = pd.read_pickle("tfidf/train_tfidf.pkl")
train_X = list(train_data['TFIDF'])  # one entry per row, as a plain Python list
train_y = train_data['Sentiment']
dev_data = pd.read_pickle("tfidf/dev_tfidf.pkl")
dev_X = list(dev_data['TFIDF'])
dev_y = dev_data['Sentiment']

### AAE

In [50]:
# Training rows whose 'Demographic' value is 'AAE'; the 'TFIDF' entries are
# collected into a list and 'Sentiment' is kept as the label column.
AAE_train_data = train_data.loc[train_data['Demographic'].eq('AAE')]
AAE_train_X = [features for features in AAE_train_data['TFIDF']]
AAE_train_y = AAE_train_data.loc[:, 'Sentiment']

In [51]:
# Dev rows whose 'Demographic' value is 'AAE'; the 'TFIDF' entries are
# collected into a list and 'Sentiment' is kept as the label column.
AAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('AAE')]
AAE_dev_X = [features for features in AAE_dev_data['TFIDF']]
AAE_dev_y = AAE_dev_data.loc[:, 'Sentiment']

In [52]:
# Fit a 5-nearest-neighbour classifier on the AAE training split and predict
# the AAE dev split (fit() returns the estimator, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(AAE_train_X, AAE_train_y)
labels_predict = knn.predict(AAE_dev_X)
# Per-class report first, then the confusion matrix on the following line.
print(
    classification_report(AAE_dev_y, labels_predict),
    confusion_matrix(AAE_dev_y, labels_predict),
    sep="\n",
)

              precision    recall  f1-score   support

    negative       0.61      0.57      0.59      1000
    positive       0.59      0.63      0.61      1000

    accuracy                           0.60      2000
   macro avg       0.60      0.60      0.60      2000
weighted avg       0.60      0.60      0.60      2000

[[567 433]
 [369 631]]


### SAE

In [53]:
# Training rows whose 'Demographic' value is 'SAE'; the 'TFIDF' entries are
# collected into a list and 'Sentiment' is kept as the label column.
SAE_train_data = train_data.loc[train_data['Demographic'].eq('SAE')]
SAE_train_X = [features for features in SAE_train_data['TFIDF']]
SAE_train_y = SAE_train_data.loc[:, 'Sentiment']

In [54]:
# Dev rows whose 'Demographic' value is 'SAE'; the 'TFIDF' entries are
# collected into a list and 'Sentiment' is kept as the label column.
SAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('SAE')]
SAE_dev_X = [features for features in SAE_dev_data['TFIDF']]
SAE_dev_y = SAE_dev_data.loc[:, 'Sentiment']

In [55]:
# Fit a 5-nearest-neighbour classifier on the SAE training split and predict
# the SAE dev split (fit() returns the estimator, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(SAE_train_X, SAE_train_y)
labels_predict = knn.predict(SAE_dev_X)
# Per-class report first, then the confusion matrix on the following line.
print(
    classification_report(SAE_dev_y, labels_predict),
    confusion_matrix(SAE_dev_y, labels_predict),
    sep="\n",
)

              precision    recall  f1-score   support

    negative       0.63      0.62      0.63      1000
    positive       0.63      0.63      0.63      1000

    accuracy                           0.63      2000
   macro avg       0.63      0.63      0.63      2000
weighted avg       0.63      0.63      0.63      2000

[[623 377]
 [369 631]]


## 2. Embedding

In [56]:
# Load the sentence-transformers train/dev frames, replacing the frames that
# were previously bound to train_data / dev_data.
# The path is handed straight to pandas so that pandas opens *and closes* the
# file itself; wrapping it in a bare open(...) left the handle dangling until
# garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load pickles from a
# trusted source.
# NOTE(review): the features are read from a column named 'TFIDF' even though
# these are the embedding pickles -- presumably the embedding vectors were
# stored under that column name; confirm against the script that wrote them.
train_data = pd.read_pickle("sentence-transformers/train_emb.pkl")
train_X = list(train_data['TFIDF'])  # one entry per row, as a plain Python list
train_y = train_data['Sentiment']
dev_data = pd.read_pickle("sentence-transformers/dev_emb.pkl")
dev_X = list(dev_data['TFIDF'])
dev_y = dev_data['Sentiment']

### AAE

In [57]:
# Rows of the currently loaded training frame whose 'Demographic' value is
# 'AAE'; features come from the 'TFIDF' column, labels from 'Sentiment'.
AAE_train_data = train_data.loc[train_data['Demographic'].eq('AAE')]
AAE_train_X = [features for features in AAE_train_data['TFIDF']]
AAE_train_y = AAE_train_data.loc[:, 'Sentiment']

In [58]:
# Rows of the currently loaded dev frame whose 'Demographic' value is 'AAE';
# features come from the 'TFIDF' column, labels from 'Sentiment'.
AAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('AAE')]
AAE_dev_X = [features for features in AAE_dev_data['TFIDF']]
AAE_dev_y = AAE_dev_data.loc[:, 'Sentiment']

In [59]:
# Fit a 5-nearest-neighbour classifier on the AAE training split and predict
# the AAE dev split (fit() returns the estimator, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(AAE_train_X, AAE_train_y)
labels_predict = knn.predict(AAE_dev_X)
# Per-class report first, then the confusion matrix on the following line.
print(
    classification_report(AAE_dev_y, labels_predict),
    confusion_matrix(AAE_dev_y, labels_predict),
    sep="\n",
)

              precision    recall  f1-score   support

    negative       0.62      0.62      0.62      1000
    positive       0.62      0.62      0.62      1000

    accuracy                           0.62      2000
   macro avg       0.62      0.62      0.62      2000
weighted avg       0.62      0.62      0.62      2000

[[617 383]
 [377 623]]


### SAE

In [60]:

# Rows of the currently loaded training frame whose 'Demographic' value is
# 'SAE'; features come from the 'TFIDF' column, labels from 'Sentiment'.
SAE_train_data = train_data.loc[train_data['Demographic'].eq('SAE')]
SAE_train_X = [features for features in SAE_train_data['TFIDF']]
SAE_train_y = SAE_train_data.loc[:, 'Sentiment']

In [61]:
# Rows of the currently loaded dev frame whose 'Demographic' value is 'SAE';
# features come from the 'TFIDF' column, labels from 'Sentiment'.
SAE_dev_data = dev_data.loc[dev_data['Demographic'].eq('SAE')]
SAE_dev_X = [features for features in SAE_dev_data['TFIDF']]
SAE_dev_y = SAE_dev_data.loc[:, 'Sentiment']

In [62]:
# Fit a 5-nearest-neighbour classifier on the SAE training split and predict
# the SAE dev split (fit() returns the estimator, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(SAE_train_X, SAE_train_y)
labels_predict = knn.predict(SAE_dev_X)
# Per-class report first, then the confusion matrix on the following line.
print(
    classification_report(SAE_dev_y, labels_predict),
    confusion_matrix(SAE_dev_y, labels_predict),
    sep="\n",
)

              precision    recall  f1-score   support

    negative       0.70      0.69      0.70      1000
    positive       0.70      0.70      0.70      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000

[[694 306]
 [295 705]]
