In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy as np
import pickle

# Load data and embeddings

In [2]:
!git clone https://github.com/hwalli92/nlp_project.git

fatal: destination path 'nlp_project' already exists and is not an empty directory.


In [3]:
%cd nlp_project

/content/nlp_project


In [4]:
with open("HateXPlainData/trainHateXplain", "rb") as file:
  train_data = pickle.load(file)

with open("HateXPlainData/valHateXplain", "rb") as file:
  val_data = pickle.load(file)

with open("HateXPlainData/testHateXplain", "rb") as file:
  test_data = pickle.load(file)

In [5]:
train_w2v = np.load("HateXPlainData/Train_W2V_Embeddings.npy")
train_glove = np.load("HateXPlainData/Train_GloVe_Embeddings.npy")

val_w2v = np.load("HateXPlainData/Val_W2V_Embeddings.npy")
val_glove = np.load("HateXPlainData/Val_GloVe_Embeddings.npy")

test_w2v = np.load("HateXPlainData/Test_W2V_Embeddings.npy")
test_glove = np.load("HateXPlainData/Test_GloVe_Embeddings.npy")

# Process data

In [6]:
labels = {'offensive': 2, 'hatespeech': 1, 'normal': 0}

train_tokens, train_labels = zip(*train_data)
train_labels = np.array([labels[l] for l in train_labels])

val_tokens, val_labels = zip(*val_data)
val_labels = [labels[l] for l in val_labels]

test_tokens, test_labels = zip(*test_data)
test_labels = np.array([labels[l] for l in test_labels])


# Naive Bayes w/ Word2Vec

In [7]:
w2v_model = MultinomialNB()

max_train = np.max(train_w2v)
min_train = np.min(train_w2v)
train_w2v = (train_w2v - min_train) / (max_train - min_train)

w2v_model.fit(train_w2v, train_labels)

max_train = np.max(test_w2v)
min_train = np.min(test_w2v)
test_w2v = (test_w2v - min_train) / (max_train - min_train)
y_pred = w2v_model.predict(test_w2v)
print(y_pred)
print(test_labels)
print(metrics.classification_report(test_labels, y_pred))
print(metrics.confusion_matrix(test_labels, y_pred))

[0 0 0 ... 0 0 0]
[0 0 2 ... 2 1 1]
              precision    recall  f1-score   support

           0       0.41      1.00      0.58       782
           1       0.50      0.00      0.00       594
           2       0.00      0.00      0.00       548

    accuracy                           0.41      1924
   macro avg       0.30      0.33      0.19      1924
weighted avg       0.32      0.41      0.24      1924

[[782   0   0]
 [593   1   0]
 [547   1   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



# Naive Bayes w/ GloVe

In [8]:
glove_model = MultinomialNB()

max_train = np.max(train_glove)
min_train = np.min(train_glove)
train_glove = (train_glove - min_train) / (max_train - min_train)

glove_model.fit(train_glove, train_labels)

max_train = np.max(test_glove)
min_train = np.min(test_glove)
test_glove = (test_glove - min_train) / (max_train - min_train)
y_pred = glove_model.predict(test_glove)
print(y_pred)
print(test_labels)
print(metrics.classification_report(test_labels, y_pred))
print(metrics.confusion_matrix(test_labels, y_pred))

[0 0 0 ... 0 0 0]
[0 0 2 ... 2 1 1]
              precision    recall  f1-score   support

           0       0.41      1.00      0.58       782
           1       0.00      0.00      0.00       594
           2       0.00      0.00      0.00       548

    accuracy                           0.41      1924
   macro avg       0.14      0.33      0.19      1924
weighted avg       0.17      0.41      0.23      1924

[[782   0   0]
 [594   0   0]
 [548   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes w/ Token Counts

In [9]:
class NaiveBayesClassifier:
  def __init__(self):
    self.vectorizer =  CountVectorizer()
    self.classifier = MultinomialNB()

  def train(self, X_train, y_train):
    X_train_f = self.vectorizer.fit_transform(X_train)
    self.classifier.fit(X_train_f, y_train)
    return X_train_f

  def predict(self, X_test):
    X_test_f = self.vectorizer.transform(X_test)
    y_pred = self.classifier.predict(X_test_f)
    return y_pred, X_test_f

In [10]:
classifier = NaiveBayesClassifier()
train_strings = [' '.join(string) for string in train_tokens]
X_train_f = classifier.train(train_strings, train_labels)
test_strings = [' '.join(string) for string in test_tokens]
y_pred, X_val_f = classifier.predict(test_strings)

print(y_pred)
print(test_labels)
print(metrics.classification_report(test_labels, y_pred))
print(metrics.confusion_matrix(test_labels, y_pred))

[0 0 2 ... 2 1 2]
[0 0 2 ... 2 1 1]
              precision    recall  f1-score   support

           0       0.66      0.75      0.70       782
           1       0.67      0.76      0.71       594
           2       0.53      0.34      0.42       548

    accuracy                           0.64      1924
   macro avg       0.62      0.62      0.61      1924
weighted avg       0.62      0.64      0.62      1924

[[584  89 109]
 [ 84 451  59]
 [221 138 189]]
