In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/NLP\ Project

Mounted at /content/gdrive
/content/gdrive/MyDrive/NLP Project


In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import numpy as np
import pickle

# Import Data and Embeddings

In [3]:
def _read_pickle(path):
  """Open `path` in binary mode and return the unpickled object.

  NOTE: unpickling can execute arbitrary code; only use on trusted files.
  """
  with open(path, "rb") as handle:
    return pickle.load(handle)


# Train / validation / test splits, loaded in that order.
train_data = _read_pickle("HateXPlainData/trainHateXplain")
val_data = _read_pickle("HateXPlainData/valHateXplain")
test_data = _read_pickle("HateXPlainData/testHateXplain")

In [4]:
# Saved embedding arrays, one Word2Vec / GloVe pair per split.
train_w2v, train_glove = (
  np.load("HateXPlainData/Train_W2V_Embeddings.npy"),
  np.load("HateXPlainData/Train_GloVe_Embeddings.npy"),
)

val_w2v, val_glove = (
  np.load("HateXPlainData/Val_W2V_Embeddings.npy"),
  np.load("HateXPlainData/Val_GloVe_Embeddings.npy"),
)

test_w2v, test_glove = (
  np.load("HateXPlainData/Test_W2V_Embeddings.npy"),
  np.load("HateXPlainData/Test_GloVe_Embeddings.npy"),
)

# Process Data

In [67]:
# Mapping from label name to the integer class id used by every split.
labels = {'offensive': 2, 'hatespeech': 1, 'normal': 0}

# Each split is a sequence of 2-item records; zip(*...) separates the first
# items (tokens) from the second items (label names). An unknown label name
# raises KeyError in the lookup below.
train_tokens, train_labels = zip(*train_data)
train_labels = np.array([labels[name] for name in train_labels])

val_tokens, val_labels = zip(*val_data)
# Stored as an array (not a plain list) so all three splits have the same type.
val_labels = np.array([labels[name] for name in val_labels])

test_tokens, test_labels = zip(*test_data)
test_labels = np.array([labels[name] for name in test_labels])

# Logistic Regression Model

In [63]:
class MyLogisticRegression:
  """Multinomial (softmax) logistic regression trained by gradient descent.

  The model has no intercept term: class scores are `X @ weights`, where
  `weights` has shape (n_features, n_classes).

  Parameters
  ----------
  lr : float
    Step size of each gradient update.
  epochs : int
    Number of passes over the training data.
  batch_size : int or None
    Rows per gradient update. None (default) uses the whole training set for
    every update; smaller values reshuffle the rows each epoch and update
    once per mini-batch.
  tol : float
    Stored on the instance but not consulted by `fit`; training always runs
    for the full `epochs`.
  random_state : int or None
    Seed for weight initialisation and mini-batch shuffling. None (default)
    draws from NumPy's global random state, so `np.random.seed` still
    controls the result.

  Attributes set by `fit`
  -----------------------
  weights : ndarray of shape (n_features, n_classes)
  loss : list of float, one mean cross-entropy value per epoch
  classes_ : ndarray of the sorted unique training labels
  """

  def __init__(self, lr=0.01, epochs=10000, batch_size=None, tol=0.0001, random_state=None):
    self.learning_rate = lr
    self.epochs = epochs
    self.batch_size = batch_size
    self.tol = tol
    self.random_state = random_state

  def softmax(self, z):
    """Row-wise softmax of a 2-D score matrix `z`."""
    # Subtracting the row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

  def loss_function(self, y_hat, y_true):
    """Mean cross-entropy between probabilities `y_hat` and one-hot `y_true`."""
    # Probability assigned to the true class of each row.
    picked = np.sum(np.asarray(y_hat, dtype=float) * y_true, axis=1)
    # Floor at the smallest positive float so a probability that underflowed
    # to 0 yields a large finite loss instead of inf.
    floor = np.finfo(float).tiny
    return np.mean(-np.log(np.maximum(picked, floor)))

  def calculate_gradient(self, X, y_hat, y_true):
    """Gradient of the mean cross-entropy with respect to the weights.

    `y_true` is expected to be one-hot, so `y_hat - y_true` subtracts 1 from
    the true-class probability of every row.
    """
    samples = len(y_hat)
    return np.dot(X.T, y_hat - y_true) / samples

  def _gradient_step(self, X, y_onehot, weights):
    """Update `weights` in place from one batch; return the pre-update loss."""
    y_hat = self.softmax(np.dot(X, weights))
    weights -= self.learning_rate * self.calculate_gradient(X, y_hat, y_onehot)
    return self.loss_function(y_hat, y_onehot)

  def fit(self, X, y):
    """Train on features `X` (n_samples, n_features) and labels `y` (n_samples,).

    Raises
    ------
    ValueError
      If `X` is not 2-D, is empty, does not match `y` in length, or
      `batch_size` is smaller than 1.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y).reshape(-1)
    if X.ndim != 2:
      raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    N = X.shape[0]
    if N == 0:
      raise ValueError("cannot fit on an empty training set")
    if y.shape[0] != N:
      raise ValueError("X and y must contain the same number of samples")

    # One-hot encode with NumPy only: column k corresponds to classes[k],
    # the k-th smallest label, which is the column order the encoder used before.
    classes, y_index = np.unique(y, return_inverse=True)
    y_onehot = np.eye(len(classes))[y_index.reshape(-1)]

    # The module-level np.random and a RandomState instance expose the same
    # rand / permutation functions.
    rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
    weights = rng.rand(X.shape[1], y_onehot.shape[1])

    batch = N if self.batch_size is None else int(self.batch_size)
    if batch < 1:
      raise ValueError("batch_size must be a positive integer or None")
    batch = min(batch, N)

    loss = []
    for _ in range(self.epochs):
      if batch == N:
        # Full-batch update: one step per epoch on all rows.
        loss.append(self._gradient_step(X, y_onehot, weights))
        continue

      # Mini-batch updates over a fresh shuffle; the epoch loss is the
      # sample-weighted mean of the per-batch losses.
      order = rng.permutation(N)
      total = 0.0
      for start in range(0, N, batch):
        rows = order[start:start + batch]
        total += self._gradient_step(X[rows], y_onehot[rows], weights) * len(rows)
      loss.append(total / N)

    self.weights = weights
    self.loss = loss
    self.classes_ = classes

  def predict(self, X):
    """Return the predicted label for every row of `X`.

    Labels are reported in the original label space seen by `fit`; for labels
    0..K-1 this equals the index of the highest-scoring column.
    """
    z = np.dot(X, self.weights)
    # Softmax is monotonic per row, so the arg-max of the raw scores is the
    # arg-max of the probabilities.
    best = np.argmax(z, axis=1)
    classes = getattr(self, "classes_", None)
    return best if classes is None else classes[best]

# Logistic Regression w/ Word2Vec

In [70]:
# scikit-learn baseline on the Word2Vec features. `multi_class` is not passed:
# it is deprecated since scikit-learn 1.5, and with the default lbfgs solver a
# problem with three classes is already fitted as a multinomial model.
w2v_model = LogisticRegression(max_iter=20000)

w2v_model.fit(train_w2v, train_labels)

# Mean accuracy on the test split.
print(w2v_model.score(test_w2v, test_labels))

0.5945945945945946


In [64]:
# The from-scratch model draws its initial weights from NumPy's global random
# state; seed it so a re-run reproduces the same fit.
np.random.seed(42)

w2v_model = MyLogisticRegression()
# asarray accepts a list or an array and avoids copying when it is already an array.
w2v_model.fit(train_w2v, np.asarray(train_labels))

In [65]:
# Predict the test split with the from-scratch model, then show the raw
# predictions next to the true labels before the summary metrics.
y_pred = w2v_model.predict(test_w2v)
report = metrics.classification_report(test_labels, y_pred)
confusion = metrics.confusion_matrix(test_labels, y_pred)

for shown in (y_pred, test_labels, report, confusion):
  print(shown)

[1 1 1 ... 1 1 1]
[0 0 2 ... 2 1 1]
              precision    recall  f1-score   support

           0       0.17      0.08      0.11       782
           1       0.25      0.58      0.35       594
           2       0.47      0.11      0.18       548

    accuracy                           0.25      1924
   macro avg       0.29      0.26      0.21      1924
weighted avg       0.28      0.25      0.20      1924

[[ 64 675  43]
 [222 346  26]
 [100 386  62]]


# Logistic Regression w/ GloVe

In [69]:
# scikit-learn baseline on the GloVe features. `multi_class` is not passed:
# it is deprecated since scikit-learn 1.5, and with the default lbfgs solver a
# problem with three classes is already fitted as a multinomial model.
glove_model = LogisticRegression(max_iter=20000)

glove_model.fit(train_glove, train_labels)

# Mean accuracy on the test split.
print(glove_model.score(test_glove, test_labels))

0.5971933471933472
