In [1]:
import spacy
import numpy as np
import string
import re
import pandas as pd

In [2]:
nlp = spacy.load('en_vectors_web_lg')

In [3]:
X = pd.read_csv("./data/hate.csv", delimiter=",", dtype='unicode')

In [4]:
b=X.iloc[:,6].values

In [7]:
def train(X, y):
    D = 50
    K = 2
    # initialize parameters randomly
    h = 50 # size of hidden layer
    W0 = 0.01 * np.random.randn(300,50)
    b0 = np.zeros((1,50))
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))

    # some hyperparameters
    step_size = 0.825
    reg = 1e-3 # regularization strength
    
    num_examples = len(y)
    # pooling loop
    kt = np.empty((len(y), 300))
    for i in range(len(y)):
        # word embeddings -> Max pooling / Average pooling -> 1-layer MLP -> ReLu -> Fully connected softmax
        # word embed
        Xi = nlp(X[i].decode('utf-8'))
        t = len(Xi)
        wt = map(lambda x: x.vector, Xi)
        # ReLu
        zt = np.maximum(0, wt)
        # max pooling / Average pooling
        #mt = np.max(zt, axis=0)
        at = np.mean(zt, axis=0)
        # concat
        kt[i, :] = at #np.append(mt, at)
        
    # gradient descent loop
    for j in xrange(5000): 
        # evaluate class scores, [N x K]
        hidden_layer0 = np.dot(kt, W0) + b0
        hidden_layer = np.maximum(0, np.dot(hidden_layer0, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2

        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

        # compute the loss: average cross-entropy loss and regularization
        correct_logprobs = -np.log(probs[range(num_examples),y])
        data_loss = np.sum(correct_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        if j % 100 == 0:
            print "iteration %d: loss %f" % (j, loss)
            
        # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
  
        # backpropate the gradient to the parameters
        # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
        # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] = 0
        # into W,b
        dW = np.dot(hidden_layer0.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
        # next backprop into hidden0 layer
        dhidden0 = np.dot(dhidden, W.T)
        # finally into W0, b0
        dW0 = np.dot(kt.T, dhidden0)
        db0 = np.sum(dhidden0, axis=0, keepdims=True)

        # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
        dW0 += reg * W0

        # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        W0 += -step_size * dW0
        b0 += -step_size * db0
    return b0, W0, b, W, b2, W2

b=X.iloc[:,6].values
a=map(int, X.iloc[:,5].values)
a = map(lambda x : 1 if x == 0 else 0, a)
l,m,p,q,r,s = train(b[:17000], a[:17000])

iteration 0: loss 0.693255
iteration 100: loss 0.238105
iteration 200: loss 0.238084
iteration 300: loss 0.238066
iteration 400: loss 0.238050
iteration 500: loss 0.238035
iteration 600: loss 0.238019
iteration 700: loss 0.238000
iteration 800: loss 0.237972
iteration 900: loss 0.237921
iteration 1000: loss 0.237803
iteration 1100: loss 0.237429
iteration 1200: loss 0.235877
iteration 1300: loss 0.227052
iteration 1400: loss 0.214563
iteration 1500: loss 0.206920
iteration 1600: loss 0.202370
iteration 1700: loss 0.199544
iteration 1800: loss 0.197759
iteration 1900: loss 0.196421
iteration 2000: loss 0.195519
iteration 2100: loss 0.194740
iteration 2200: loss 0.194041
iteration 2300: loss 0.193568
iteration 2400: loss 0.193180
iteration 2500: loss 0.192760
iteration 2600: loss 0.192474
iteration 2700: loss 0.192240
iteration 2800: loss 0.192021
iteration 2900: loss 0.191868
iteration 3000: loss 0.191725
iteration 3100: loss 0.191581
iteration 3200: loss 0.191496
iteration 3300: loss 0

In [8]:
from sklearn.metrics import f1_score, roc_auc_score
def predict(X, y, b0, W0, b, W, b2, W2):
    kt = np.empty((len(y), 300))
    for i in range(len(y)):
        # word embeddings -> Max pooling / Average pooling -> 2-layer MLP -> ReLu -> Fully connected softmax
        # word embed
        Xi = nlp(X[i].decode('utf-8'))
        t = len(Xi)
        wt = map(lambda x: x.vector, Xi)
        # ReLu
        zt = np.maximum(0, wt)  #wt * (wt > 0)
        # max pooling / Average pooling
        #mt = np.max(zt, axis=0)
        at = np.mean(zt, axis=0)
        # concat
        kt[i, :] = at #np.append(mt, at)
    # evaluate class scores, [N x K]
    hidden_layer0 = np.dot(kt, W0) + b0
    # evaluate class scores, [N x K]
    hidden_layer = np.maximum(0, np.dot(hidden_layer0, W) + b) # note, ReLU activation
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    print 'accuracy: %.2f' % (np.mean(predicted_class == y))
    print 'f1-score: ', f1_score( y, predicted_class, average='weighted' )
    print 'auc: ', roc_auc_score(y, scores[:, 1])
    
predict(b[:17000], a[:17000], l, m, p, q, r, s)
predict(b[17000:], a[17000:], l, m, p, q, r, s)

accuracy: 0.94
f1-score:  0.9126919751220812
auc:  0.8620982798081546
accuracy: 0.96
f1-score:  0.9395088725475187
auc:  0.8215338401921093
