# Training GloVe on our own dataset

In [2]:
import os
import pandas as pd
import numpy as np
import pickle
import random
from scipy.sparse import *

### Loading the co-occurence matrix

In [4]:
DATA_PATH = 'data'

In [13]:
with open(os.path.join(DATA_PATH, 'cooc.pkl'), 'rb') as f:
        cooc = pickle.load(f)
print("{} nonzero entries".format(cooc.nnz))

6496907 nonzero entries


In [14]:
nmax = 100
print("using nmax =", nmax, ", cooc.max() =", cooc.max())

using nmax = 100 , cooc.max() = 207302


In [15]:
print("initializing embeddings")
embedding_dim = 30
xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

initializing embeddings


In [16]:
cooc.toarray()

array([[207302,  73069,  57298, ...,      8,      4,      5],
       [ 73069, 204448,  44668, ...,      5,      5,     25],
       [ 57298,  44668, 126287, ...,      3,      6,     10],
       ...,
       [     8,      5,      3, ...,      5,      0,      0],
       [     4,      5,      6, ...,      0,      5,      0],
       [     5,     25,     10, ...,      0,      0,      5]])

### Training GloVe

In [7]:
epochs = 20

In [8]:
eta = 0.001
alpha = 3 / 4

In [10]:
for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            logn = np.log(n)
            fn = min(1.0, (n / nmax) ** alpha)
            x, y = xs[ix, :], ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            xs[ix, :] += scale * y
            ys[jy, :] += scale * x


epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19


In [11]:
np.savez(os.path.join(DATA_PATH, 'embeddings'), xs, ys)

In [17]:
 npzfile  = np.load(os.path.join(DATA_PATH, 'embeddings.npz'))

In [18]:
xs = npzfile['arr_0']

In [19]:
ys = npzfile['arr_1']

Loading vocab

In [20]:
with open(os.path.join(DATA_PATH, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)
vocab_size = len(vocab)
vocab_size

21161

### Create sentence embeddings by averaging every word in each sentence

In [21]:
idx_to_embedding =xs+ys

In [22]:
def create_sent_embeddings(data_path):
    embeddings = []
    with open(data_path) as file:
        for line in file:
            tmp = np.zeros(embedding_dim)
            count = 0
            for val in line.strip().split():
                IDX = vocab.get(val, -1)
                if(IDX!= -1):
                    tmp+=idx_to_embedding[IDX]
                    count+=1
            if(count!= 0):
                tmp = tmp/count
            embeddings.append(tmp)
    return embeddings


In [23]:
pos_train = create_sent_embeddings(os.path.join(DATA_PATH, 'train_pos.txt'))

In [24]:
pos_train_arr = np.array(pos_train)

In [25]:
pos_train_arr.shape

(100000, 30)

In [26]:
neg_train = create_sent_embeddings(os.path.join(DATA_PATH, 'train_neg.txt'))

In [27]:
neg_train_arr = np.array(neg_train)

In [28]:
neg_train_arr.shape

(100000, 30)

In [29]:
poslabels = np.repeat(1,pos_train_arr.shape[0])

In [30]:
neglabels = np.repeat(-1,neg_train_arr.shape[0])

In [31]:
labels = np.append(poslabels,neglabels)

In [32]:
training_set = np.concatenate((pos_train_arr, neg_train_arr))

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score

### Random Forest Classifier

Training and validation

In [35]:
X_train, X_valid, y_train, y_valid = train_test_split(training_set, labels, test_size=0.3, random_state=42)

In [36]:
clf = RandomForestClassifier(156, verbose=True)

In [37]:
clf.fit(X_train,  y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:  2.1min finished


RandomForestClassifier(n_estimators=156, verbose=True)

In [38]:
preds= clf.predict(X_valid)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:    2.1s finished


In [39]:
print(classification_report(y_valid, preds) )

precision    recall  f1-score   support

          -1       0.69      0.58      0.63     30149
           1       0.64      0.74      0.68     29851

    accuracy                           0.66     60000
   macro avg       0.66      0.66      0.66     60000
weighted avg       0.66      0.66      0.66     60000



In [40]:
print( accuracy_score(y_valid, preds))

0.6601


Testing

In [41]:
test_data = create_sent_embeddings(os.path.join(DATA_PATH, 'test_data.txt'))

In [42]:
test_embeddings = np.array(test_data)
test_embeddings.shape

(10000, 30)

In [43]:
test_predictions = clf.predict(test_embeddings)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:    0.6s finished


In [44]:
id_count = 1 
output_arr = []
for val in test_predictions:
    print(id_count, val)
    output_arr.append([id_count, val])
    id_count+=1

1
7319 -1
7320 1
7321 -1
7322 -1
7323 1
7324 1
7325 -1
7326 1
7327 1
7328 1
7329 1
7330 1
7331 1
7332 1
7333 -1
7334 1
7335 1
7336 -1
7337 1
7338 1
7339 1
7340 -1
7341 -1
7342 1
7343 -1
7344 -1
7345 -1
7346 1
7347 -1
7348 -1
7349 -1
7350 1
7351 -1
7352 -1
7353 1
7354 1
7355 -1
7356 1
7357 -1
7358 -1
7359 -1
7360 -1
7361 -1
7362 -1
7363 1
7364 -1
7365 1
7366 -1
7367 1
7368 -1
7369 -1
7370 1
7371 1
7372 1
7373 1
7374 1
7375 -1
7376 1
7377 1
7378 1
7379 -1
7380 -1
7381 1
7382 -1
7383 -1
7384 -1
7385 1
7386 1
7387 1
7388 1
7389 -1
7390 1
7391 -1
7392 1
7393 1
7394 1
7395 -1
7396 1
7397 1
7398 1
7399 -1
7400 1
7401 -1
7402 -1
7403 -1
7404 1
7405 -1
7406 -1
7407 1
7408 1
7409 -1
7410 -1
7411 -1
7412 1
7413 1
7414 1
7415 1
7416 -1
7417 1
7418 -1
7419 -1
7420 1
7421 -1
7422 1
7423 1
7424 -1
7425 1
7426 1
7427 -1
7428 -1
7429 1
7430 -1
7431 -1
7432 1
7433 1
7434 -1
7435 -1
7436 -1
7437 1
7438 -1
7439 -1
7440 -1
7441 1
7442 -1
7443 1
7444 -1
7445 1
7446 1
7447 -1
7448 -1
7449 -1
7450 -1
7451 1
7

In [45]:
output_df = pd.DataFrame(np.array(output_arr))
output_df.columns=["Id", "Prediction"]
output_df.set_index('Id', inplace=True)
output_df.to_csv("predfiles/Predictions_CIL_Glove_RFC.csv")

# Stanford Glove Embeddings

In [41]:
EMBEDDING_PATH = 'embeddings/glove'

In [42]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd

In [43]:
embeddings_dict = {}

Create dictionary with stanford embeddings

In [46]:
with open(os.path.join(EMBEDDING_PATH, "glove.twitter.27B.25d.txt"), 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

Now use word embeddings to get sentence embeddings

In [47]:
def create_sent_embeddings_stanford(data_path):
    embeddings = []
    with open(data_path) as file:
        for line in file:
            tmp = np.zeros(25)
            count = 0
            for val in line.strip().split():
                if val in embeddings_dict:
                    tmp+=embeddings_dict[val]
                    count+=1
            if(count!= 0):
                tmp = tmp/count
            embeddings.append(tmp)  
    return embeddings

In [48]:
pos_train = create_sent_embeddings_stanford(os.path.join(DATA_PATH, 'train_pos.txt'))

In [49]:
pos_train_arr = np.array(pos_train)
pos_train_arr.shape

(100000, 25)

In [50]:
neg_train = create_sent_embeddings_stanford(os.path.join(DATA_PATH, 'train_neg.txt'))

In [51]:
neg_train_arr = np.array(neg_train)
neg_train_arr.shape

(100000, 25)

In [52]:
poslabels = np.repeat(1,pos_train_arr.shape[0])
neglabels = np.repeat(-1,neg_train_arr.shape[0])

In [53]:
labels = np.append(poslabels,neglabels)
training_set = np.concatenate((pos_train_arr, neg_train_arr))

In [54]:
X_train, X_valid, y_train, y_valid = train_test_split(training_set, labels, test_size=0.3, random_state=42)

In [55]:
clf = RandomForestClassifier(156,n_jobs =1, verbose=True)
clf.fit(X_train,  y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:  2.2min finished


RandomForestClassifier(n_estimators=156, n_jobs=1, verbose=True)

In [81]:
preds= clf.predict(X_valid)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:    2.3s finished


In [82]:
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

          -1       0.79      0.69      0.74     30149
           1       0.72      0.81      0.76     29851

    accuracy                           0.75     60000
   macro avg       0.75      0.75      0.75     60000
weighted avg       0.75      0.75      0.75     60000



In [83]:
print(accuracy_score(y_valid,preds))

0.7503833333333333


Now, test data from competition (again)

In [84]:
test_sentences = create_sent_embeddings_stanford(os.path.join(DATA_PATH, 'test_data.txt'))

In [85]:
test_embeddings = np.array(test_sentences)
test_predictions = clf.predict(test_embeddings)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 156 out of 156 | elapsed:    0.5s finished


In [86]:
id_count = 1 
output_arr = []
for val in test_predictions:
    #print(id_count, val)
    output_arr.append([id_count, val])
    id_count+=1

In [87]:
len(output_arr)

10000

In [88]:
output_df = pd.DataFrame(np.array(output_arr))
output_df.columns=["Id", "Prediction"]
output_df.set_index('Id', inplace=True)
output_df.to_csv("predfiles/Predictions_Stanford_GloVe_RFC.csv")