In [2]:
import torch
import torch.nn as nn

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, plot_roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, f1_score, plot_roc_curve, roc_auc_score, accuracy_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords


import thor

In [3]:
train = pd.read_csv('../data/total_train.csv')

X = train['title'].str.lower()
y = train['class']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.1, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

train_sentences = thor.tokenize(X_train, pattern='\w\w+')
test_sentences =thor.tokenize(X_test, pattern = '\w\w+')

In [5]:
train_sentences

0                    [common, signs, of, cervical, cancer]
1        [cubana, chief, priest, hires, rain, makers, t...
2        [geomagnetic, storm, may, have, effectively, d...
3        [masten, wanjala, mob, beats, kenyan, child, s...
4        [astro, founding, member, of, the, reggae, gro...
                               ...                        
72291    [urgent, call, for, sperm, donors, as, birming...
72292    [tiktok, suspends, livestreaming, and, new, up...
72293    [seal, lying, in, sunbeam, could, be, depresse...
72294    [lumion, 11, crack, amp, serial, key, full, ve...
72295    [the, appearance, of, the, monuments, will, be...
Name: title, Length: 72296, dtype: object

In [6]:
# import GloVe model
glove_input_file = '../word2vec/glove.6B.100d.txt'
word2vec_output_file = '../word2vec/word2vec.txt'

glove2word2vec(glove_input_file, word2vec_output_file)

w2v = KeyedVectors.load_word2vec_format('../word2vec/word2vec.txt', binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


In [47]:
# turn each word into a 100-dimensional vector. Words which are not in our vocabulary will have to be encoded as all 0's
def word_to_vector(word):
    try:
        return np.array(w2v[word])
    except:
        return np.zeros(100)
    
# turn a document an array of 30 vectors
def sent_to_tensor(sent):
    n = len(sent)
    # if the sentence is less than 30 words, we'll need to add some padding
    # note: we pad the front; this is because of how an RNN flows and what we are trying to extract
    if n < 30:
        return np.array([np.zeros(100) for i in range(30-n)]+[word_to_vector(word) for word in sent])
    else:
        return np.array([word_to_vector(sent[i]) for i in range(30)])

# turn the corpus into a 3-tensor
def corpus_to_vector(ser):
    return np.array([sent_to_tensor(sent) for sent in ser])


In [48]:
sent_to_tensor(train_sentences[0])

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.1529    , -0.24279   ,  0.89837003, ..., -0.59100002,
         1.00390005,  0.20664001],
       [ 0.092774  ,  0.98281997, -0.25476   , ..., -0.54462999,
        -0.34422001, -0.90757   ],
       [ 0.30875999,  0.57172   , -0.76573002, ..., -0.49552   ,
         0.36862999, -0.43560001]])

In [49]:
# convert data to vector embeddings
Xvec_train = corpus_to_vector(train_sentences)

Xvec_test = corpus_to_vector(test_sentences)

In [50]:
Xvec_train.shape

(72296, 30, 100)

In [51]:
Xvec_train[0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.1529    , -0.24279   ,  0.89837003, ..., -0.59100002,
         1.00390005,  0.20664001],
       [ 0.092774  ,  0.98281997, -0.25476   , ..., -0.54462999,
        -0.34422001, -0.90757   ],
       [ 0.30875999,  0.57172   , -0.76573002, ..., -0.49552   ,
         0.36862999, -0.43560001]])

In [52]:
# convert embeddings to PyTorch tensors
Xten_train = torch.tensor(Xvec_train, dtype=torch.float32, device='cuda')

In [35]:
# create OnionNet Model with LSTM layer
class OnionNet(nn.Module):
    def __init__(self):
        super(OnionNet, self).__init__()
        # The first hidden layer is an LSTM layer. It takes as input shapes of (batch_size, sequence length, embedding)
        # The output will be of shape (sequence length, hidden units)
        self.lstm = nn.LSTM(input_size = 100, hidden_size = 64, batch_first=True, device='cuda')
        # add one more hidden layer (just for experimentation)
        self.hidden = nn.Linear(in_features = 64, out_features = 32, device='cuda')
        # output layer. Since we're using BCELoss, we'll need to run this through a sigmoid
        self.output = nn.Linear(in_features = 32, out_features = 1, device='cuda')
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        # because lstm_out returns an output for each of the 30 tokens
        # we'll need to manually extract the last output value (as it "summarizes" the entire sentenc)
        z = lstm_out[-1]
        z = self.tanh(self.hidden(z))
        z = self.output(z)
        z = self.sigmoid(z)
        return z

In [36]:
# play around with the LSTM layer to see the input / output sizes
lstm_out, (h_n, c_n) = nn.LSTM(input_size=100, hidden_size=1, batch_first=True)(torch.tensor(Xvec_train[1], dtype=torch.float32))


lstm_out.shape, h_n.shape, c_n.shape

(torch.Size([30, 1]), torch.Size([1, 1]), torch.Size([1, 1]))

- Notice that ```lstm_out``` is an array of size 30. This is because we had 30 tokens per sentence and the LSTM returns an output for each token.
- Of course, output we actually care about is the one at the end.

In [37]:
lstm_out[-1]

tensor([0.1970], grad_fn=<SelectBackward0>)

In [38]:
# recast labels as pytorch tensor
y2_train = torch.tensor(y_train, dtype=torch.float32, device='cuda')

In [39]:
# reshape for training purposes
y2_train = y2_train.view(y2_train.shape[0],1)

In [40]:
y2_train[0]

tensor([0.], device='cuda:0')

In [53]:
# instantiate model, loss, and optimizer
onionnet = OnionNet()

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(onionnet.parameters(), lr = 0.001)



In [54]:
# training time!
epochs = 5

for epoch in range(epochs):
    print(" ")
    print(f"Epoch {epoch+1} =======================")
    print(" ")
    # we will treat each sentence as a batch of inputs
    for i, row in enumerate(Xten_train):
        running_loss = 0
        output = onionnet(row)
        loss = criterion(output, y2_train[i])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        running_loss += loss.item()
        if i%10000 == 0:
            print(f'average loss: {running_loss/(i+1)}')

 
 
average loss: 0.6574232578277588
average loss: 5.263331582839221e-06
average loss: 1.155833664026733e-06
average loss: 3.485726106604323e-07
average loss: 6.051477563461362e-07
average loss: 3.464714061312455e-07
average loss: 3.396275038480842e-07
average loss: 2.3272407526452277e-07
 
 
average loss: 0.03673220053315163
average loss: 3.0362991428580263e-06
average loss: 5.517982184502502e-07
average loss: 1.6803145649524448e-07
average loss: 5.641443062262524e-06
average loss: 6.252781415862542e-07
average loss: 2.3725523689651833e-07
average loss: 1.3242998299306668e-07
 
 
average loss: 0.048866018652915955
average loss: 1.898771823316726e-06
average loss: 1.5565290436924211e-06
average loss: 1.2170006755576411e-07
average loss: 1.762014535518799e-07
average loss: 2.105834872965188e-07
average loss: 1.5673632276640127e-07
average loss: 7.631753511022574e-08
 
 
average loss: 0.026623273268342018
average loss: 7.026282659400381e-07
average loss: 4.867654451906225e-07
average los

In [55]:
Xten_test = torch.tensor(Xvec_test, dtype=torch.float32, device='cuda')

In [56]:
with torch.no_grad():
    correct = 0
    for i,row in enumerate(Xten_test):
        if onionnet(row).item() >= 0.5:
            prediction = 1
        else:
            prediction = 0
        
        if prediction == y_test[i]:
            correct+=1
            
    print(correct/len(y_test))

0.9177144279845637


The basic LSTM model improved accuracy over the feed-forward neural network by approximately 1% .