In [109]:
# core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# nlp processing / cleaning
import spacy
import nltk

# tokenizers
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import joblib

# warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Read in Data

In [129]:
# Read the train / test / validation splits from the local data directory
train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('data/liars_train.csv', 'data/liars_test.csv', 'data/liars_valid.csv')
)

In [3]:
# Sanity check on the load: show the first training statement
print(train.statement[0])

Says the Annies List political group supports third-trimester abortions on demand.


## Clean up text - Lemmatize, NER

In [4]:
nlp = spacy.load('en_core_web_sm')


def lemmatize_statement(text):
    """Return `text` with every spaCy token replaced by its lower-cased lemma.

    Tokens are re-joined with single spaces.
    """
    return ' '.join(token.lemma_.lower() for token in nlp(text))


# Normalise every split that is vectorised below in the same way. Lemmatizing
# only the training split would mean the test statements are matched against a
# vocabulary that was fitted on differently-preprocessed text.
# NOTE(review): `valid` is left untouched because no visible cell consumes it;
# apply `lemmatize_statement` to it as well before using it.
train.statement = train.statement.apply(lemmatize_statement)
test.statement = test.statement.apply(lemmatize_statement)

## Feature Engineering - TF-IDF, Word2Vec, BERT and GPT-2 Embeddings

TF-IDF

In [133]:
# Unigram + bigram TF-IDF restricted to 1000 features, with English stop words dropped
tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), max_features = 1000)

# Fit the vocabulary on the training statements only, reuse it for the test
# statements, and densify each sparse matrix into a float tensor
tfidf_train, tfidf_test = (
    torch.tensor(sparse_matrix.toarray(), dtype = torch.float)
    for sparse_matrix in (tfidf.fit_transform(train.statement), tfidf.transform(test.statement))
)

print(tfidf_train.shape, tfidf_test.shape)

torch.Size([10240, 1000]) torch.Size([1267, 1000])


Word2Vec

In [21]:
# Make sure the Punkt data used by word_tokenize is available locally
nltk.download('punkt')

# Lower-case and tokenize every training statement
tokenized_statements_train = []
for raw_statement in train.statement:
    tokenized_statements_train.append(nltk.tokenize.word_tokenize(raw_statement.lower()))

# Train Word2Vec on the training statements only
w2v_model = Word2Vec(
    sentences = tokenized_statements_train,
    vector_size = 1000,
    window = 5,
    min_count = 1,
    workers = 4,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
def get_sentence_vector(statement, model):
    """Average the word vectors of the in-vocabulary tokens of one statement.

    Parameters
    ----------
    statement : iterable of str
        Tokens of a single statement.
    model : trained Word2Vec-style model
        Must expose `wv.key_to_index`, `wv[list_of_words]`, `wv.vectors`
        and `vector_size`.

    Returns
    -------
    numpy.ndarray
        1-D array of length `model.vector_size`. When none of the tokens is
        in the vocabulary, an all-zero vector is returned instead.
    """
    words = [word for word in statement if word in model.wv.key_to_index]
    if words:
        return np.mean(model.wv[words], axis=0)

    # np.zeros defaults to float64; use the embedding dtype so that a single
    # out-of-vocabulary statement does not upcast the whole stacked matrix.
    return np.zeros(model.vector_size, dtype = model.wv.vectors.dtype)
    
# One averaged word vector per tokenized training statement, stacked row-wise
w2v_train = np.array([
    get_sentence_vector(tokens, w2v_model) for tokens in tokenized_statements_train
])

# Tokenize the test statements the same way and embed them with the same model
tokenized_statements_test = []
for raw_statement in test.statement:
    tokenized_statements_test.append(nltk.tokenize.word_tokenize(raw_statement.lower()))

w2v_test = np.array([
    get_sentence_vector(tokens, w2v_model) for tokens in tokenized_statements_test
])

In [23]:
# Show the shapes of the Word2Vec feature matrices for the train and test splits
print(w2v_train.shape, w2v_test.shape)

(10240, 1000) (1267, 1000)


BERT

In [13]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# NOTE(review): the generic names `tokenizer` / `model` are rebound to GPT-2 in
# a later cell, so re-running the BERT embedding cell after that point would
# use the GPT-2 objects instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [12]:
def get_embeddings_bert(statement, tokenizer, model, max_length = None):
    """Embed a statement as the mean of the model's final-layer token states.

    Parameters
    ----------
    statement : str
        Text to embed.
    tokenizer : callable
        Tokenizer returning keyword tensors accepted by `model`.
    model : callable
        Encoder whose output exposes `last_hidden_state` and whose `config`
        exposes `max_position_embeddings`.
    max_length : int, optional
        Truncation length in tokens. Defaults to the model's own sequence
        limit (`model.config.max_position_embeddings`).

    Returns
    -------
    torch.Tensor
        Token states averaged over the sequence axis, with singleton
        dimensions squeezed out.
    """
    if max_length is None:
        # The previous hard-coded 768 is BERT's hidden size, not its sequence
        # limit; truncating at the position-embedding size keeps long inputs
        # from indexing past the position table.
        max_length = model.config.max_position_embeddings

    # `encoded` rather than `input`, which would shadow the builtin
    encoded = tokenizer(
        statement, return_tensors = 'pt',
        padding = True, truncation = True, max_length = max_length
    )

    # Inference only: no gradients are needed for feature extraction
    with torch.no_grad():
        output = model(**encoded)
    embeddings_vector = output.last_hidden_state.mean(dim = 1).squeeze()

    return embeddings_vector

In [14]:
# Embed every statement one at a time, then stack each split's vectors into a 2-D array
bert_train, bert_test = (
    np.array(split.statement.apply(get_embeddings_bert, args = (tokenizer, model)).tolist())
    for split in (train, test)
)

print(bert_train.shape, bert_test.shape)

(10240, 768) (1267, 768)


GPT

In [15]:
# Load the pretrained 'gpt2' tokenizer and model.
# NOTE(review): this rebinds `tokenizer` / `model`, which previously held the
# BERT objects; cells that use these names depend on which loader ran last.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 27.8MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 35.1MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 44.4MB/s]
config.json: 100%|██████████| 665/665 [00:00<00:00, 1.33MB/s]
model.safetensors: 100%|██████████| 548M/548M [00:14<00:00, 38.6MB/s] 


In [18]:
def get_embeddings_gpt(statement, tokenizer, model):
    """Embed a statement as the mean of the model's final-layer token states.

    The statement is tokenized without padding and truncated to at most 768
    tokens. The returned tensor is the token states averaged over the sequence
    axis, with singleton dimensions squeezed out.
    """
    tokenizer_options = {'return_tensors': 'pt', 'truncation': True, 'max_length': 768}
    encoded = tokenizer(statement, **tokenizer_options)

    # Forward pass only; gradient tracking is switched off
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return torch.mean(hidden_states, dim = 1).squeeze()

In [19]:
# Embed every statement one at a time, then stack each split's vectors into a 2-D array
gpt_train, gpt_test = (
    np.array(split.statement.apply(get_embeddings_gpt, args = (tokenizer, model)).tolist())
    for split in (train, test)
)

print(gpt_train.shape, gpt_test.shape)

(10240, 768) (1267, 768)


In [99]:
# Targets for each split, kept in their original label form
y_train, y_test = train.label, test.label

# Prep labels for nn training: map each distinct training label to an integer
# id (ids follow the sorted order returned by np.unique) and store as a long tensor
label_to_int = {name: code for code, name in enumerate(np.unique(y_train))}
y_train_tensor = torch.tensor(
    [label_to_int[name] for name in y_train],
    dtype = torch.long
)

## Modeling

Model definitions

In [100]:
class RnnTextClassifier(nn.Module):
    """Stacked `nn.RNN` followed by a linear layer that outputs class logits.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    output_size : int
        Number of output classes (logits).
    hidden_size : int
        Width of the recurrent hidden state.
    num_layers : int
        Number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super().__init__()

        # model params
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape (batch, output_size).

        `x` may be (batch, input_size), which is treated as a sequence of
        length one, or an already-sequenced (batch, seq_len, input_size).
        """
        # a flat feature vector becomes a one-step sequence
        if x.dim() == 2:
            x = x.unsqueeze(1)

        # Create the initial hidden state next to the input: a default
        # CPU/float32 tensor would fail for CUDA or non-float32 inputs.
        hidden = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device = x.device, dtype = x.dtype
        )

        # classify from the output of the last time step
        out, _ = self.rnn(x, hidden)
        return self.fc(out[:, -1, :])

class RnnDataset(Dataset):
    """Pairs each feature row with its label for use with a `DataLoader`.

    Parameters
    ----------
    X_data : indexable, sized
        Feature rows.
    y_data : indexable, sized
        Labels, one per feature row.

    Raises
    ------
    ValueError
        If `X_data` and `y_data` differ in length.
    """

    def __init__(self, X_data, y_data):
        # Fail early instead of part-way through an epoch with an IndexError
        if len(X_data) != len(y_data):
            raise ValueError(
                f'X_data and y_data must have the same length, got {len(X_data)} and {len(y_data)}'
            )
        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        """Number of samples."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the `(features, label)` pair at `index`."""
        return self.X_data[index], self.y_data[index]

def train_rnn(model, data_loader, criterion, optimizer, n_epochs):
    """Train `model` in place and return it.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise.
    data_loader : iterable
        Yields `(X_batch, y_batch)` pairs.
    criterion : callable
        Loss function called as `criterion(predictions, targets)`.
    optimizer : torch.optim.Optimizer
        Optimizer bound to the model's parameters.
    n_epochs : int or iterable
        Number of passes over `data_loader`. An iterable such as `range(100)`
        is also accepted, giving one pass per item.

    Returns
    -------
    torch.nn.Module
        The same `model` object, after training.
    """
    # The name suggests a count, but the original loop iterated over it, so a
    # plain integer raised TypeError. Accept both forms.
    epochs = range(n_epochs) if isinstance(n_epochs, (int, np.integer)) else n_epochs

    model.train()

    for _ in epochs:

        for X_batch, y_batch in data_loader:

            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return model

In [138]:
# One-epoch smoke test of the RNN pipeline on the TF-IDF features
X_train = tfidf_train.detach().clone()
n_classes = len(y_train.unique())

rnn = RnnTextClassifier(
    input_size = X_train.shape[1],
    output_size = n_classes,
    hidden_size = 256,
    num_layers = 2
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr = 0.001)
n_epochs = range(1)
dataset = RnnDataset(X_train, y_train_tensor)
data_loader = DataLoader(dataset, batch_size = int(X_train.shape[0] / 128), shuffle = True)

# Keep the trained network under its own name: binding it to `test` would
# replace the test DataFrame that the feature-engineering cells read from.
rnn_smoke = train_rnn(rnn, data_loader, criterion, optimizer, n_epochs)

# Push a single batch through the trained network and check the logits' shape
X_batch, y_batch = next(iter(data_loader))
with torch.no_grad():
    y_pred = rnn_smoke(X_batch)

assert y_pred.shape == (X_batch.shape[0], n_classes)

Fitting

In [139]:
start_time = time.perf_counter()
n_classes = len(y_train.unique())

# Train and persist every model family once per feature representation.
# The loop variable is deliberately not called `X_train`, so the global of
# that name is not silently rebound to whichever feature set ran last.
for features, X_name in zip([tfidf_train, w2v_train, bert_train, gpt_train], ['tfidf', 'w2v', 'bert', 'gpt']):

    # recurrent neural network
    # The RNN weights are float32, so feed it a float32 tensor regardless of
    # whether the features arrive as a tensor or as a (possibly float64) array.
    rnn_features = torch.as_tensor(features, dtype = torch.float)
    rnn = RnnTextClassifier(
        input_size = rnn_features.shape[1], output_size = n_classes,
        hidden_size = 256, num_layers = 2
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr = 0.001)
    n_epochs = range(100)
    dataset = RnnDataset(rnn_features, y_train_tensor)
    # roughly 128 batches per epoch; never let the batch size drop to 0
    data_loader = DataLoader(dataset, batch_size = max(1, rnn_features.shape[0] // 128), shuffle = True)
    trained_rnn = train_rnn(rnn, data_loader, criterion, optimizer, n_epochs)
    # NOTE(review): this pickles the whole module, so loading requires the
    # RnnTextClassifier class to be importable; consider saving state_dict().
    torch.save(trained_rnn, f'models/rnn_model_{X_name}.pth')
    print(f'Finished with RNN-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

    # logistic regression
    lr = LogisticRegression()
    lr.fit(features, y_train)
    joblib.dump(lr, f'models/lr_model_{X_name}.joblib')
    print(f'Finished with LR-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

    # random forest
    rf = RandomForestClassifier()
    rf.fit(features, y_train)
    joblib.dump(rf, f'models/rf_model_{X_name}.joblib')
    print(f'Finished with RF-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

    # support vector machine
    svm = SVC()
    svm.fit(features, y_train)
    joblib.dump(svm, f'models/svm_model_{X_name}.joblib')
    print(f'Finished with SVM-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

Finished with RNN-tfidf - Time elapsed: 0.06

Finished with LR-tfidf - Time elapsed: 0.07



KeyboardInterrupt: 

In [120]:
# Scratch check: number of rows in whatever `X_train` is currently bound to
X_train.shape[0]

5