In [1]:
# core
import pandas as pd
import numpy as np
import time

# nlp processing / cleaning
import spacy
import nltk

# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# functional
import joblib
import pickle

# custom
from scripts.classes import RnnTextClassifier, RnnDataset
from scripts.functions import get_sentence_vector, get_embeddings_bert, get_embeddings_gpt, train_rnn

# warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


## Read in Data & Clean (lemmitize, lower)

In [2]:
train = pd.read_csv('data/liars_train.csv')
test = pd.read_csv('data/liars_test.csv')
valid = pd.read_csv('data/liars_valid.csv')

In [3]:
nlp = spacy.load('en_core_web_sm')
train.statement = train.statement.apply(lambda x: ' '.join([token.lemma_.lower() for token in nlp(x)]))

## Feature Engineering - TF-IDF, Word2Vec, BERT/GPT Embeddings

TF IDF

In [4]:
tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), max_features = 1000)
tfidf_train = tfidf.fit_transform(train.statement)
tfidf_test = tfidf.transform(test.statement)

tfidf_train = torch.tensor(tfidf_train.toarray(), dtype = torch.float)
tfidf_test = torch.tensor(tfidf_test.toarray(), dtype = torch.float)

print(tfidf_train.shape, tfidf_test.shape)

torch.Size([10240, 1000]) torch.Size([1267, 1000])


Word2Vec

In [5]:
# Train model
nltk.download('punkt')

tokenized_statements_train = [nltk.tokenize.word_tokenize(statement.lower()) for statement in train.statement]

w2v_model = Word2Vec(
    sentences = tokenized_statements_train, 
    vector_size = 1000, window = 5, min_count = 1, workers = 4
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
w2v_train = pd.Series(tokenized_statements_train).apply(lambda x: get_sentence_vector(x, w2v_model))
w2v_train = np.array(w2v_train.tolist())
w2v_train = torch.tensor(w2v_train, dtype = torch.float)

tokenized_statements_test = [nltk.tokenize.word_tokenize(statement.lower()) for statement in test.statement]
w2v_test = pd.Series(tokenized_statements_test).apply(lambda x: get_sentence_vector(x, w2v_model))
w2v_test = np.array(w2v_test.tolist())
w2v_test = torch.tensor(w2v_test, dtype = torch.float)

In [7]:
print(w2v_train.shape, w2v_test.shape)

torch.Size([10240, 1000]) torch.Size([1267, 1000])


BERT

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [9]:
bert_train = train.statement.apply(lambda x: get_embeddings_bert(x, tokenizer, model))
bert_test = test.statement.apply(lambda x: get_embeddings_bert(x, tokenizer, model))

bert_train = np.array(bert_train.tolist())
bert_test = np.array(bert_test.tolist())

bert_train = torch.tensor(bert_train, dtype = torch.float)
bert_test = torch.tensor(bert_test, dtype = torch.float)

print(bert_train.shape, bert_test.shape)

torch.Size([10240, 768]) torch.Size([1267, 768])


GPT

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [11]:
gpt_train = train.statement.apply(lambda x: get_embeddings_gpt(x, tokenizer, model))
gpt_test = test.statement.apply(lambda x: get_embeddings_gpt(x, tokenizer, model))

gpt_train = np.array(gpt_train.tolist())
gpt_test = np.array(gpt_test.tolist())

gpt_train = torch.tensor(gpt_train, dtype = torch.float)
gpt_test = torch.tensor(gpt_test, dtype = torch.float)

print(gpt_train.shape, gpt_test.shape)

torch.Size([10240, 768]) torch.Size([1267, 768])


Labels

In [12]:
y_train = train.label
y_test = test.label

# Create conversion dicts
label_to_int = {label: idx for idx, label in enumerate(np.unique(y_train))}
int_to_label = {idx: label for idx, label in enumerate(np.unique(y_train))}

# Make tensors for NN
y_train_tensor = np.array([label_to_int[label] for label in y_train])
y_train_tensor = torch.tensor(y_train_tensor, dtype = torch.long)

Data Export

In [13]:
data_list = [tfidf_train, w2v_train, bert_train, gpt_train, tfidf_test, w2v_test, bert_test, gpt_test, y_train, y_test]
data_names = ['tfidf_train', 'w2v_train', 'bert_train', 'gpt_train', 'tfidf_test', 'w2v_test', 'bert_test', 'gpt_test', 'y_train', 'y_test']

for var_df, var_name in zip(data_list, data_names):
    with open(f'data/{var_name}.pkl', 'wb') as f:
        pickle.dump(var_df, f)

## Modeling

In [14]:
start_time = time.perf_counter()

for X_train, X_name in zip([tfidf_train, w2v_train, bert_train, gpt_train], ['tfidf', 'w2v', 'bert', 'gpt']):

    # recurrent neural network
    rnn = RnnTextClassifier(
        input_size = X_train.shape[1], output_size = len(y_train.unique()), 
        hidden_size = 256, num_layers = 2
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr = 0.001)
    n_epochs = range(100)
    dataset = RnnDataset(X_train, y_train_tensor)
    data_loader = DataLoader(dataset, batch_size = int(X_train.shape[0] / 128), shuffle = True)
    trained_rnn = train_rnn(rnn, data_loader, criterion, optimizer, n_epochs)
    torch.save(trained_rnn, f'models/rnn_multi_{X_name}.pth')
    print(f'Finished with RNN-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

    # logistic regression
    lr = LogisticRegression(max_iter = 4000)
    lr.fit(X_train, y_train)
    joblib.dump(lr, f'models/lr_multi_{X_name}.joblib')
    print(f'Finished with LR-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

    # random forest
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    joblib.dump(rf, f'models/rf_multi_{X_name}.joblib')
    print(f'Finished with RF-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')
        
    # support vector machine
    svm = SVC()
    svm.fit(X_train, y_train)
    joblib.dump(svm, f'models/svm_multi_{X_name}.joblib')
    print(f'Finished with SVM-{X_name} - Time elapsed: {(time.perf_counter()-start_time)/60:.2f}\n')

Finished with RNN-tfidf - Time elapsed: 0.56

Finished with LR-tfidf - Time elapsed: 0.57

Finished with RF-tfidf - Time elapsed: 0.73

Finished with SVM-tfidf - Time elapsed: 1.62

Finished with RNN-w2v - Time elapsed: 2.18

Finished with LR-w2v - Time elapsed: 2.22

Finished with RF-w2v - Time elapsed: 2.70

Finished with SVM-w2v - Time elapsed: 3.58

Finished with RNN-bert - Time elapsed: 4.11

Finished with LR-bert - Time elapsed: 4.20

Finished with RF-bert - Time elapsed: 4.57

Finished with SVM-bert - Time elapsed: 5.17

Finished with RNN-gpt - Time elapsed: 5.71



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished with LR-gpt - Time elapsed: 6.00

Finished with RF-gpt - Time elapsed: 6.37

Finished with SVM-gpt - Time elapsed: 7.00



In [15]:
lr_tfidf = joblib.load('models/lr_multi_tfidf.joblib')
lr_w2v = joblib.load('models/lr_multi_w2v.joblib')
lr_bert = joblib.load('models/lr_multi_bert.joblib')
lr_gpt = joblib.load('models/lr_multi_gpt.joblib')

rf_tfidf = joblib.load('models/rf_multi_tfidf.joblib')
rf_w2v = joblib.load('models/rf_multi_w2v.joblib')
rf_bert = joblib.load('models/rf_multi_bert.joblib')
rf_gpt = joblib.load('models/rf_multi_gpt.joblib')

svm_tfidf = joblib.load('models/svm_multi_tfidf.joblib')
svm_w2v = joblib.load('models/svm_multi_w2v.joblib')
svm_bert = joblib.load('models/svm_multi_bert.joblib')
svm_gpt = joblib.load('models/svm_multi_gpt.joblib')

rnn_tfidf = torch.load('models/rnn_multi_tfidf.pth')
rnn_w2v = torch.load('models/rnn_multi_w2v.pth')
rnn_bert = torch.load('models/rnn_multi_bert.pth')
rnn_gpt = torch.load('models/rnn_multi_gpt.pth')

## Evaluation

In [16]:
models = [
    rnn_tfidf, rnn_w2v, rnn_bert, rnn_gpt,
    lr_tfidf, lr_w2v, lr_bert, lr_gpt,
    rf_tfidf, rf_w2v, rf_bert, rf_gpt, 
    svm_tfidf, svm_w2v, svm_bert, svm_gpt
]

data_sets = [
    (tfidf_train, tfidf_test), (w2v_train, w2v_test), 
    (bert_train, bert_test), (gpt_train, gpt_test),
]

data_names = ['tfidf', 'w2v', 'bert', 'gpt']

In [20]:
columns = ['model', 'data', 'accuracy_is', 'accuracy_oos', 'precision_oos', 'recall_oos', 'f1_oos']

results = pd.DataFrame(columns = columns)

for model, data, data_name in zip(models, data_sets*4, data_names*4):
    X_train, X_test = data
    if 'Rnn' in str(model):
        y_pred_train = model(X_train).argmax(dim=1)
        y_pred_train = [int_to_label[idx.item()] for idx in y_pred_train]
        y_pred_test = model(X_test).argmax(dim=1)
        y_pred_test = [int_to_label[idx.item()] for idx in y_pred_test]
    else:
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
    
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    precision_oos = precision_score(y_test, y_pred_test, average='weighted')
    recall_oos = recall_score(y_test, y_pred_test, average='weighted')
    f1_oos = f1_score(y_test, y_pred_test, average='weighted')
    class_report_oos = classification_report(y_test, y_pred_test)

    result = dict(zip(columns, [
        str(model), data_name, accuracy_train, accuracy_test, 
        precision_oos, recall_oos, f1_oos
    ]))
    results = pd.concat([results, pd.DataFrame(result, index=[0])], ignore_index=True)
    
    print(
        f'Model: {model}\n'
        f'Data: {data_name}\n'
        f'In-sample accuracy: {accuracy_train:.3f}\n'
        f'Out-of-sample accuracy: {accuracy_test:.3f}\n'
        f'Classification report (OOS):\n{class_report_oos}\n'
        f'\n----------\n'
    )

Model: RnnTextClassifier(
  (rnn): RNN(1000, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=6, bias=True)
)
Data: tfidf
In-sample accuracy: 0.986
Out-of-sample accuracy: 0.194
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.26      0.23      0.24       212
       false       0.19      0.18      0.19       249
   half-true       0.21      0.21      0.21       265
 mostly-true       0.17      0.16      0.16       241
  pants-fire       0.11      0.12      0.11        92
        true       0.20      0.23      0.21       208

    accuracy                           0.19      1267
   macro avg       0.19      0.19      0.19      1267
weighted avg       0.20      0.19      0.19      1267


----------

Model: RnnTextClassifier(
  (rnn): RNN(1000, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_feat

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: LogisticRegression(max_iter=4000)
Data: tfidf
In-sample accuracy: 0.400
Out-of-sample accuracy: 0.239
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.27      0.18      0.22       212
       false       0.27      0.36      0.31       249
   half-true       0.24      0.28      0.26       265
 mostly-true       0.22      0.21      0.21       241
  pants-fire       0.10      0.03      0.05        92
        true       0.21      0.23      0.22       208

    accuracy                           0.24      1267
   macro avg       0.22      0.21      0.21      1267
weighted avg       0.23      0.24      0.23      1267


----------

Model: LogisticRegression(max_iter=4000)
Data: w2v
In-sample accuracy: 0.239
Out-of-sample accuracy: 0.220
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.15      0.05      0.08       212
       false       0.22      0.43      0.29       249
   half

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: RandomForestClassifier()
Data: gpt
In-sample accuracy: 0.999
Out-of-sample accuracy: 0.242
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.18      0.06      0.09       212
       false       0.22      0.34      0.27       249
   half-true       0.26      0.36      0.30       265
 mostly-true       0.27      0.34      0.30       241
  pants-fire       0.00      0.00      0.00        92
        true       0.22      0.16      0.18       208

    accuracy                           0.24      1267
   macro avg       0.19      0.21      0.19      1267
weighted avg       0.22      0.24      0.22      1267


----------

Model: SVC()
Data: tfidf
In-sample accuracy: 0.767
Out-of-sample accuracy: 0.232
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.26      0.12      0.16       212
       false       0.26      0.40      0.32       249
   half-true       0.23      0.30      0.26 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: SVC()
Data: w2v
In-sample accuracy: 0.236
Out-of-sample accuracy: 0.250
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.00      0.00      0.00       212
       false       0.23      0.50      0.32       249
   half-true       0.23      0.40      0.29       265
 mostly-true       0.33      0.36      0.34       241
  pants-fire       0.00      0.00      0.00        92
        true       0.00      0.00      0.00       208

    accuracy                           0.25      1267
   macro avg       0.13      0.21      0.16      1267
weighted avg       0.16      0.25      0.19      1267


----------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: SVC()
Data: bert
In-sample accuracy: 0.402
Out-of-sample accuracy: 0.260
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.30      0.11      0.16       212
       false       0.27      0.36      0.31       249
   half-true       0.25      0.41      0.31       265
 mostly-true       0.24      0.30      0.27       241
  pants-fire       0.00      0.00      0.00        92
        true       0.29      0.17      0.21       208

    accuracy                           0.26      1267
   macro avg       0.23      0.22      0.21      1267
weighted avg       0.25      0.26      0.24      1267


----------

Model: SVC()
Data: gpt
In-sample accuracy: 0.221
Out-of-sample accuracy: 0.213
Classification report (OOS):
              precision    recall  f1-score   support

 barely-true       0.00      0.00      0.00       212
       false       0.24      0.22      0.23       249
   half-true       0.21      0.82      0.33       265
 mostly-tr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
results

Unnamed: 0,model,data,accuracy_is,accuracy_oos,precision_oos,recall_oos,f1_oos
0,"RnnTextClassifier(\n (rnn): RNN(1000, 256, nu...",tfidf,0.985938,0.194159,0.196119,0.194159,0.194749
1,"RnnTextClassifier(\n (rnn): RNN(1000, 256, nu...",w2v,0.237012,0.239148,0.195117,0.239148,0.173715
2,"RnnTextClassifier(\n (rnn): RNN(768, 256, num...",bert,0.999414,0.229676,0.230131,0.229676,0.228597
3,"RnnTextClassifier(\n (rnn): RNN(768, 256, num...",gpt,0.856934,0.209945,0.201538,0.209945,0.192728
4,LogisticRegression(max_iter=4000),tfidf,0.399902,0.239148,0.232249,0.239148,0.231159
5,LogisticRegression(max_iter=4000),w2v,0.239258,0.220205,0.193823,0.220205,0.184905
6,LogisticRegression(max_iter=4000),bert,0.405957,0.260458,0.26126,0.260458,0.258416
7,LogisticRegression(max_iter=4000),gpt,0.400098,0.228098,0.225181,0.228098,0.219562
8,RandomForestClassifier(),tfidf,0.991211,0.221784,0.220277,0.221784,0.215404
9,RandomForestClassifier(),w2v,0.999414,0.235201,0.22679,0.235201,0.227268
