<a href="https://colab.research.google.com/gist/jacobdanovitch/285b42364083f9db89db4881b97068ff/twtc-torchhub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Uncomment to install the extra dependencies on a fresh runtime:
# !pip install regex requests sentencepiece sacremoses

In [0]:
import torch

import pandas as pd
import numpy as np

import multiprocessing as mp
from tqdm.auto import tqdm; tqdm.pandas()

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, accuracy_score

In [0]:
def tqdm_parallel(fn, vals, processes):
    """Lazily map ``fn`` over ``vals`` in a process pool, ticking a progress bar.

    Args:
        fn: Callable handed to ``Pool.imap`` for every element of ``vals``.
        vals: Sized iterable of inputs; ``len(vals)`` sets the bar's total.
        processes: Worker count passed to ``multiprocessing.Pool``.

    Yields:
        Each value produced by ``pool.imap(fn, vals)``, as it is returned.
    """
    with mp.Pool(processes=processes) as pool:
        with tqdm(total=len(vals)) as pbar:
            for result in pool.imap(fn, vals):
                # Advance the bar before handing the result to the consumer.
                pbar.update()
                yield result

In [0]:
def load_torchhub_transformer(model):
  """Load a tokenizer and model from the 'huggingface/pytorch-transformers' hub repo.

  Args:
    model: Name forwarded to ``torch.hub.load`` for both the 'tokenizer' and
      the 'model' entry points.

  Returns:
    ``(transformer, tokenizer)``; the transformer is put in eval mode and
    moved to the GPU via ``.cuda()``.
  """
  hub_repo = 'huggingface/pytorch-transformers'

  tokenizer = torch.hub.load(hub_repo, 'tokenizer', model)
  transformer = torch.hub.load(hub_repo, 'model', model)
  transformer = transformer.eval().cuda()

  return transformer, tokenizer

In [30]:
# Alternative encoders, kept for reference (each call also yields a tokenizer):
#xlnet, tokenizer = load_torchhub_transformer('xlnet-base-cased')
#distilbert, tokenizer = load_torchhub_transformer('distilbert-base-uncased')
#gpt2, tokenizer = load_torchhub_transformer('gpt2-medium')

# Fetch RoBERTa-large from the fairseq hub repo, then switch it to eval mode
# and move it to the GPU.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta = roberta.eval().cuda()

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_master


loading archive file http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz from cache at /root/.cache/torch/pytorch_fairseq/83e3a689e28e5e4696ecb0bbb05a77355444a5c8a3437e0f736d8a564e80035e.c687083d14776c1979f3f71654febb42f2bb3d9a94ff7ebdfe1ac6748dba89d2
| dictionary: 50264 types


In [0]:
def transformer_featurizer(model, tokenizer, txt):
  """Embed one text as the mean of the model's token-level outputs.

  Args:
    model: ``torch.nn.Module`` whose call result is indexable; element 0 is
      taken as the token features, with the batch dimension first.
    tokenizer: Object exposing ``encode(txt)`` that returns a sequence of
      token ids.
    txt: Text to embed.

  Returns:
    ``numpy.ndarray`` holding the token features averaged over axis 0 after
    the batch dimension is squeezed out (a ``(dim,)`` vector when the model
    emits ``(1, seq, dim)`` features).
  """
  token_ids = tokenizer.encode(txt)[:512]  # keep at most 512 tokens

  # Run on whatever device the model lives on instead of assuming 'cuda:0'.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')
  batch = torch.tensor([token_ids], device=device)

  # Inference only: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    token_features = model(batch)[0]

  token_features = token_features.squeeze(0).cpu().detach().numpy()
  return token_features.mean(axis=0)


def extract_transformer_features(model, tokenizer, report):
  """Embed every entry of ``report`` with ``transformer_featurizer``.

  Args:
    model: Model forwarded to ``transformer_featurizer``.
    tokenizer: Tokenizer forwarded to ``transformer_featurizer``.
    report: Object providing ``progress_apply`` (e.g. a pandas Series once
      ``tqdm.pandas()`` has been called).

  Returns:
    ``np.matrix`` built from the list of per-entry feature vectors.
  """
  def embed(text):
    return transformer_featurizer(model, tokenizer, text)

  vectors = report.progress_apply(embed).values.tolist()
  return np.matrix(vectors)

In [0]:
def roberta_encoder(x):
  """Encode ``x`` with the global ``roberta`` model, keeping the first 512 entries."""
  encoded = roberta.encode(x)
  return encoded[:512]

def roberta_featurizer(tokens):
    """Mean-pool the global ``roberta`` model's features for one token sequence.

    Args:
        tokens: Encoded sequence accepted by ``roberta.extract_features``
            (as produced by ``roberta_encoder``).

    Returns:
        ``numpy.ndarray`` with the extracted features averaged over axis 0
        after the leading dimension is squeezed out.
    """
    # Inference only: disable autograd so no graph is built for each
    # forward pass through the model.
    with torch.no_grad():
        token_features = roberta.extract_features(tokens)

    token_features = token_features.squeeze(0).cpu().detach().numpy()
    return token_features.mean(axis=0)

def extract_roberta_features(report, processes=8):
  """Encode and embed every entry of ``report`` with the global RoBERTa model.

  Encoding runs in a process pool through ``tqdm_parallel``; featurization
  runs in this process, one entry at a time, as encoded sequences arrive.

  Args:
    report: Sized iterable of texts (e.g. a pandas Series).
    processes: Number of worker processes used for encoding. Defaults to 8,
      the value that was previously hard-coded.

  Returns:
    ``np.matrix`` with one row of features per entry of ``report``.
  """
  token_stream = tqdm_parallel(roberta_encoder, report, processes=processes)
  rows = [roberta_featurizer(tokens) for tokens in token_stream]

  # np.matrix is kept so existing callers see the same return type.
  return np.matrix(rows)

In [7]:
# Read the preprocessed data and keep only the report text and its label.
df = (
    pd.read_csv('preprocessed.csv')
    .loc[:, ['report', 'label']]
    .copy()
)

print(df.shape)
df.head()

(5824, 2)


Unnamed: 0,report,label
0,PERSON is a Level NUMBER sex offender and woul...,0
1,PERSON made headlines for all the wrong reason...,0
2,"The ORGANIZATION have acquired PERSON twice, f...",1
3,Signed for an above-slot $NUMBER million as a ...,1
4,"It often takes time for those high-ceilinged, ...",0


In [38]:
#X = extract_transformer_features(gpt2, tokenizer, df.report)
#X = TfidfVectorizer().fit_transform(df.report)

# The extractor returns an np.matrix; convert it to a plain ndarray, which is
# the array type scikit-learn estimators expect (recent releases reject
# np.matrix input).
X = np.asarray(extract_roberta_features(df.report))

y = df['label']

X.shape, y.shape

((5824, 9804), (5824,))

In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.shape

(4659, 9804)

In [40]:
# Linear SVM baseline on the extracted features. random_state pins the
# solver's internal shuffling so repeated runs produce the same model.
clf = LinearSVC(max_iter=2000, random_state=0)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
test_data = (y_test, preds)  # (y_true, y_pred), shared by every metric below

print(classification_report(*test_data))

print(accuracy_score(*test_data))
print(f1_score(*test_data))
print(balanced_accuracy_score(*test_data))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       921
           1       0.64      0.36      0.47       244

    accuracy                           0.82      1165
   macro avg       0.75      0.66      0.68      1165
weighted avg       0.81      0.82      0.81      1165

0.8248927038626609
0.46596858638743455
0.6557755290934657
