In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [2]:
!pip install fasttext
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import os
import fasttext.util
import nltk
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import random

import warnings
warnings.filterwarnings('ignore')

import transformers
from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [4]:
colab = True

if colab:
    df = pd.read_csv("/content/gdrive/MyDrive/Metodata/training_data_ra_only.csv")
else:
    df = pd.read_csv("../data/training_data_ra_only.csv")

In [5]:
df.columns

Index(['id', 'pubdatetime', 'publication_name', 'title_h1', 'text_200',
       'relevant', 'solution_frame', 'problem_frame', 'title_h2',
       'articleHead', 'text', 'text_lead', 'text_body', 'publication_type',
       'publication_edition', 'wordCount', 'country', 'state', 'city',
       'matches', 'matches_count'],
      dtype='object')

In [6]:
df.isna().sum() / len(df)

id                     0.000000
pubdatetime            0.000000
publication_name       0.005036
title_h1               0.004443
text_200               0.000000
relevant               0.000000
solution_frame         0.000000
problem_frame          0.000000
title_h2               0.939870
articleHead            0.534656
text                   0.267773
text_lead              0.299171
text_body              0.267773
publication_type       0.267773
publication_edition    0.869964
wordCount              0.267773
country                0.390995
state                  0.578791
city                   0.678910
matches                0.267773
matches_count          0.267773
dtype: float64

In [7]:
df.drop(["title_h2", "articleHead", "publication_edition", "solution_frame",\
         "problem_frame", "state", "city", "matches", "matches_count"], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,id,pubdatetime,publication_name,title_h1,text_200,relevant,text,text_lead,text_body,publication_type,wordCount,country
0,804124,2021-05-10T00:00:00Z,The Hill,The Hill's 12:30 Report - Presented by Faceboo...,Presented by Facebook To view past editions of...,0,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions o...,Web Publication;WebLinks,2031.0,UNITED STATES
1,554905,2020-10-30T00:00:00Z,The Arizona Republic (Phoenix),'60 Minutes' interviews stark study in contrasts,Having seen both the unedited footage of Presi...,0,Having seen both the unedited footage of Presi...,Having seen both the unedited footage of Presi...,This is not a political observation. It's a te...,Newspaper;Newspapers,734.0,
2,798375,2021-01-05T00:00:00Z,Newstex Blogs,Is There a Case for Principled Populism From t...,"\r\n\r\nJan 05, 2021( Conservative Daily News:...",0,"\n\nJan 05, 2021( Conservative Daily News: htt...","Jan 05, 2021( Conservative Daily News:","In the past 10 years, we have seen a new entra...",Web Blog;Blogs,1689.0,UNITED STATES / CHINA / EUROPE / HUNGARY
3,691445,2020-11-10T00:00:00Z,Canadian Press,Election breathes new life into false 'dead vo...,As President Donald Trump continued to assert ...,0,As President Donald Trump continued to assert ...,As President Donald Trump continued to assert ...,The false claim that deceased voters cast vote...,Newswire;Newswires & Press Releases,886.0,
4,490404,2020-10-07T00:00:00Z,Tampa Bay Times,Viewer's Guide: Virus response on stage with P...,Mike Pence and Kamala Harris do not have a tou...,0,Mike Pence and Kamala Harris do not have a tou...,Mike Pence and Kamala Harris do not have a tou...,The 90-minute debate will be divided into nine...,Newspaper;Newspapers,1062.0,UNITED STATES


In [9]:
df[df.text.notna()].isna().sum() / len(df[df.text.notna()])
df = df[df.text.notna()]
df = df[df.relevant != 99]
df = df[df.title_h1.notna()]
df = df[df.text_lead.notna()]

In [10]:
!cp /content/gdrive/MyDrive/Metodata/preprocessing.py /content/
!cp /content/gdrive/MyDrive/Metodata/requirements.txt /content/

!pip install -r requirements.txt
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
!cp /content/gdrive/MyDrive/Metodata/contraction_map.json /content/

from preprocessing import Preprocessor
preprocessor = Preprocessor()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
ddf = preprocessor.perform_clean_lemmatize_tokenize(df, "title_h1")
df = preprocessor.perform_clean_lemmatize_tokenize(df, "text")
df = preprocessor.perform_clean_lemmatize_tokenize(df, "text_lead")

100%|██████████| 2354/2354 [00:00<00:00, 3735.54it/s]
100%|██████████| 2354/2354 [00:02<00:00, 1083.69it/s]
100%|██████████| 2354/2354 [00:34<00:00, 68.86it/s] 
100%|██████████| 2354/2354 [00:04<00:00, 471.46it/s]
100%|██████████| 2354/2354 [00:01<00:00, 1215.61it/s]
100%|██████████| 2354/2354 [00:00<00:00, 5221.57it/s]


In [13]:
df.head()

Unnamed: 0,id,pubdatetime,publication_name,title_h1,text_200,relevant,text,text_lead,text_body,publication_type,...,country,cleaned_title_h1,lemmatized_title_h1,tokens_title_h1,cleaned_text,lemmatized_text,tokens_text,cleaned_text_lead,lemmatized_text_lead,tokens_text_lead
0,804124,2021-05-10T00:00:00Z,The Hill,The Hill's 12:30 Report - Presented by Faceboo...,Presented by Facebook To view past editions of...,0,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions o...,Web Publication;WebLinks,...,UNITED STATES,hills report presented facebook biden reverses...,hill report present facebook biden reverse tru...,"[hill, report, present, facebook, biden, rever...",presented facebook view past editions hills re...,present facebook view past editions hill repor...,"[present, facebook, view, past, editions, hill...",presented facebook view past editions hills re...,present facebook view past editions hill repor...,"[present, facebook, view, past, editions, hill..."
1,554905,2020-10-30T00:00:00Z,The Arizona Republic (Phoenix),'60 Minutes' interviews stark study in contrasts,Having seen both the unedited footage of Presi...,0,Having seen both the unedited footage of Presi...,Having seen both the unedited footage of Presi...,This is not a political observation. It's a te...,Newspaper;Newspapers,...,,minutes interviews stark study contrasts,minutes interview stark study contrast,"[minutes, interview, stark, study, contrast]",seen unedited footage president donald trumps ...,see unedited footage president donald trump in...,"[see, unedited, footage, president, donald, tr...",seen unedited footage president donald trumps ...,see unedited footage president donald trump in...,"[see, unedited, footage, president, donald, tr..."
2,798375,2021-01-05T00:00:00Z,Newstex Blogs,Is There a Case for Principled Populism From t...,"\r\n\r\nJan 05, 2021( Conservative Daily News:...",0,"\n\nJan 05, 2021( Conservative Daily News: htt...","Jan 05, 2021( Conservative Daily News:","In the past 10 years, we have seen a new entra...",Web Blog;Blogs,...,UNITED STATES / CHINA / EUROPE / HUNGARY,case principled populism gop,case principled populism gop,"[case, principled, populism, gop]",jan conservative daily news delivered newstex ...,jan conservative daily news deliver newstex po...,"[jan, conservative, daily, news, deliver, news...",jan conservative daily news,jan conservative daily news,"[jan, conservative, daily, news]"
3,691445,2020-11-10T00:00:00Z,Canadian Press,Election breathes new life into false 'dead vo...,As President Donald Trump continued to assert ...,0,As President Donald Trump continued to assert ...,As President Donald Trump continued to assert ...,The false claim that deceased voters cast vote...,Newswire;Newswires & Press Releases,...,,election breathes new life false dead voter cl...,election breathe new life false dead voter claim,"[election, breathe, new, life, false, dead, vo...",president donald trump continued assert withou...,president donald trump continue assert without...,"[president, donald, trump, continue, assert, w...",president donald trump continued assert withou...,president donald trump continue assert without...,"[president, donald, trump, continue, assert, w..."
4,490404,2020-10-07T00:00:00Z,Tampa Bay Times,Viewer's Guide: Virus response on stage with P...,Mike Pence and Kamala Harris do not have a tou...,0,Mike Pence and Kamala Harris do not have a tou...,Mike Pence and Kamala Harris do not have a tou...,The 90-minute debate will be divided into nine...,Newspaper;Newspapers,...,UNITED STATES,viewers guide virus response stage pence harris,viewers guide virus response stage pence harris,"[viewers, guide, virus, response, stage, pence...",mike pence kamala harris tough act followthe v...,mike pence kamala harris tough act followthe v...,"[mike, pence, kamala, harris, tough, act, foll...",mike pence kamala harris tough act followthe v...,mike pence kamala harris tough act followthe v...,"[mike, pence, kamala, harris, tough, act, foll..."


### ----------------------------- train test split

In [14]:
RANDOM_STATE = 234
df_train, df_test = train_test_split(df, test_size=0.25, shuffle=True, random_state=RANDOM_STATE)

### ----------------------------- Feature Engineering

In [15]:
def augment_data(data: pd.Series, y: pd.Series, mode="all"):
    if mode=="all":
        raw = data[y==1]
        lens_dist = raw.apply(lambda x: len(x.split()))
        num_new = sum(y==0) - sum(y==1)
        words = raw.explode()
        new_data = pd.Series([" ".join(np.random.choice(words, size=int(np.random.choice(lens_dist)))) for _ in range(num_new)])
        new_y = pd.Series([1 for _ in range(num_new)])
        data_all = pd.concat([data, new_data])
        y_all = pd.concat([y, new_y])
        temp_df = pd.DataFrame()
        temp_df["data"] = data_all
        temp_df["y"] = y_all
        temp_df = temp_df.sample(frac=1)
        return temp_df.data, temp_df.y
    
    if mode=="most":
        raw = data[y==1]
        lens_dist = raw.apply(lambda x: len(x.split()))
        num_new = sum(y==0) - sum(y==1)
        words = raw.apply(lambda x: x.split()).explode().value_counts()[:20]
        
        def get_new_title():
            tokens = [random.choices(list(words.index), weights=list(words))[0] for _ in range(int(np.random.choice(lens_dist)))]
            return " ".join(tokens)
        
        new_data = pd.Series([get_new_title() for _ in range(num_new)])
        new_y = pd.Series([1 for _ in range(num_new)])
        data_all = pd.concat([data, new_data])
        y_all = pd.concat([y, new_y])
        temp_df = pd.DataFrame()
        temp_df["data"] = data_all
        temp_df["y"] = y_all
        temp_df = temp_df.sample(frac=1)
        return temp_df.data, temp_df.y
        
    raise Exception("mode not specified")

In [16]:
def get_whole_text(df, data_augmentation=True):
    data, y = df.tokens_text.apply(lambda x: " ".join(x)), df.relevant
    if data_augmentation:
        data, y = augment_data(data, y)
        return data, y
    return data, y

def get_title(df, is_final_feature, data_augmentation=True):
    if is_final_feature:
        temp_df = df[["tokens_title_h1", "relevant"]]
        temp_df["titles"] = df.tokens_title_h1.apply(lambda x: " ".join(x))
        temp_df = temp_df[temp_df.titles.notna()]
        temp_df = temp_df[temp_df.titles.apply(len) > 2]
        data, y = temp_df.titles, temp_df.relevant
        if data_augmentation:
            data, y = augment_data(data, y, mode="most")
            return data, y
        return data, y ## indexes rearenged
        
    else: 
        titles = df.tokens_title_h1.apply(lambda x: " ".join(x))
        return titles, df.relevant  ## original indexes
    
def get_title_plus_first_paragraph(df, data_augmentation=True):
    title, y = get_title(df, is_final_feature=False)
    title = title.apply(lambda x: f"{x} ")
    
    text_lead = df.tokens_text_lead.apply(lambda x: " ".join(x))
    data, y = title.str.cat(text_lead), y
    if data_augmentation:
        data, y = augment_data(data, y)
        return data, y
    return data, y

def get_x_first_sentence(text: str, num: int):
    sentences = text.split(".")
    if num == 1:
        for sen in sentences:
            if len(sen.split(" ")) > 1:
                return sen
    else:
        sens = []
        for sen in sentences:
            if len(sen.split(" ")) > 1:
                sens.append(sen)
                if len(sens) == num:
                    return sens
        return sens

def get_title_plus_first_sentence(df, data_augmentation=True):
    title, y = get_title(df, is_final_feature=False)
    title = title.apply(lambda x: f"{x} ")
    first_sentences = df.text.apply(lambda x: get_x_first_sentence(x, num=1))
    first_sentences = first_sentences.apply(lambda x: preprocessor.clean_query(x)).apply(lambda x: " ".join(x))
    data, y = title.str.cat(first_sentences), y
    if data_augmentation:
        data, y = augment_data(data, y)
        return data, y
    return data, y

def get_title_plus_x_sentences(df, num_sentences, data_augmentation=True):
    title, y = get_title(df, is_final_feature=False)
    title = title.apply(lambda x: f"{x} ")
    sentences = df.text.apply(lambda x: get_x_first_sentence(x, num=num_sentences))
    sentences = sentences.apply(lambda x: " ".join(x))
    sentences = sentences.apply(lambda x: preprocessor.clean_query(x)).apply(lambda x: " ".join(x))
    data, y = title.str.cat(sentences), y
    if data_augmentation:
        data, y = augment_data(data, y)
        return data, y
    return data, y
    
def get_text_paragraphs(text: str):
    initial_paragraphs = text.split("\n")
    final_paragraphs = []
    for par in initial_paragraphs:
        if len(par.split(" ")) > 3:
            final_paragraphs.append(par)
    return final_paragraphs

def get_title_plus_first_sentence_each_paragraph(df, data_augmentation=True):
    title, y = get_title(df, is_final_feature=False)
    title = title.apply(lambda x: f"{x} ")
    
    paragraphs = df.text.apply(get_text_paragraphs)
    paragraphs_first_sentences = paragraphs.apply(lambda x: [get_x_first_sentence(par, num=1) for par in x])
    paragraphs_first_sentences = paragraphs_first_sentences.apply(lambda x: " ".join(x))
    paragraphs_first_sentences = paragraphs_first_sentences.apply(lambda x: preprocessor.clean_query(x)).apply(lambda x: " ".join(x))
    data, y = title.str.cat(paragraphs_first_sentences), y
    if data_augmentation:
        data, y = augment_data(data, y)
        return data, y
    return data, y

In [17]:
classification_models = [GradientBoostingClassifier, svm.SVC, LogisticRegression] # RandomForestClassifier]
features = [
            {"name": "title", "function": get_title},
            {"name": "title_plus_first_paragraph", "function": get_title_plus_first_paragraph},
            {"name": "title_plus_first_sentence", "function": get_title_plus_first_sentence},
            {"name": "title_plus_5_sentences", "function": get_title_plus_x_sentences},
            {"name": "title_plus_10_sentences", "function": get_title_plus_x_sentences},
            
            {"name": "title_plus_first_sentence_each_paragraph",
             "function": get_title_plus_first_sentence_each_paragraph},
            {"name": "whole_text", "function": get_whole_text}
]

### ----------------------------- Embedding with all-MiniLM-L6-v2 model - Classification with classic algorithms

In [18]:
# emb_model = SentenceTransformer('./transformer-model')
emb_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

def embed(data: pd.Series):
    embeddings = data.apply(lambda x: emb_model.encode(str(x)))
    embeddings_arr = np.zeros((len(data), 768))
    for row, emb in enumerate(embeddings):
        embeddings_arr[row, :] = emb
        
    return embeddings_arr

def train_model(model, training_data, y_train):
    model = model.fit(training_data, y_train)
    return model

def test_model(model, test_data, y_test):
    y_pred = model.predict(test_data)
    return classification_report(y_test, y_pred)

def get_data(feature_combination_name: str, function, df, data_augmentation: bool):
    if feature_combination_name == "title_plus_5_sentences":
        X, y = function(df, 5, data_augmentation)
    elif feature_combination_name == "title_plus_10_sentences":
        X, y = function(df, 10, data_augmentation)
    elif feature_combination_name == "title":
        X, y = function(df, is_final_feature=True, data_augmentation=data_augmentation)
    else:
        X, y = function(df, data_augmentation)
        
    return X, y

In [257]:
for f_dict in features:
    name = f_dict.get("name")
    print(name)
    func = f_dict.get("function")
    data, y = get_data(name, func, df_train, data_augmentation=True)
    print(all(data.apply(type) == str))
    print(data.apply(len).min())
    print(data.shape, y.shape)
    print(data.isna().sum())

title
True
3
(3160,) (3160,)
0
title_plus_first_paragraph
True
25
(3160,) (3160,)
0
title_plus_first_sentence
True
23
(3160,) (3160,)
0
title_plus_5_sentences
True
106
(3160,) (3160,)
0
title_plus_10_sentences
True
197
(3160,) (3160,)
0


In [270]:
df_train.shape, df_test.shape, df_test.relevant.sum()

((1765, 21), (589, 21), 54)

### ----------------------------- Training

In [271]:
trained_models = []

for feature in features:
    print("-" * 80)
    feature_combination_name, function = feature["name"], feature["function"]
    
    print("Feature: ", feature_combination_name)
    print("Data prep")
    
    data, y = get_data(feature_combination_name, function, df_train, data_augmentation=True)
    
    print("Embedding")
    embedding = embed(data)
    
    for clf_model_class in classification_models:
        
        clf_model = clf_model_class()
        clf_model.class_weight = "balanced"
        
        print("\tModel: ", clf_model, "id: ", id(clf_model))
        print("\t\t training")
        clf_model = train_model(clf_model, embedding, y)
        
        trained_models.append({
            "feature_combination_name": feature_combination_name,
            "model_name": clf_model.__repr__(),
            "model": clf_model
        })

--------------------------------------------------------------------------------
Feature:  title
Data prep
Embedding
	Model:  GradientBoostingClassifier() id:  140022093050576
		 training
	Model:  SVC(class_weight='balanced') id:  140022087201168
		 training
	Model:  LogisticRegression(class_weight='balanced') id:  140022083286288
		 training
--------------------------------------------------------------------------------
Feature:  title_plus_first_paragraph
Data prep
Embedding
	Model:  GradientBoostingClassifier() id:  140035285471120
		 training
	Model:  SVC(class_weight='balanced') id:  140024392058640
		 training
	Model:  LogisticRegression(class_weight='balanced') id:  140024392060176
		 training
--------------------------------------------------------------------------------
Feature:  title_plus_first_sentence
Data prep
Embedding
	Model:  GradientBoostingClassifier() id:  140022095397584
		 training
	Model:  SVC(class_weight='balanced') id:  140022090343952
		 training
	Model:  L

### ----------------------------- Testing

In [272]:
reports = []

for f, feature in enumerate(features):
    
    feature_combination_name, function = feature["name"], feature["function"]
    
    data, y = get_data(feature_combination_name, function, df_test, data_augmentation=False)
        
    embedding = embed(data)
    
    for tm in trained_models:
        
        if tm.get("feature_combination_name") == feature_combination_name:
            
            print("-" * 80)
            print(feature_combination_name)
            print(tm.get("model_name"))
            
            clf_model = tm.get("model")
            print(id(clf_model))
            y_pred = clf_model.predict(embedding)
            clf_report_dict = classification_report(y, y_pred, output_dict=True)
            clf_report = classification_report(y, y_pred)
            reports.append((tm.get("model_name") + " " + feature_combination_name, clf_report_dict))

            print()
            print(clf_report)
            print()

--------------------------------------------------------------------------------
title
GradientBoostingClassifier()
140022093050576

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       534
           1       0.62      0.48      0.54        54

    accuracy                           0.93       588
   macro avg       0.78      0.73      0.75       588
weighted avg       0.92      0.93      0.92       588


--------------------------------------------------------------------------------
title
SVC(class_weight='balanced')
140022087201168

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       534
           1       0.63      0.57      0.60        54

    accuracy                           0.93       588
   macro avg       0.79      0.77      0.78       588
weighted avg       0.93      0.93      0.93       588


------------------------------------------------------------------------------

# ----------------------------------- Manual Evaluation

In [117]:
# data, y = get_data("whole_text", get_whole_text, df_train)

In [116]:
# train_emb = embed(data)

In [115]:
# train_emb.shape

In [114]:
# # clf = RandomForestClassifier(class_weight="balanced")
# clf = RandomForestClassifier()
# clf.fit(train_emb, y)

In [110]:
# data, y = get_data("whole_text", get_whole_text, df_test)

In [111]:
# test_emb = embed(data)

In [109]:
# y_pred = clf.predict(test_emb)
# print(classification_report(y, y_pred))

In [108]:
# data

In [107]:
# data[y==1]

In [112]:
# idx = 302
# print(df_test.loc[302].text)

In [113]:
# print(data.loc[302])

# ---------------------------- Deep Bert classification

In [19]:
df_train.head()

Unnamed: 0,id,pubdatetime,publication_name,title_h1,text_200,relevant,text,text_lead,text_body,publication_type,...,country,cleaned_title_h1,lemmatized_title_h1,tokens_title_h1,cleaned_text,lemmatized_text,tokens_text,cleaned_text_lead,lemmatized_text_lead,tokens_text_lead
481,757599,2021-03-04T00:00:00Z,The East Bay Times (California),Law enforcement on alert in D.C. after plot wa...,The threat appeared to be connected to a far-r...,0,The threat appeared to be connected to a far-r...,By MICHAEL BALSAMO and ASHRAF KHALIL | Associa...,There were no signs of disturbance Thursday at...,Newspaper;Newspapers,...,UNITED STATES,law enforcement alert dc plot warning us capitol,law enforcement alert dc plot warn us capitol,"[law, enforcement, alert, dc, plot, warn, us, ...",threat appeared connected farright conspiracy ...,threat appear connect farright conspiracy theo...,"[threat, appear, connect, farright, conspiracy...",michael balsamo ashraf khalil associated press...,michael balsamo ashraf khalil associate pressw...,"[michael, balsamo, ashraf, khalil, associate, ..."
1781,364903,2020-03-11T00:00:00Z,Yerepouni Daily News,Joe Biden calls for unity after big wins in Mi...,DETROIT (Reuters) - Joe Biden scored decisive ...,0,DETROIT (Reuters) - Joe Biden scored decisive ...,DETROIT (Reuters) - Joe Biden scored decisive ...,"""We share a common goal, and together we are g...",Web Publication;Web-based Publications,...,UNITED STATES,joe biden calls unity big wins michigan three ...,joe biden call unity big win michigan three state,"[joe, biden, call, unity, big, win, michigan, ...",detroit reuters joe biden scored decisive prim...,detroit reuters joe biden score decisive prima...,"[detroit, reuters, joe, biden, score, decisive...",detroit reuters joe biden scored decisive prim...,detroit reuters joe biden score decisive prima...,"[detroit, reuters, joe, biden, score, decisive..."
729,671864,2020-12-04T00:00:00Z,Hindustan Times,Covid-19: What side effects to expect from a v...,"India, Dec. 4 -- Researchers usually take year...",0,"India, Dec. 4 -- Researchers usually take year...","India, Dec. 4 -- Researchers usually take year...",Russian President Vladimir Putin has already o...,Newswire;Newspapers,...,UNITED STATES / INDIA / UNITED KINGDOM / RUSSI...,covid side effects expect vaccine shot painful...,covid side effect expect vaccine shoot painful...,"[covid, side, effect, expect, vaccine, shoot, ...",india dec researchers usually take years devel...,india dec researchers usually take years devel...,"[india, dec, researchers, usually, take, years...",india dec researchers usually take years devel...,india dec researchers usually take years devel...,"[india, dec, researchers, usually, take, years..."
1238,764132,2021-02-18T00:00:00Z,TVEyes - BBC 1 South West,Joins BBC News - 01:15 AM GMT,The latest national and international news fro...,0,The latest national and international news fro...,fighting the same battle all over the world? A...,It has bipartisan support in Australia and Goo...,Transcript;News Transcripts,...,AUSTRALIA / UNITED STATES / UNITED KINGDOM,joins bbc news gmt,join bbc news gmt,"[join, bbc, news, gmt]",latest national international news bbc news sp...,latest national international news bbc news sp...,"[latest, national, international, news, bbc, n...",fighting battle world absolutely australia pre...,fight battle world absolutely australia preced...,"[fight, battle, world, absolutely, australia, ..."
488,469699,2020-09-10T00:00:00Z,Capital FM,Uganda to License Online Posts in Fresh Assaul...,The Uganda Communications Commission (UCC) has...,1,The Uganda Communications Commission (UCC) has...,The Uganda Communications Commission (UCC) has...,"""The requirement for people to seek authorisat...",Newspaper;Web-based Publications,...,UGANDA / EASTERN AFRICA / KENYA / EUROPEAN UNI...,uganda license online posts fresh assault free...,uganda license online post fresh assault freed...,"[uganda, license, online, post, fresh, assault...",uganda communications commission ucc issued pu...,uganda communications commission ucc issue pub...,"[uganda, communications, commission, ucc, issu...",uganda communications commission ucc issued pu...,uganda communications commission ucc issue pub...,"[uganda, communications, commission, ucc, issu..."


In [20]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

'LABEL_0'

In [21]:
sample_txt = df_train.tokens_text.apply(lambda x: " ".join(x)).iloc[4]
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: uganda communications commission ucc issue public notice state anyone wish publish information online must license ahead october deadline latest blow right freedom expression uganda ahead elections follow guidelines issue june restrict public gather electoral process compliance covid prevention measure mean election campaign allow media social media platforms requirement people seek authorisation post information online retrogressive blatant violation right freedom expression access information restrictions public gather already place ugandan authorities shut vital channel people express political opinions share critical information covid say deprose muchena amnesty internationals director east southern africa freedom expression need license communications commission cite section uganda communications act among others prohibit broadcast content without broadcast licence apply law share content internet social media platforms authorities effectively criminalize right freedom 

In [22]:
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['input_ids', 'attention_mask'])

In [23]:
class ArticleDataset(Dataset):

  def __init__(self, X: pd.Series, y: pd.Series, tokenizer, max_len):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.X)
  
  def __getitem__(self, item):
    text = str(self.X.iloc[item])
    y = self.y.iloc[item]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'y': torch.tensor(y, dtype=torch.long),
    }

In [24]:
df_val, df_test = train_test_split(df_test, test_size=0.4, random_state=RANDOM_STATE)

In [25]:
df_train.shape, df_val.shape, df_test.shape

((1765, 21), (353, 21), (236, 21))

In [26]:
# X_train, y_train = get_data("whole_text", get_whole_text, df_train, data_augmentation=True)
# X_val, y_val = get_data("whole_text", get_whole_text, df_val, data_augmentation=False)
# X_test, y_test = get_data("whole_text", get_whole_text, df_test, data_augmentation=False)

X_train, y_train = get_data("title_plus_10_sentences", get_title_plus_x_sentences, df_train, data_augmentation=True)
X_val, y_val = get_data("title_plus_10_sentences", get_title_plus_x_sentences, df_val, data_augmentation=False)
X_test, y_test = get_data("title_plus_10_sentences", get_title_plus_x_sentences, df_test, data_augmentation=False)

In [27]:
def create_data_loader(X: pd.Series, y: pd.Series, tokenizer, max_len, batch_size):
    dataset = ArticleDataset(
         X=X,
         y=y,
         tokenizer=tokenizer,
         max_len=max_len)

    return DataLoader(
    dataset,
    batch_size=batch_size,
    num_workers=8)


BATCH_SIZE = 256
MAX_LEN = 128

train_data_loader = create_data_loader(X_train, y_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(X_val, y_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(X_test, y_test, tokenizer, MAX_LEN, BATCH_SIZE)

# data = next(iter(train_data_loader))
# data.keys()

In [34]:
ni = next(iter(train_data_loader))

In [35]:
print(ni['input_ids'].shape)
print(ni['attention_mask'].shape)
print(ni['y'].shape)

torch.Size([256, 128])
torch.Size([256, 128])
torch.Size([256])


In [36]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

In [37]:
# model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

class ArticleClassifier(nn.Module):

  def __init__(self, n_classes, model):
    super(ArticleClassifier, self).__init__()
    self.model = model
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.model.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask):
    pooled_output = self.model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [38]:
device = "cuda"
classifier = ArticleClassifier(n_classes=2, model=model)
classifier = classifier.to(device)

input_ids = ni['input_ids'].to(device)
attention_mask = ni['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([256, 128])
torch.Size([256, 128])


In [39]:
EPOCHS = 100

optimizer = AdamW(classifier.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

# scheduler = get_linear_schedule_with_warmup(
#   optimizer,
#   num_warmup_steps=0,
#   num_training_steps=total_steps
# )

loss_fn = nn.CrossEntropyLoss().to(device)

In [40]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0
  all_f1 = []
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["y"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs.logits, dim=1)
    loss = loss_fn(outputs.logits, targets)

    clf_report = classification_report(targets.detach().cpu().numpy(), preds.detach().cpu().numpy(), output_dict=True)
    f1 = clf_report["1"]["f1-score"]
    all_f1.append(f1)


    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses), np.mean(all_f1)

In [41]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.to(device)
  model = model.eval()

  losses = []
  correct_predictions = 0
  all_f1 = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["y"].to(device)
      

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      

      _, preds = torch.max(outputs.logits, dim=1)
      clf_report = classification_report(targets.detach().cpu().numpy(), preds.detach().cpu().numpy(), output_dict=True)
      f1 = clf_report["1"]["f1-score"]
      all_f1.append(f1)

      loss = loss_fn(outputs.logits, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses), np.mean(all_f1)

In [42]:
for f, param in enumerate(classifier.parameters()):
    thresh = 50
    if f < thresh:
        param.requires_grad = False
    else:
        param.requires_grad = True


In [43]:
%%time
from collections import defaultdict

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss, f1 = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )

  print(f'Train loss {train_loss} f1-score {f1}')

  val_acc, val_loss, f1 = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} f1-score {f1}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/100
----------


NameError: ignored

In [None]:
ldft = len(df_test)
device = "cuda"

test_acc, test_loss, f1 = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  ldft,
)

print(test_acc.item())
print(f1)