In [22]:
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import os
import fasttext.util
import nltk
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the training data CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../data/training_data_ra_only.csv")

In [4]:
df.columns

Index(['id', 'pubdatetime', 'publication_name', 'title_h1', 'text_200',
       'relevant', 'solution_frame', 'problem_frame', 'title_h2',
       'articleHead', 'text', 'text_lead', 'text_body', 'publication_type',
       'publication_edition', 'wordCount', 'country', 'state', 'city',
       'matches', 'matches_count'],
      dtype='object')

In [5]:
df.isna().sum() / len(df)

id                     0.000000
pubdatetime            0.000000
publication_name       0.005036
title_h1               0.004443
text_200               0.000000
relevant               0.000000
solution_frame         0.000000
problem_frame          0.000000
title_h2               0.939870
articleHead            0.534656
text                   0.267773
text_lead              0.299171
text_body              0.267773
publication_type       0.267773
publication_edition    0.869964
wordCount              0.267773
country                0.390995
state                  0.578791
city                   0.678910
matches                0.267773
matches_count          0.267773
dtype: float64

In [6]:
# Drop columns that the visible cells below never read. Rebinding `df` (instead of
# mutating it with inplace=True) keeps the frame returned by read_csv untouched.
unused_columns = [
    "title_h2", "articleHead", "publication_edition", "solution_frame",
    "problem_frame", "state", "city", "matches", "matches_count",
]
df = df.drop(columns=unused_columns)

In [7]:
df.head()

Unnamed: 0,id,pubdatetime,publication_name,title_h1,text_200,relevant,text,text_lead,text_body,publication_type,wordCount,country
0,804124,2021-05-10T00:00:00Z,The Hill,The Hill's 12:30 Report - Presented by Faceboo...,Presented by Facebook To view past editions of...,0,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions o...,Web Publication;WebLinks,2031.0,UNITED STATES
1,554905,2020-10-30T00:00:00Z,The Arizona Republic (Phoenix),'60 Minutes' interviews stark study in contrasts,Having seen both the unedited footage of Presi...,0,Having seen both the unedited footage of Presi...,Having seen both the unedited footage of Presi...,This is not a political observation. It's a te...,Newspaper;Newspapers,734.0,
2,798375,2021-01-05T00:00:00Z,Newstex Blogs,Is There a Case for Principled Populism From t...,"\r\n\r\nJan 05, 2021( Conservative Daily News:...",0,"\n\nJan 05, 2021( Conservative Daily News: htt...","Jan 05, 2021( Conservative Daily News:","In the past 10 years, we have seen a new entra...",Web Blog;Blogs,1689.0,UNITED STATES / CHINA / EUROPE / HUNGARY
3,691445,2020-11-10T00:00:00Z,Canadian Press,Election breathes new life into false 'dead vo...,As President Donald Trump continued to assert ...,0,As President Donald Trump continued to assert ...,As President Donald Trump continued to assert ...,The false claim that deceased voters cast vote...,Newswire;Newswires & Press Releases,886.0,
4,490404,2020-10-07T00:00:00Z,Tampa Bay Times,Viewer's Guide: Virus response on stage with P...,Mike Pence and Kamala Harris do not have a tou...,0,Mike Pence and Kamala Harris do not have a tou...,Mike Pence and Kamala Harris do not have a tou...,The 90-minute debate will be divided into nine...,Newspaper;Newspapers,1062.0,UNITED STATES


In [8]:
# Keep only rows usable for modelling: full text, title and lead must all be present.
# NOTE(review): 99 looks like a sentinel value in `relevant` (i.e. not a 0/1 label) —
# confirm its meaning with whoever produced the data.
usable_rows = (
    df.text.notna()
    & df.title_h1.notna()
    & df.text_lead.notna()
    & (df.relevant != 99)
)
df = df[usable_rows]

In [9]:
# Project-local text preprocessing helper; the cells below use it to clean, lemmatize
# and tokenize text columns (perform_clean_lemmatize_tokenize) and to clean raw strings
# (clean_query).
from preprocessing.preprocessing import Preprocessor
preprocessor = Preprocessor()

In [10]:
# Run the clean -> lemmatize -> tokenize pipeline on each text column in turn; every
# call returns the frame extended with cleaned_/lemmatized_/tokens_ columns.
for text_column in ("title_h1", "text", "text_lead"):
    df = preprocessor.perform_clean_lemmatize_tokenize(df, text_column)

100%|█████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:00<00:00, 3672.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:01<00:00, 1529.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:18<00:00, 126.86it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:05<00:00, 439.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:01<00:00, 1463.55it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 2354/2354 [00:00<00:00, 4809.75it/s]


In [11]:
df.head()

Unnamed: 0,id,pubdatetime,publication_name,title_h1,text_200,relevant,text,text_lead,text_body,publication_type,...,country,cleaned_title_h1,lemmatized_title_h1,tokens_title_h1,cleaned_text,lemmatized_text,tokens_text,cleaned_text_lead,lemmatized_text_lead,tokens_text_lead
0,804124,2021-05-10T00:00:00Z,The Hill,The Hill's 12:30 Report - Presented by Faceboo...,Presented by Facebook To view past editions of...,0,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions of...,Presented by Facebook To view past editions o...,Web Publication;WebLinks,...,UNITED STATES,hills report presented facebook biden reverses...,hill report present facebook biden reverse tru...,"[hill, report, present, facebook, biden, rever...",presented facebook view past editions hills re...,present facebook view past editions hill repor...,"[present, facebook, view, past, editions, hill...",presented facebook view past editions hills re...,present facebook view past editions hill repor...,"[present, facebook, view, past, editions, hill..."
1,554905,2020-10-30T00:00:00Z,The Arizona Republic (Phoenix),'60 Minutes' interviews stark study in contrasts,Having seen both the unedited footage of Presi...,0,Having seen both the unedited footage of Presi...,Having seen both the unedited footage of Presi...,This is not a political observation. It's a te...,Newspaper;Newspapers,...,,minutes interviews stark study contrasts,minutes interview stark study contrast,"[minutes, interview, stark, study, contrast]",seen unedited footage president donald trumps ...,see unedited footage president donald trump in...,"[see, unedited, footage, president, donald, tr...",seen unedited footage president donald trumps ...,see unedited footage president donald trump in...,"[see, unedited, footage, president, donald, tr..."
2,798375,2021-01-05T00:00:00Z,Newstex Blogs,Is There a Case for Principled Populism From t...,"\r\n\r\nJan 05, 2021( Conservative Daily News:...",0,"\n\nJan 05, 2021( Conservative Daily News: htt...","Jan 05, 2021( Conservative Daily News:","In the past 10 years, we have seen a new entra...",Web Blog;Blogs,...,UNITED STATES / CHINA / EUROPE / HUNGARY,case principled populism gop,case principled populism gop,"[case, principled, populism, gop]",jan conservative daily news delivered newstex ...,jan conservative daily news deliver newstex po...,"[jan, conservative, daily, news, deliver, news...",jan conservative daily news,jan conservative daily news,"[jan, conservative, daily, news]"
3,691445,2020-11-10T00:00:00Z,Canadian Press,Election breathes new life into false 'dead vo...,As President Donald Trump continued to assert ...,0,As President Donald Trump continued to assert ...,As President Donald Trump continued to assert ...,The false claim that deceased voters cast vote...,Newswire;Newswires & Press Releases,...,,election breathes new life false dead voter cl...,election breathe new life false dead voter claim,"[election, breathe, new, life, false, dead, vo...",president donald trump continued assert withou...,president donald trump continue assert without...,"[president, donald, trump, continue, assert, w...",president donald trump continued assert withou...,president donald trump continue assert without...,"[president, donald, trump, continue, assert, w..."
4,490404,2020-10-07T00:00:00Z,Tampa Bay Times,Viewer's Guide: Virus response on stage with P...,Mike Pence and Kamala Harris do not have a tou...,0,Mike Pence and Kamala Harris do not have a tou...,Mike Pence and Kamala Harris do not have a tou...,The 90-minute debate will be divided into nine...,Newspaper;Newspapers,...,UNITED STATES,viewers guide virus response stage pence harris,viewers guide virus response stage pence harris,"[viewers, guide, virus, response, stage, pence...",mike pence kamala harris tough act followthe v...,mike pence kamala harris tough act followthe v...,"[mike, pence, kamala, harris, tough, act, foll...",mike pence kamala harris tough act followthe v...,mike pence kamala harris tough act followthe v...,"[mike, pence, kamala, harris, tough, act, foll..."


### ----------------------------- train test split

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on `relevant` although the classes are heavily
# imbalanced (the reports below show 54 positives in the test split) — consider
# stratify=df.relevant.
df_train, df_test = train_test_split(df, test_size=0.25, shuffle=True, random_state=234)

### ----------------------------- Data Augmentation

In [78]:
def augment_data(data: pd.Series, y: pd.Series, mode="all", random_state=None):
    """Oversample the positive class (``y == 1``) with synthetic bag-of-words texts.

    Enough synthetic positives are generated to match the number of negatives
    (``y == 0``). Each synthetic text has a length (in words) drawn from the
    lengths of the real positive texts, and its words are drawn with replacement,
    weighted by how often they occur in the positive texts.

    Args:
        data: Whitespace-tokenised texts, index-aligned with ``y``.
        y: Labels; 1 marks the class to oversample, 0 the majority class.
        mode: ``"all"`` samples from every word of the positive texts;
            ``"most"`` samples only from the 20 most frequent words.
        random_state: Optional integer seed. When given, both the generated
            texts and the final shuffle are reproducible. When ``None`` the
            global NumPy random state is used.

    Returns:
        ``(texts, labels)``: two shuffled Series (named ``"data"`` and ``"y"``)
        that share one fresh, unique index. Original index labels are not kept,
        because synthetic rows have no counterpart in the source frame.

    Raises:
        ValueError: If ``mode`` is unknown, or if new samples are required but
            there is no positive text to draw words from.
    """
    if mode not in ("all", "most"):
        raise ValueError(f"mode not specified: expected 'all' or 'most', got {mode!r}")

    # Without a seed, keep using NumPy's global generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    positives = data[y == 1]
    num_new = int((y == 0).sum()) - int((y == 1).sum())

    synthetic = []
    if num_new > 0:
        if positives.empty:
            raise ValueError("cannot augment: no samples with y == 1 to draw words from")

        # Split into tokens BEFORE exploding: explode() on plain strings is a no-op,
        # which would make whole documents be sampled as if they were single words.
        token_lists = positives.apply(lambda text: text.split())
        lengths = token_lists.apply(len).to_numpy()
        counts = token_lists.explode().dropna().value_counts()
        if mode == "most":
            counts = counts.head(20)  # value_counts() is sorted by frequency
        vocab = counts.index.to_numpy()
        probs = counts.to_numpy(dtype=float)
        if probs.size:
            probs = probs / probs.sum()

        def _new_text():
            size = int(rng.choice(lengths))
            if size == 0 or vocab.size == 0:
                return ""
            return " ".join(rng.choice(vocab, size=size, p=probs))

        synthetic = [_new_text() for _ in range(num_new)]

    # Pair texts and labels positionally so duplicate index labels cannot misalign them.
    combined = pd.DataFrame({
        "data": data.tolist() + synthetic,
        "y": y.tolist() + [1] * len(synthetic),
    })
    combined = combined.sample(frac=1, random_state=None if random_state is None else rng)
    return combined["data"], combined["y"]

### ----------------------------- Feature Engineering

In [79]:
def get_whole_text(df, data_augmentation=True):
    """Build the "whole text" feature: each article's text tokens joined by spaces.

    Args:
        df: Frame with a ``tokens_text`` column (iterables of tokens) and a
            ``relevant`` label column.
        data_augmentation: When true, the texts and labels are passed through
            ``augment_data`` (default mode) before being returned.

    Returns:
        ``(texts, labels)`` as two Series.
    """
    texts = df.tokens_text.apply(" ".join)
    labels = df.relevant
    if data_augmentation:
        texts, labels = augment_data(texts, labels)
    return texts, labels

def get_title(df, is_final_feature, data_augmentation=True):
    """Build the title feature from the ``tokens_title_h1`` column.

    Args:
        df: Frame with ``tokens_title_h1`` (iterables of tokens) and ``relevant``.
        is_final_feature: When true, the title is the model input itself: titles
            of two characters or fewer are dropped and augmentation may be
            applied. When false, the titles are returned for every row with the
            original index, so callers can concatenate them with other text.
        data_augmentation: Only used when ``is_final_feature`` is true; passes
            the filtered titles through ``augment_data(..., mode="most")``.

    Returns:
        ``(titles, labels)`` as two Series. With augmentation the index no longer
        corresponds to ``df``.
    """
    titles = df.tokens_title_h1.apply(lambda tokens: " ".join(tokens))
    if not is_final_feature:
        return titles, df.relevant  # original index preserved

    # Filter with a boolean mask instead of writing a new column into a slice of
    # `df`, which triggers pandas' SettingWithCopy warning. Joined titles are always
    # strings, so no separate missing-value check is needed.
    long_enough = titles.apply(len) > 2
    data = titles[long_enough].rename("titles")
    y = df.relevant[long_enough]
    if data_augmentation:
        data, y = augment_data(data, y, mode="most")
    return data, y
    
def get_title_plus_first_paragraph(df, data_augmentation=True):
    """Build the "title + lead paragraph" feature.

    The title tokens and the ``tokens_text_lead`` tokens are each joined by
    spaces and concatenated with a single space in between.

    Args:
        df: Frame with ``tokens_title_h1``, ``tokens_text_lead`` and ``relevant``.
        data_augmentation: When true, the result is passed through
            ``augment_data`` (default mode).

    Returns:
        ``(texts, labels)`` as two Series.
    """
    titles, labels = get_title(df, is_final_feature=False)
    padded_titles = titles.apply(lambda title: f"{title} ")
    lead_text = df.tokens_text_lead.apply(lambda tokens: " ".join(tokens))

    combined = padded_titles.str.cat(lead_text)
    if data_augmentation:
        combined, labels = augment_data(combined, labels)
    return combined, labels

def get_x_first_sentence(text: str, num: int):
    """Return the first ``num`` "real" sentences of ``text``.

    Sentences are the pieces obtained by splitting on ``"."``. A piece counts
    only if splitting it on a single space yields more than one part, so bare
    one-word fragments and empty pieces are skipped. Pieces are returned as-is
    (not stripped).

    Args:
        text: Raw text.
        num: Number of sentences wanted.

    Returns:
        For ``num == 1``: the first qualifying sentence as a string, or ``""``
        when there is none (previously ``None``, which broke callers that clean
        or join the result).
        Otherwise: a list with up to ``num`` qualifying sentences. For
        ``num <= 0`` every qualifying sentence is returned, as before.
    """
    qualifying = [piece for piece in text.split(".") if len(piece.split(" ")) > 1]
    if num == 1:
        return qualifying[0] if qualifying else ""
    if num > 0:
        return qualifying[:num]
    return qualifying

def get_title_plus_first_sentence(df, data_augmentation=True):
    """Build the "title + first sentence" feature.

    The first qualifying sentence of the raw ``text`` column is cleaned with
    ``preprocessor.clean_query`` (whose result is joined by spaces, so it is
    presumably a token list — verify against the Preprocessor) and appended to
    the space-joined title tokens.

    Args:
        df: Frame with ``tokens_title_h1``, ``text`` and ``relevant``.
        data_augmentation: When true, the result is passed through
            ``augment_data`` (default mode).

    Returns:
        ``(texts, labels)`` as two Series.
    """
    titles, labels = get_title(df, is_final_feature=False)
    padded_titles = titles.apply(lambda title: f"{title} ")

    def _cleaned_first_sentence(raw_text):
        sentence = get_x_first_sentence(raw_text, num=1)
        return " ".join(preprocessor.clean_query(sentence))

    combined = padded_titles.str.cat(df.text.apply(_cleaned_first_sentence))
    if data_augmentation:
        combined, labels = augment_data(combined, labels)
    return combined, labels

def get_title_plus_x_sentences(df, num_sentences, data_augmentation=True):
    """Build the "title + first N sentences" feature.

    The first ``num_sentences`` qualifying sentences of the raw ``text`` column
    are joined, cleaned with ``preprocessor.clean_query`` (its result is joined
    by spaces, so it is presumably a token list — verify against the
    Preprocessor) and appended to the space-joined title tokens.

    Args:
        df: Frame with ``tokens_title_h1``, ``text`` and ``relevant``.
        num_sentences: How many leading sentences to include.
        data_augmentation: When true, the result is passed through
            ``augment_data`` (default mode).

    Returns:
        ``(texts, labels)`` as two Series.
    """
    title, y = get_title(df, is_final_feature=False)
    title = title.apply(lambda x: f"{x} ")

    def _leading_sentences(raw_text):
        picked = get_x_first_sentence(raw_text, num=num_sentences)
        if picked is None:
            return ""
        # For num_sentences == 1 the helper returns a single string; joining a
        # string would insert a space between every character.
        if isinstance(picked, str):
            return picked
        return " ".join(picked)

    sentences = df.text.apply(_leading_sentences)
    sentences = sentences.apply(lambda x: preprocessor.clean_query(x)).apply(lambda x: " ".join(x))
    data = title.str.cat(sentences)
    if data_augmentation:
        data, y = augment_data(data, y)
    return data, y
    
def get_text_paragraphs(text: str):
    """Split ``text`` on newlines and keep the paragraphs worth reading.

    A paragraph is kept when splitting it on a single space yields more than
    three parts; shorter lines (blank lines, bylines, etc.) are discarded.

    Returns:
        List of the kept paragraphs, in their original order and unmodified.
    """
    return [
        paragraph
        for paragraph in text.split("\n")
        if len(paragraph.split(" ")) > 3
    ]

def get_title_plus_first_sentence_each_paragraph(df, data_augmentation=True):
    """Build the "title + first sentence of every paragraph" feature.

    For each article the raw ``text`` is split into paragraphs, the first
    qualifying sentence of each paragraph is taken, the sentences are joined
    and cleaned with ``preprocessor.clean_query`` (its result is joined by
    spaces), and the outcome is appended to the space-joined title tokens.

    Args:
        df: Frame with ``tokens_title_h1``, ``text`` and ``relevant``.
        data_augmentation: When true, the result is passed through
            ``augment_data`` (default mode).

    Returns:
        ``(texts, labels)`` as two Series.
    """
    titles, labels = get_title(df, is_final_feature=False)
    padded_titles = titles.apply(lambda title: f"{title} ")

    def _paragraph_openers(raw_text):
        openers = [
            get_x_first_sentence(paragraph, num=1)
            for paragraph in get_text_paragraphs(raw_text)
        ]
        return " ".join(preprocessor.clean_query(" ".join(openers)))

    combined = padded_titles.str.cat(df.text.apply(_paragraph_openers))
    if data_augmentation:
        combined, labels = augment_data(combined, labels)
    return combined, labels

In [80]:
# Estimator classes (not instances): a fresh instance is created for every
# feature set when the models are trained.
classification_models = [GradientBoostingClassifier, svm.SVC, LogisticRegression, RandomForestClassifier]
# Feature sets to compare. get_data() dispatches on "name" to supply the extra
# arguments some builders need (the sentence count for the two *_sentences entries,
# which share one builder, and is_final_feature for "title").
features = [
            {"name": "title", "function": get_title},
            {"name": "title_plus_first_paragraph", "function": get_title_plus_first_paragraph},
            {"name": "title_plus_first_sentence", "function": get_title_plus_first_sentence},
            {"name": "title_plus_5_sentences", "function": get_title_plus_x_sentences},
            {"name": "title_plus_10_sentences", "function": get_title_plus_x_sentences},
            
            {"name": "title_plus_first_sentence_each_paragraph",
             "function": get_title_plus_first_sentence_each_paragraph},
            {"name": "whole_text", "function": get_whole_text}
]

### ----------------------------- Embedding with all-MiniLM-L6-v2 model - Classification with classic algorithms

In [81]:
# Sentence-embedding model loaded from a local directory (relative to the notebook's
# working directory); the section heading names it as all-MiniLM-L6-v2.
emb_model = SentenceTransformer('./transformer-model')

def embed(data: pd.Series):
    """Encode every element of ``data`` with the module-level ``emb_model``.

    Each element is converted with ``str()`` and encoded on its own, exactly as
    before. The output width is taken from the vectors the model returns
    instead of being hard-coded to 384, so swapping in a model with a different
    embedding size no longer fails.

    Args:
        data: Series of texts (any values; they are stringified).

    Returns:
        ``float64`` array of shape ``(len(data), embedding_dim)``, rows in the
        iteration order of ``data``.
    """
    vectors = [emb_model.encode(str(text)) for text in data]
    if not vectors:
        # Nothing to stack: ask the model for its width so the result is still 2-D.
        width = emb_model.get_sentence_embedding_dimension() or 0
        return np.zeros((0, width))
    return np.asarray(vectors, dtype=np.float64)

def train_model(model, training_data, y_train):
    """Fit ``model`` on ``training_data`` / ``y_train`` and return whatever ``fit`` returns."""
    fitted = model.fit(training_data, y_train)
    return fitted

def test_model(model, test_data, y_test):
    """Predict on ``test_data`` and return the text classification report against ``y_test``."""
    predictions = model.predict(test_data)
    report = classification_report(y_test, predictions)
    return report

def get_data(feature_combination_name: str, function, df, data_augmentation: bool):
    """Call a feature builder with the arguments its feature name requires.

    Args:
        feature_combination_name: Name of the feature set; selects the call shape.
        function: The feature-builder callable.
        df: Frame handed to the builder.
        data_augmentation: Forwarded to the builder.

    Returns:
        The ``(X, y)`` pair produced by the builder.
    """
    # Builders shared by several features need the sentence count as 2nd argument.
    sentence_counts = {"title_plus_5_sentences": 5, "title_plus_10_sentences": 10}

    if feature_combination_name in sentence_counts:
        result = function(df, sentence_counts[feature_combination_name], data_augmentation)
    elif feature_combination_name == "title":
        # Here the title is the model input itself, not a prefix for other text.
        result = function(df, is_final_feature=True, data_augmentation=data_augmentation)
    else:
        result = function(df, data_augmentation)

    X, y = result
    return X, y

In [82]:
# Sanity-check every feature builder on the training split before embedding:
# all values are strings, shortest text length, matching shapes, no missing values.
for feature in features:
    feature_name, builder = feature["name"], feature["function"]
    print(feature_name)
    data, y = get_data(feature_name, builder, df_train, data_augmentation=True)
    print(all(data.apply(type) == str))
    print(data.apply(len).min())
    print(data.shape, y.shape)
    print(data.isna().sum())

title
True
3
(3160,) (3160,)
0


### ----------------------------- Training

In [83]:
# Train every classifier on every feature set; the augmented training texts are
# embedded once per feature set and reused by all classifiers.
trained_models = []

for feature in features:
    print("-" * 80)
    feature_combination_name, function = feature["name"], feature["function"]

    print("Feature: ", feature_combination_name)
    print("Data prep")

    data, y = get_data(feature_combination_name, function, df_train, data_augmentation=True)

    print("Embedding")
    embedding = embed(data)

    for clf_model_class in classification_models:

        clf_model = clf_model_class()
        # Only estimators that expose a `class_weight` parameter get it, and it is
        # set through the estimator API rather than as a raw attribute (which
        # GradientBoostingClassifier would silently ignore).
        if "class_weight" in clf_model.get_params():
            clf_model.set_params(class_weight="balanced")

        print("\tModel: ", clf_model, "id: ", id(clf_model))
        print("\t\t training")
        clf_model = train_model(clf_model, embedding, y)

        trained_models.append({
            "feature_combination_name": feature_combination_name,
            "model_name": repr(clf_model),
            "model": clf_model
        })

--------------------------------------------------------------------------------
Feature:  title
Data prep
Embedding
	Model:  GradientBoostingClassifier() id:  140461155616944
		 training
	Model:  SVC(class_weight='balanced') id:  140461154192784
		 training
	Model:  LogisticRegression(class_weight='balanced') id:  140461124282400
		 training
	Model:  RandomForestClassifier(class_weight='balanced') id:  140461124281200
		 training


### ----------------------------- Testing

In [84]:
# Evaluate every trained model on the held-out split, using the feature set it
# was trained on. `reports` collects ("<model repr> <feature name>", report dict).
reports = []

for feature in features:

    feature_combination_name, function = feature["name"], feature["function"]

    matching_models = [
        tm for tm in trained_models
        if tm.get("feature_combination_name") == feature_combination_name
    ]
    if not matching_models:
        # Nothing was trained on this feature set: skip the costly embedding step.
        continue

    # The held-out data is never augmented.
    data, y = get_data(feature_combination_name, function, df_test, data_augmentation=False)
    embedding = embed(data)

    for tm in matching_models:

        print("-" * 80)
        print(feature_combination_name)
        print(tm.get("model_name"))

        clf_model = tm.get("model")
        print(id(clf_model))
        y_pred = clf_model.predict(embedding)
        clf_report_dict = classification_report(y, y_pred, output_dict=True)
        clf_report = classification_report(y, y_pred)
        reports.append((tm.get("model_name") + " " + feature_combination_name, clf_report_dict))

        print()
        print(clf_report)
        print()

--------------------------------------------------------------------------------
title
GradientBoostingClassifier()
140461155616944

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       534
           1       0.60      0.50      0.55        54

    accuracy                           0.92       588
   macro avg       0.78      0.73      0.75       588
weighted avg       0.92      0.92      0.92       588


--------------------------------------------------------------------------------
title
SVC(class_weight='balanced')
140461154192784

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       534
           1       0.63      0.59      0.61        54

    accuracy                           0.93       588
   macro avg       0.79      0.78      0.79       588
weighted avg       0.93      0.93      0.93       588


------------------------------------------------------------------------------

### ------------------- Writing results to excel

In [140]:
# Export the positive-class metrics of every model for one feature set to Excel.
fe = "title_plus_first_paragraph"

rows = []
for name, report in reports:
    # `name` is "<model repr> <feature name>". A model repr can itself contain
    # spaces (e.g. several parameters), so match on the LAST token only.
    if name.rsplit(" ", 1)[-1] != fe:
        continue
    print(name)
    rows.append({
        "acc": round(report["accuracy"], 2),
        "f1": round(report["1"]["f1-score"], 2),
        "recall": round(report["1"]["recall"], 2),
        "prec": round(report["1"]["precision"], 2),
    })

dff = pd.DataFrame(rows, columns=["acc", "f1", "recall", "prec"])
dff.to_excel(f"{fe}.xlsx")

GradientBoostingClassifier() title_plus_first_paragraph
SVC(class_weight='balanced') title_plus_first_paragraph
LogisticRegression(class_weight='balanced') title_plus_first_paragraph
RandomForestClassifier(class_weight='balanced') title_plus_first_paragraph


# ----------------------------------- Manual Evaluation

In [47]:
# get_data() requires `data_augmentation`; without it this call raises TypeError on a
# fresh kernel. The recorded run used the un-augmented training split (1765 rows).
data, y = get_data("whole_text", get_whole_text, df_train, data_augmentation=False)

In [48]:
train_emb = embed(data)

In [49]:
train_emb.shape

(1765, 384)

In [50]:
# Unweighted random-forest baseline; the class_weight="balanced" variant is kept
# here but disabled:
# clf = RandomForestClassifier(class_weight="balanced")
clf = RandomForestClassifier()
clf.fit(train_emb, y)

RandomForestClassifier()

In [51]:
# get_data() requires `data_augmentation`; held-out data must not be augmented.
data, y = get_data("whole_text", get_whole_text, df_test, data_augmentation=False)

In [52]:
test_emb = embed(data)

In [53]:
y_pred = clf.predict(test_emb)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       535
           1       0.80      0.07      0.14        54

    accuracy                           0.91       589
   macro avg       0.86      0.54      0.55       589
weighted avg       0.90      0.91      0.88       589



In [55]:
data

1694    washington us senate set vote early next week ...
1679    jun international business time news deliver n...
1131    people gather memorial day weekend port aransa...
2206    oct better georgia newstex foreign interferenc...
1147    accuse bombmaker identify ringleader plot kidn...
                              ...                        
732     dec wrap deliver newstex trump administration ...
1551    washington ap joe bidens bet race simple one n...
2372    full text charleston west virginia vice presid...
1779    oct marketbeat deliver newstex combination pho...
568     residents say counterprotesting pledge allegia...
Name: tokens_text, Length: 589, dtype: object

In [56]:
data[y==1]

24     past years qanon movement baseless conspiracy ...
222    people buy products incentivised leave positiv...
377    new york may un human right office ohchr welco...
508    interview cnn netflix ceo reveal decision remo...
34     san francisco facebook incfacebook say wednesd...
251    minister threaten hit social media giants fin ...
113    googleowned youtube suspend former new york ci...
122    canberra australia ap australian regulators ru...
337    first congress override trump veto defense bil...
229    new delhi june special cell delhi police regis...
493    youtube reduce amount content spread conspirac...
265    lisa laflamme federal government today vow inf...
30     google earn tens millions pound every year all...
389    effort reverse flood abuse platform twitter ro...
507    tuesday us president insult top infectious dis...
86     us election test facebook say chief executive ...
46     may geller report deliver newstex gop ignore u...
504    utah governor spencer co

In [59]:
# Index label of the held-out article to inspect; printed here as raw text.
idx = 302
print(df_test.loc[idx].text)

FULL TEXT

Florida Governor Ron DeSantis announced this week that he would fine social media companies that ban political candidates. Every outlet from Fox News to MSNBC fired off missives about the bill. What got lost in the news coverage is that Silicon Valley deplatforms very few politicians, save shock-jocks like Donald Trump and Laura Loomer (if you want to call her a politician). The same cannot be said for sex workers.

This month, Centro University released a study estimating that 46 percent of adult influencers reported losing access to Twitter or Instagram in the last year. The bans put a permanent dent in the stars’ income, with Centro estimating sex workers lose $260 million a year due to social media bans. You won’t hear DeSantis, Fox News, Glenn Greenwald, or any other so-called free speech warriors decrying porn stars’ lost incomes, so let me break down how social media companies are screwing over porn stars (and not screwing them in a good way!).

Silicon Valley titans 

In [60]:
# Same article (label `idx`, set in the previous cell) after preprocessing.
print(data.loc[idx])

full text florida governor ron desantis announce week would fine social media company ban political candidates every outlet fox news msnbc fire missives bill get lose news coverage silicon valley deplatforms politicians save shockjocks like donald trump laura loomer want call politician say sex workers month centro university release study estimate percent adult influencers report lose access twitter instagram last year ban put permanent dent star income centro estimate sex workers lose million year due social media ban hear desantis fox news glenn greenwald socalled free speech warriors decry porn star lose incomes let break social media company screw porn star screw good way silicon valley titans revoke social media access multiple time take recent snapchat ban santa monicabased app bar post public account lose mean communicate fan would message snap lose percent revenue build back follow adult performers face far worse centro report show percent adult influencers report temporarily 