This is a short notebook dedicated solely to the Logistic Regression model. We will use the cleaned and lemmatized dataset that we exported as a .csv file during the preprocessing step.

In [2]:
import warnings
# Ignore every warning of category UserWarning from here on.
# NOTE(review): scikit-learn's ConvergenceWarning subclasses UserWarning, so this
# presumably also hides non-convergence of the 'sag' solver used further down —
# confirm that is intended.
warnings.simplefilter("ignore", UserWarning)

In [2]:
import pandas as pd
import numpy as np
import json
import math
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score, f1_score, confusion_matrix

import scipy
from scipy.sparse import hstack

# This module will be for saving the trained model for later use
import joblib

import spacy
import re

In [4]:
import spacy
import re

# Load the 'en_core_web_sm' spaCy pipeline once; lemmatize() below reuses it on every call.
nlp = spacy.load('en_core_web_sm')


# Should be (almost) the same as Canberk's, but slightly faster, as the regexes are
# compiled once here instead of on every call.
# regex1 matches what gets deleted: tokens starting with "http", #hashtags, @mentions,
# any single character that is neither a word character nor whitespace, and any
# run of word characters that contains a digit.
regex1 = re.compile(r'(http\S+)|(#(\w+))|(@(\w+))|[^\w\s]|(\w*\d\w*)')
# regex2 matches either a run of one or more spaces or a single newline.
regex2 = re.compile(r'( +)|(\n)')

def lemmatize(article):
    """Clean ``article`` and return its lemmas joined by single spaces.

    Every match of ``regex1`` is deleted and every match of ``regex2`` is
    replaced by one space; the result is stripped and lower-cased, then passed
    through the ``nlp`` pipeline. Tokens flagged as stop words are dropped and
    the lemmas of the remaining tokens are joined with " ".
    """
    without_noise = regex1.sub('', article)
    normalized = regex2.sub(' ', without_noise).strip().lower()

    lemmas = []
    for token in nlp(normalized):
        if not token.is_stop:
            lemmas.append(token.lemma_)

    return " ".join(lemmas)

In [114]:
# Load both labelled training sets straight from their zipped CSVs.
am = pd.read_csv('../adverse_media_training.csv.zip')
nam = pd.read_csv('../non_adverse_media_training.csv.zip')

# Confirmed adverse media: 'am' rows from either file (the `am` file also
# contains the label with a trailing space, 'am ').
am_confirmed = pd.concat([
    am[am.label.isin(['am', 'am '])],
    nam[nam.label == 'am'],
])

# Confirmed non-adverse media: 'nam' and 'random' rows from either file.
nam_labels = ['nam', 'random']
nam_confirmed = pd.concat([
    nam[nam.label.isin(nam_labels)],
    am[am.label.isin(nam_labels)],
])

# Binary target: 1 = adverse media, 0 = non-adverse media.
am_confirmed['is_adverse_media'] = 1
nam_confirmed['is_adverse_media'] = 0

# Creating the train dataset
data = pd.concat([am_confirmed, nam_confirmed])
print(data.shape)
print()
print(data['is_adverse_media'].value_counts())


(729, 12)

1    411
0    318
Name: is_adverse_media, dtype: int64


In [115]:
# Prepend the title to the article body and lemmatize the combined text.
# NOTE: this overwrites "article", so re-running the cell prepends the title again.
title_and_body = data["title"] + " " + data["article"]
data["article"] = title_and_body
data["lemmatized"] = title_and_body.apply(lemmatize)

# Shuffle the rows reproducibly and renumber them 0..n-1.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [116]:

# (rows, columns) of the non-adverse (0) and adverse (1) subsets, in that order.
tuple(data[data.is_adverse_media == flag].shape for flag in (0, 1))

((318, 13), (411, 13))

In [152]:
# Copy of the shuffled data with two placeholder columns (initialised to 0.0)
# that the cross-validation loop fills with predicted class probabilities.
orig_df = data.assign(proba0=0.0, proba1=0.0)

In [153]:
import pandas as pd
from sklearn.model_selection import KFold 
 
# Cross validation
#
# For each of the k folds, fit a TF-IDF vectorizer and a logistic regression on
# the training part only, then store the predicted class probabilities of the
# held-out part in orig_df. After the loop every row of orig_df carries a
# probability produced by a model that never saw that row during training.

k = 5
kf = KFold(n_splits=k, random_state=42, shuffle=True)

# Positions of the probability columns in orig_df, looked up once.
# get_loc raises KeyError if a column is missing instead of writing elsewhere.
proba_col_positions = [orig_df.columns.get_loc(col) for col in ('proba0', 'proba1')]

for train_index, test_index in kf.split(data):
    # kf.split yields *positional* indices, so select with .iloc rather than by label.
    x_train, x_val = data['lemmatized'].iloc[train_index], data['lemmatized'].iloc[test_index]
    y_train, y_val = data['is_adverse_media'].iloc[train_index], data['is_adverse_media'].iloc[test_index]

    # The vectorizer is fitted on the training fold only, so no vocabulary or
    # document-frequency statistics leak from the validation fold.
    ngram_vectorizer = TfidfVectorizer(max_features=40000,
                             min_df=5,
                             max_df=0.5,
                             analyzer='word',
                             stop_words='english',
                             ngram_range=(1, 3))
    ngram_vectorizer.fit(x_train)

    tfidf_train = ngram_vectorizer.transform(x_train)
    tfidf_validation = ngram_vectorizer.transform(x_val)

    lr = LogisticRegression(solver='sag', random_state=42)
    lr.fit(tfidf_train, y_train)

    # predict_proba returns one column per class in lr.classes_ order (0, then 1),
    # i.e. (proba0, proba1). orig_df is a row-for-row copy of data, so the
    # validation positions address the same articles. Writing by position avoids
    # matching rows on their lemmatized text, which overwrote every duplicate
    # text with whichever fold ran last and cost one full scan per prediction.
    orig_df.iloc[test_index, proba_col_positions] = lr.predict_proba(tfidf_validation)


In [155]:
# Persist the articles together with their cross-validated probabilities.
proba_csv_path = '../original_training_data_with_probabilities_from_logit_model.csv'
orig_df.to_csv(proba_csv_path, index=False)

In [156]:
# Sanity check: rows whose proba0 still holds the 0.0 placeholder.
# An empty frame means every article received a prediction.
still_placeholder = orig_df['proba0'].eq(0.0)
orig_df[still_placeholder]

Unnamed: 0,source,entity_name,entity_type,url,article,full_response,label,explanation,assessor,comment,title,is_adverse_media,lemmatized,proba0,proba1


In [157]:
# Compress the exported CSV into a .zip next to it (shell escape; needs the `zip` tool on PATH).
!zip ../original_training_data_with_probabilities_from_logit_model.csv.zip ../original_training_data_with_probabilities_from_logit_model.csv

  adding: ../original_training_data_with_probabilities_from_logit_model.csv (deflated 82%)


In [159]:
# Read the zipped export back to check that it round-trips.
# NOTE(review): lineterminator='\n' presumably keeps stray '\r' characters inside
# the article text from being treated as row breaks — confirm.
zipped_proba_csv_path = '../original_training_data_with_probabilities_from_logit_model.csv.zip'
pd.read_csv(zipped_proba_csv_path, lineterminator='\n')

Unnamed: 0,source,entity_name,entity_type,url,article,full_response,label,explanation,assessor,comment,title,is_adverse_media,lemmatized,proba0,proba1
0,Shakshi,Philippines,Government,https://www.theguardian.com/world/2020/jul/09/...,Duterte's anti-terror law a dark new chapter f...,[{'query': {'id': '1605369535013-f6b21141ed514...,nam,new anti-terror bill,Shakshi,,Duterte's anti-terror law a dark new chapter f...,0,duterte antiterror law dark new chapter philip...,0.769765,0.230235
1,Canberk,Standard Chartered,Company,https://www.reuters.com/article/us-stanchart-s...,Singapore fines Standard Chartered entities $4...,[{'query': {'id': '1605051648461-00acd9e69551a...,am,money laundering,Carel,,Singapore fines Standard Chartered entities $4...,1,singapore fines standard chartered entity mill...,0.570707,0.429293
2,Shakshi,Pahlawan ROZI,Individual,https://www.treasury.gov/press-center/press-re...,Treasury Designates Drug Trafficker in Norther...,[{'query': {'id': '1605054791688-48fe4477c5594...,am,narcotics trafficking,Carel,,Treasury Designates Drug Trafficker in Norther...,1,treasury designate drug trafficker northern af...,0.209541,0.790459
3,Sebastien,NURJAMAN RIDUAN ISAMUDDIN,individual,http://dictionnaire.sensagent.leparisien.fr/ri...,"From Wikipedia, the free encyclopedia Wikipedi...",[{'query': {'id': '1605044327832-8d0ce7562106f...,am,terrorism,Carel,,"From Wikipedia, the free encyclopedia",1,wikipedia free encyclopedia wikipedia voir aus...,0.330436,0.669564
4,Shakshi,Morente,Government,https://businessmirror.com.ph/2020/11/10/moren...,"Morente: New law last option to end graft, cor...",[{'query': {'id': '1605369151537-b954c15065890...,nam,,Shakshi,,"Morente: New law last option to end graft, cor...",0,morente new law option end graft corruption bi...,0.759299,0.240701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,Sebastien,Lamine Diack,individual,https://www.bbc.com/sport/athletics/54176537,Lamine Diack: Former IAAF head found guilty of...,[{'query': {'id': '1605045641286-fa5d7e18d8ddc...,am,corruption,Carel,,Lamine Diack: Former IAAF head found guilty of...,1,lamine diack iaaf head find guilty corruption ...,0.252535,0.747465
725,Wanting,Park Geun-hye,individual,https://www.bbc.com/news/world-asia-37971085,South Korea's presidential scandal Published\n...,[{'query': {'id': '1605368847433-8295c35bcf801...,am,corruption,Canberk,,South Korea's presidential scandal,1,south koreas presidential scandal publish im...,0.272804,0.727196
726,Sebastien,Ricky George,individual,https://www.bbc.com/news/uk-england-hereford-w...,Hereford United legend Ricky George jailed for...,[{'query': {'id': '1605045990306-1f220d31fff61...,am,money laundering,Carel,,Hereford United legend Ricky George jailed for...,1,hereford united legend ricky george jail money...,0.253502,0.746498
727,Shakshi,Philippines,Individual,https://news.google.com/articles/CAIiEA2NF3jZo...,Philippine anti-terrorism law triggers fear of...,[{'query': {'id': '1605373910263-a63fb4a8e8dae...,nam,,Shakshi,,Philippine anti-terrorism law triggers fear of...,0,philippine antiterrorism law trigger fear mass...,0.722767,0.277233
