In [3]:
import spacy
import re
import pandas as pd
import numpy as np

# spaCy's small English pipeline. `lemmatize` below passes every cleaned article
# through it and reads each token's lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm')

# Noise removed from the raw text before it reaches spaCy. The pieces are adjacent
# raw-string literals, so Python joins them into one single alternation.
regex1 = re.compile(
    r'(http\S+)'     # "http" plus everything up to the next whitespace (links)
    r'|(#(\w+))'     # hashtags
    r'|(@(\w+))'     # @-mentions
    r'|[^\w\s]'      # any other character that is neither word nor whitespace
    r'|(\w*\d\w*)'   # every word that contains at least one digit
)

# Runs of whitespace; `lemmatize` replaces each run with a single space.
regex2 = re.compile(
    r'(\s+)'
    r'|(\n+)'
)

def lemmatize(article):
    """Clean an article and return its lemmas as one space-separated string.

    Steps: delete everything matched by ``regex1``, replace every whitespace
    run matched by ``regex2`` with one space, strip and lower-case the result,
    run it through the spaCy pipeline ``nlp`` and keep the lemma of every
    token that is not flagged as a stop word.

    Parameters
    ----------
    article : str or missing value
        Raw text. ``None`` / ``NaN`` (what pandas yields for an empty CSV cell,
        or for ``title + " " + article`` when either side is missing) is
        treated as an empty document.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces; ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself. Guarding here keeps
    # `Series.apply(lemmatize)` from failing with a TypeError inside `re`.
    if article is None or (isinstance(article, float) and article != article):
        return ""

    cleaned = regex1.sub('', article)
    cleaned = regex2.sub(' ', cleaned).strip().lower()

    doc = nlp(cleaned)
    return " ".join(token.lemma_ for token in doc if not token.is_stop)

In [4]:
am = pd.read_csv('../adverse_media_training.csv.zip')
nam = pd.read_csv('../non_adverse_media_training.csv.zip')

# Rows are assigned to a class by their `label` value, not by the file they came
# from: each file may contribute rows to either class.
non_adverse_labels = ['nam', 'random']

am_confirmed = pd.concat([
    # 'am ' (with a trailing space) is matched on purpose for this file;
    # presumably a labelling typo - TODO confirm.
    am.loc[am.label.isin(['am', 'am '])],
    nam.loc[nam.label == 'am'],
])
nam_confirmed = pd.concat([
    nam.loc[nam.label.isin(non_adverse_labels)],
    am.loc[am.label.isin(non_adverse_labels)],
])

am_confirmed['is_adverse_media'] = 1
nam_confirmed['is_adverse_media'] = 0

# Creating the train dataset
data = pd.concat([am_confirmed, nam_confirmed])

# A missing title or body would turn the whole concatenation into NaN (losing the
# other half and breaking the regex step in `lemmatize`), so missing parts are
# treated as empty strings.
data["article"] = data["title"].fillna("") + " " + data["article"].fillna("")
data["lemmatized"] = data["article"].apply(lemmatize)

# Shuffle reproducibly and renumber the rows 0..n-1 without keeping the old index.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
public_test = pd.read_csv('../public_test.csv').drop(columns=['id'])

# Title and body are joined with a single space. Missing parts become "" so that
# one NaN cannot blank out the other half or break the regex step in `lemmatize`.
public_test["article"] = public_test["title"].fillna("") + " " + public_test["article"].fillna("")
public_test["lemmatized"] = public_test["article"].apply(lemmatize)
public_test

Unnamed: 0,title,article,label,lemmatized
0,Caputo concealed Cayman Island offshore firms ...,Caputo concealed Cayman Island offshore firms ...,1,caputo conceal cayman island offshore firm arg...
1,California Man Pleads Guilty in $6 Million Art...,California Man Pleads Guilty in $6 Million Art...,1,california man plead guilty million art fraud ...
2,Couple jailed for laundering £50m,Couple jailed for laundering £50m A couple who...,1,couple jail launder couple run diamond trading...
3,John Gilligan charged with money laundering of...,John Gilligan charged with money laundering of...,1,john gilligan charge money laundering offence ...
4,Grace Mugabe faces arrest in Mary Chiwenga Sty...,Grace Mugabe faces arrest in Mary Chiwenga Sty...,1,grace mugabe face arrest mary chiwenga style s...
...,...,...,...,...
154,Kanye West's strange presidential bid unravels...,Kanye West's strange presidential bid unravels...,0,kanye west strange presidential bid unravel th...
155,Anti-money laundering software startup TookiTa...,Anti-money laundering software startup TookiTa...,0,antimoney laundering software startup tookitak...
156,If we really want to know what makes terrorist...,If we really want to know what makes terrorist...,0,want know make terrorist commit atrocity half ...
157,An effective e-declaration system will be a wa...,An effective e-declaration system will be a wa...,0,effective edeclaration system watershed countr...


In [14]:
# Keep only the lemmatized text and the label, in that order.
public_test = public_test.loc[:, ['lemmatized', 'label']]
public_test

Unnamed: 0,lemmatized,label
0,caputo conceal cayman island offshore firm arg...,1
1,california man plead guilty million art fraud ...,1
2,couple jail launder couple run diamond trading...,1
3,john gilligan charge money laundering offence ...,1
4,grace mugabe face arrest mary chiwenga style s...,1
...,...,...
154,kanye west strange presidential bid unravel th...,0
155,antimoney laundering software startup tookitak...,0
156,want know make terrorist commit atrocity half ...,0
157,effective edeclaration system watershed countr...,0


In [12]:
# Training frame holding only the lemmatized text and the target, with the target
# column renamed from `is_adverse_media` to `label`. Selecting and renaming in one
# expression returns a new frame, so no column is assigned onto a slice (which
# triggers pandas' SettingWithCopyWarning) and `data` is not copied in full.
train = data[['lemmatized', 'is_adverse_media']].rename(columns={'is_adverse_media': 'label'})
train

Unnamed: 0,lemmatized,label
0,duterte antiterror law dark new chapter philip...,0
1,singapore fines standard chartered entity mill...,1
2,treasury designate drug trafficker northern af...,1
3,wikipedia free encyclopedia wikipedia voir aus...,1
4,morente new law option end graft corruption bi...,0
...,...,...
724,lamine diack iaaf head find guilty corruption ...,1
725,south koreas presidential scandal publish imag...,1
726,hereford united legend ricky george jail money...,1
727,philippine antiterrorism law trigger fear mass...,0


In [None]:
# Stack the confirmed adverse-media rows on top of the confirmed non-adverse rows;
# each row keeps the index label it had in its source frame.
big_train = pd.concat([am_confirmed, nam_confirmed])
