## Preprocessing

Let us first keep only the necessary columns, combine the data into one collection, and convert the labels to binary.

In [16]:
# Mount Google Drive so that the project files are reachable from the Colab runtime.
from google.colab import drive
import glob
drive.mount('/content/drive')

# Make the project folder the working directory; the relative paths used by
# later cells (e.g. '../adverse_media_training.csv') are resolved against it.
import os
path = '/content/drive/My Drive/Colab Notebooks/Machine Learning/project/Karl'
os.chdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pandas as pd
import numpy as np


# Load the two raw training files (paths are relative to the working directory).
am_raw = pd.read_csv('../adverse_media_training.csv')
nam_raw = pd.read_csv('../non_adverse_media_training.csv')

# Keep only needed columns
am_cropped = am_raw[['article', 'title', 'label']]
nam_cropped = nam_raw[['article', 'title', 'label']]

# The raw labels contain stray whitespace (e.g. 'am ' next to 'am'). Compare on
# a stripped copy so every spelling of a label is matched, not only the ones
# that were spotted by hand.
am_file_label = am_cropped['label'].astype(str).str.strip()
nam_file_label = nam_cropped['label'].astype(str).str.strip()

# Combine source files and re-label to binary.
# Positive class (1): rows labelled 'am' in either file.
am = pd.concat([
    am_cropped.loc[am_file_label == 'am'],
    nam_cropped.loc[nam_file_label == 'am'],
])
am['label'] = 1

# Negative class (0): 'nam' or 'random' rows of the adverse-media file plus
# 'nam' rows of the non-adverse-media file.
nam = pd.concat([
    am_cropped.loc[am_file_label.isin(['nam', 'random'])],
    nam_cropped.loc[nam_file_label == 'nam'],
])
nam['label'] = 0


# Combine data into one table
data = pd.concat([am, nam])

# Prepend the title to the article text. A missing title or article would turn
# the whole concatenation into NaN (which the string-based cleaning cannot
# handle), so missing text is treated as empty.
data["article"] = data["title"].fillna("") + " " + data["article"].fillna("")
data = data.drop(["title"], axis=1)


Now let us combine the article and title columns, remove punctuation, lowercase the text, etc.

In [22]:
import spacy
import re

# Small English spaCy pipeline; lemmatize() uses it for stop-word flags and lemmas.
nlp = spacy.load('en_core_web_sm')


# Should be (almost) the same as Canberk's, but slightly faster, as the regexes are compiled only once.
# regex1 matches the text to delete: URLs, #hashtags, @mentions, any character that is
# neither a word character nor whitespace, and any word containing a digit.
regex1 = re.compile(r'(http\S+)|(#(\w+))|(@(\w+))|[^\w\s]|(\w*\d\w*)')
# regex2 matches runs of whitespace (newlines included) to be collapsed into one space.
regex2 = re.compile(r'(\s+)|(\n+)')

def lemmatize(article):
    """Clean a raw text and return its lemmas, without stop words, joined by spaces.

    The text matched by ``regex1`` is deleted, whitespace runs matched by
    ``regex2`` are collapsed to a single space, and the result is stripped and
    lower-cased before being parsed with the spaCy pipeline ``nlp``.

    Args:
        article: Raw text as a string.

    Returns:
        A single string containing the lemma of every non-stop-word token.
    """
    without_noise = regex1.sub('', article)
    normalized = regex2.sub(' ', without_noise).strip().lower()

    kept_lemmas = []
    for token in nlp(normalized):
        if not token.is_stop:
            kept_lemmas.append(token.lemma_)

    return " ".join(kept_lemmas)

Lemmatizing the whole dataset:

In [23]:
# Lemmatize every article and renumber the rows from 0; the concatenated
# source frames otherwise keep the index values of their original files.
train = (
    data[['article', 'label']]
    .assign(article=lambda frame: frame['article'].apply(lemmatize))
    .reset_index(drop=True)
)
train

Unnamed: 0,article,label
0,crooked ceos bernie madoff schedule sentence j...,1
1,fund manager force resign bbc investigation pu...,1
2,peregrine financial group boss admit fraud pub...,1
3,american accuse congo official unlawful arrest...,1
4,bitcoin foundation vice chair arrest money lau...,1
...,...,...
721,lead uk bank strengthen fight rise payment fra...,0
722,shadow chancellor call minister fulfil pledge ...,0
723,peru oust president threaten rule law washingt...,0
724,france give online firm hour pull terrorist co...,0


Reference: https://www.analyticsvidhya.com/blog/2020/10/simple-text-multi-classification-task-using-keras-bert/


In [24]:
# Download the BERT tokenization helper; a later cell imports it as `tokenization`.
# NOTE(review): the URL tracks the `master` branch, so the file can change or move — consider pinning a commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [25]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 18.9MB/s eta 0:00:01[K     |▋                               | 20kB 25.5MB/s eta 0:00:01[K     |▉                               | 30kB 24.8MB/s eta 0:00:01[K     |█▏                              | 40kB 18.1MB/s eta 0:00:01[K     |█▌                              | 51kB 15.6MB/s eta 0:00:01[K     |█▊                              | 61kB 17.6MB/s eta 0:00:01[K     |██                              | 71kB 14.2MB/s eta 0:00:01[K     |██▍                             | 81kB 15.5MB/s eta 0:00:01[K     |██▋                             | 92kB 15.1MB/s eta 0:00:01[K     |███                             | 102kB 13.8MB/s eta 0:00:01[K     |███▎                            | 112kB 13.8MB/s eta 0:00:01[K     |███▌        

In [26]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
logging.basicConfig(level=logging.INFO)

In [75]:
import tensorflow_hub as hub
import tokenization
# The layer is loaded from the relative path 'bert-layer' instead of the hub URL below
# (presumably a saved copy of that same model — TODO confirm).
# module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
module_path = 'bert-layer'
# trainable=True: the BERT weights are updated during training, not frozen.
bert_layer = hub.KerasLayer(module_path, trainable=True)

INFO:absl:resolver HttpCompressedFileResolver does not support the provided handle.
INFO:absl:resolver GcsCompressedFileResolver does not support the provided handle.
INFO:absl:resolver HttpUncompressedFileResolver does not support the provided handle.


# Possible improvement: 
Read TODO comment below

In [76]:
# Build the tokenizer from the vocabulary file and lower-casing flag that are
# stored with the loaded BERT layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into the three fixed-length integer arrays fed to the BERT layer.

    Every text is tokenized, truncated so that it fits together with the
    leading "[CLS]" and trailing "[SEP]" markers, converted to ids and
    right-padded with 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row. Must be at least 2 so that the
            two marker tokens fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2. The truncation slice
            would then use a negative bound and yield rows longer than
            ``max_len``.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        pieces = tokenizer.tokenize(text)

        # TODO: Should be changed to split the text into chunks, process each chunk separately, and later combine
        pieces = pieces[:max_len - 2]

        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The explicit dtype and reshape keep an integer (0, max_len) array
        # when `texts` is empty, instead of an empty float vector.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

# Possible improvements: 
Add layers, change parameters, or replace the classifier with any other model. It works for now; just keep the "bert_layer" in there as one of the first steps.

In [77]:
def build_model(bert_layer, max_len=512):
    """Build and compile a two-class classifier on top of a BERT layer.

    The model takes the three arrays produced by ``bert_encode`` (token ids,
    attention mask, segment ids), runs them through ``bert_layer`` and feeds
    the sequence output at position 0 (where ``bert_encode`` places "[CLS]")
    into a small dense head ending in a 2-way softmax.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a
            ``(pooled_output, sequence_output)`` pair.
        max_len: Length of each input sequence; must match the ``max_len``
            used when encoding the texts.

    Returns:
        A compiled ``tf.keras`` model (Adam, learning rate 1e-5, categorical
        cross-entropy loss, accuracy metric). Labels must be one-hot encoded.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]

    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(2, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [78]:
## test-train split: 
from sklearn.model_selection import train_test_split
# NOTE(review): this reloads the lemmatized data from 'all_lemmatized.csv' and
# replaces the `train` frame built earlier; no visible cell writes that file —
# confirm where it comes from. Columns 1 and 2 are kept (presumably article and
# label, after a saved index column — verify).
train = pd.read_csv('all_lemmatized.csv', lineterminator='\n').iloc[:, 1:3]

# Seed the shuffle as well: train_test_split selects rows by position, so an
# unseeded shuffle here gives a different split on every run despite
# random_state=42 below.
bert_train = train.sample(frac=1, random_state=42)

x_train, x_val, y_train, y_val = train_test_split(bert_train['article'],
                                                    bert_train['label'],
                                                    test_size=0.1,
                                                    random_state=42,
                                                    stratify=bert_train['label'])

print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(642,) (72,) (642,) (72,)


# Increasing the max_len parameter may give better results, at the cost of longer run time

In [79]:
import keras  # kept so that other cells can still reference `keras`; no longer used in this cell
max_len = 500 # Larger takes longer
# Encode both splits with the same max_len; test_input holds the validation split (x_val).
train_input = bert_encode(x_train, tokenizer, max_len=max_len)
test_input = bert_encode(x_val, tokenizer, max_len=max_len)
# One-hot labels for the 2-way softmax. Use the tf.keras utility so label encoding
# comes from the same Keras implementation the model is built with.
train_labels = tf.keras.utils.to_categorical(y_train, num_classes=2)

In [80]:
# Build the classifier on top of the loaded BERT layer, with the same max_len
# that was used to encode the inputs, and print its layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 500)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [None]:
# Keep only the best epoch (by validation accuracy) on disk. The file name has to
# match the one given to model.load_weights() in the evaluation cell, which reads
# 'model.h5'; writing to a different file would make that cell load stale weights.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
# Stop once validation accuracy has not improved for 5 consecutive epochs.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# validation_split holds out 20% of the training arrays to compute val_accuracy;
# the separate x_val split is not used during fitting.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=10,
    callbacks=[checkpoint, earlystopping],
    batch_size=8,
    verbose=1)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.89922, saving model to model.h5
Epoch 2/10
Epoch 00002: val_accuracy did not improve from 0.89922
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.89922
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.89922
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.89922
Epoch 6/10
Epoch 00006: val_accuracy did not improve from 0.89922
Epoch 00006: early stopping


In [81]:
# Load the weights stored in 'model.h5' and predict class probabilities for the
# encoded validation split (test_input was built from x_val).
model.load_weights('model.h5')
test_pred = model.predict(test_input)

In [82]:
# Turn each prediction row into a hard label: 1 when the class-1 probability
# exceeds 0.5, otherwise 0.
pred = []
for class_probs in test_pred:
    pred.append(int(class_probs[1] > 0.5))
pred[:5]

[0, 1, 0, 1, 1]

In [83]:
from sklearn.metrics import f1_score


# F1 of the thresholded predictions against the validation labels, reported as a percentage.
val_f1_score = f1_score(y_val, pred)
val_f1_percent = round(val_f1_score * 100, 3)

print('F1 score for model on validation data:', val_f1_percent)

F1 score for model on validation data: 93.506


In [84]:
# Load the public test set (path relative to the working directory) and display it.
public_test = pd.read_csv('../public_test.csv')
public_test

Unnamed: 0,id,title,article,label
0,931,Caputo concealed Cayman Island offshore firms ...,"By Sandra Crucianelli, Emilia Delfino y From B...",1
1,644,California Man Pleads Guilty in $6 Million Art...,A California man pleaded guilty in federal cou...,1
2,881,Couple jailed for laundering £50m,A couple who ran a diamond trading business ha...,1
3,841,John Gilligan charged with money laundering of...,image copyrightRTÉ\n\nA Dublin man has been ch...,1
4,31,Grace Mugabe faces arrest in Mary Chiwenga Sty...,Zimbabwe News\n\nGrace Mugabe faces arrest in ...,1
...,...,...,...,...
154,348,Kanye West's strange presidential bid unravels...,(CNN) Kanye West is on the ballot in Minnesota...,0
155,295,Anti-money laundering software startup TookiTa...,"TookiTaki, a startup that develops machine lea...",0
156,311,If we really want to know what makes terrorist...,In the last two and half years I’ve studied th...,0
157,545,An effective e-declaration system will be a wa...,"BY MARCUS BRAND,\n\nTwo-and-a-half years ago, ...",0


In [85]:
# Prepend the title to the article text, as is done for the training data.
# The guard makes the cell safe to re-run: without it, each run would prepend
# the title again.
if "title" in public_test.columns:
    # Missing text is treated as empty so the concatenation never yields NaN.
    public_test["article"] = public_test["title"].fillna("") + " " + public_test["article"].fillna("")
    # drop() returns a new frame; the result must be assigned for the column to go away.
    public_test = public_test.drop(["title"], axis=1)
public_test

Unnamed: 0,id,article,label
0,931,Caputo concealed Cayman Island offshore firms ...,1
1,644,California Man Pleads Guilty in $6 Million Art...,1
2,881,Couple jailed for laundering £50m A couple who...,1
3,841,John Gilligan charged with money laundering of...,1
4,31,Grace Mugabe faces arrest in Mary Chiwenga Sty...,1
...,...,...,...
154,348,Kanye West's strange presidential bid unravels...,0
155,295,Anti-money laundering software startup TookiTa...,0
156,311,If we really want to know what makes terrorist...,0
157,545,An effective e-declaration system will be a wa...,0


In [49]:
# Apply the same cleaning and lemmatization as for the training data and
# renumber the rows from 0.
public_test_lemmatized = (
    public_test[['article', 'label']]
    .assign(article=lambda frame: frame['article'].apply(lemmatize))
    .reset_index(drop=True)
)
public_test_lemmatized

Unnamed: 0,article,label
0,caputo conceal cayman island offshore firm arg...,1
1,california man plead guilty million art fraud ...,1
2,couple jail launder couple run diamond trading...,1
3,john gilligan charge money laundering offence ...,1
4,grace mugabe face arrest mary chiwenga style s...,1
...,...,...
154,kanye west strange presidential bid unravel th...,0
155,antimoney laundering software startup tookitak...,0
156,want know make terrorist commit atrocity half ...,0
157,effective edeclaration system watershed countr...,0


In [86]:

# Encode the lemmatized public test articles with the same tokenizer and max_len
# used for training, then predict class probabilities.
public_test_tokenized = bert_encode(public_test_lemmatized['article'], tokenizer, max_len=max_len)
public_test_probs = model.predict(public_test_tokenized)

# Hard label per row: 1 when the class-1 probability exceeds 0.5, otherwise 0.
public_test_pred = []
for class_probs in public_test_probs:
    public_test_pred.append(1 if class_probs[1] > 0.5 else 0)

public_test_f1_score = f1_score(public_test_lemmatized['label'], public_test_pred)
public_test_f1_percent = round(public_test_f1_score * 100, 3)

print('F1 score for model on public test data:', public_test_f1_percent)

F1 score for model on public test data: 94.301
