In [None]:
# Notebook helpers: clear_output tidies noisy cells, drive gives access to
# files stored on Google Drive.
from IPython.display import clear_output
from google.colab import drive

# Attach Google Drive at /content/drive so the dataset paths used later resolve.
drive.mount('/content/drive')

## Importing dependencies

In [None]:
!pip3 install -q ktrain
clear_output()

In [None]:
import ktrain
from ktrain import text
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# CSV file on the mounted Google Drive that holds the full labelled dataset.
DATA_PATH = '/content/drive/MyDrive/Colab_Notebooks/NLP_TeamG/datasets/all_data.csv'

## Preprocessing

In [None]:
# Load the dataset, using the CSV's first column as the DataFrame index.
data = pd.read_csv(DATA_PATH, index_col=0)

# Number of rows per label in the full dataset.
data['label'].value_counts()

In [None]:
# Display the loaded DataFrame for a quick visual check.
data

## Split all data into train and test sets

In [None]:
# Hand-made positional split of the dataset into two consecutive row blocks.
# NOTE(review): the names train_1/train_0 suggest the first 2181 rows carry
# label 1 and the next 2181 rows carry label 0 -- confirm against the
# value_counts() of `data` before trusting this split.
first_block_end = 2181   # row where the second block starts
train_size = 2000        # rows per block used for training
test_size = 181          # rows per block held out for testing
second_train_end = first_block_end + train_size

train_1 = data[:train_size]
test_1 = data[train_size:first_block_end]

train_0 = data[first_block_end:second_train_end]
test_0 = data[second_train_end:second_train_end + test_size]

# Report the size of every subset, one per line.
for subset in (train_1, train_0, test_1, test_0):
    print(len(subset))

In [None]:
# Combine the per-block slices into one training and one test frame, shuffle
# each with a fixed seed so the result is reproducible, and renumber the rows
# 0..n-1.
#
# reset_index(drop=True) discards the old index directly. The previous
# reset_index(inplace=True) + del frame['index'] only worked when the index was
# unnamed (a named index produces a column with that name, so the `del` raised
# KeyError), and it mutated the frames in place.
train = shuffle(pd.concat([train_1, train_0]), random_state=0).reset_index(drop=True)
test = shuffle(pd.concat([test_1, test_0]), random_state=0).reset_index(drop=True)

In [None]:
# Number of rows per label in the training split.
train.label.value_counts()

In [None]:
# Number of rows per label in the held-out test split.
test.label.value_counts()

In [None]:
# pd.options.display.max_rows = 4500
# df
# df.head()

In [None]:
# Summarise the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# Inputs come from the `content` column, targets from the `label` column.
X, y = train.content, train.label

# Shuffle inputs and targets in unison, then hold out 10% for validation.
# Both steps use fixed seeds so the split is reproducible.
X, y = shuffle(X, y, random_state=0)
split_parts = train_test_split(X, y, test_size=0.1, random_state=1)

# Convert every piece from a pandas Series to its underlying array.
x_train, x_val, y_train, y_val = (part.values for part in split_parts)

In [None]:
# Sizes of the training and validation sets.
print(len(x_train), len(x_val))

## Create a Transformer Model

In [None]:
# Pretrained checkpoint that is fine-tuned for the classification task.
MODEL_NAME = 'distilbert-base-uncased'

# Sort the label values so the class list does not depend on which label
# happens to come first in the shuffled `y` (unique() returns values in order
# of appearance). NOTE(review): ktrain is understood to pair integer label i
# with classes[i]; an unsorted list such as [1, 0] would then swap the class
# names reported by the predictor -- confirm against the ktrain documentation.
classes = sorted(y.unique().tolist())

# Preprocessor for the chosen checkpoint; inputs are capped at 500 tokens.
t = text.Transformer(MODEL_NAME, maxlen=500, classes=classes)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_val, y_val)

# Classifier plus a learner wrapping it with the training/validation data.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=8)

## Find the best possible learning rate

In [None]:
# learner.lr_estimate()

In [None]:
# use learner.lr_find() to find the best possible learning rate
# learner.lr_find(show_plot=True, max_epochs=3)

## Train the model

In [None]:
# Train with ktrain's one-cycle policy using the learning rate and epoch
# count below.
lr, epochs = 5e-5, 3
learner.fit_onecycle(lr, epochs)

In [None]:
# Print the layer-by-layer summary of the trained model.
learner.model.summary()

## Visualise the learning process

In [None]:
# Plot the training history recorded by the learner.
learner.plot()

In [None]:
# learner.view_top_losses(n=1, preproc=t)

In [None]:
# Bundle the trained model with its preprocessor `t` so raw strings can be
# classified directly.
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Probe the classifier with progressively shorter inputs to see how the
# predicted class and its probabilities react to individual words.
probe_texts = [
    'He likes money laundering.',
    'money laundering.',
    'money.',
    'laundering.',
]

for position, sample in enumerate(probe_texts):
    # Blank separator between consecutive probes (none before the first).
    if position:
        print('\n')
    print(predictor.predict(sample))
    print(predictor.predict_proba(sample))

## Visualise each word's contribution to the prediction

In [None]:
!pip3 install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1

In [None]:
# Classify a longer passage about anti-money-laundering enforcement.
aml_passage = (
    'Increased scrutiny by regulators is driving a global increase in the number '
    'and value of anti-money laundering penalties issued to financial institutions. '
    'Financial institutions worldwide are subsequently taking steps to mitigate the '
    'risk of receiving anti-money laundering fines from regulators through '
    'preventing exposure to sanctioned parties and ensuring anti-money laundering '
    'compliance.'
)
print(predictor.predict(aml_passage))

In [None]:
# Ask the predictor to explain its decision for the same kind of passage that
# was classified above; the returned object is rendered as the cell output.
aml_passage = (
    'Increased scrutiny by regulators is driving a global increase in the number '
    'and value of anti-money laundering penalties issued to financial institutions. '
    'Financial institutions worldwide are subsequently taking steps to mitigate the '
    'risk of receiving anti-money laundering fines from regulators through '
    'preventing exposure to sanctioned parties and ensuring anti-money laundering '
    'compliance.'
)
predictor.explain(aml_passage)

## Testing

In [None]:
# CSV file on the mounted Google Drive holding a separate, external test set.
TEST_PATH = '/content/drive/MyDrive/Colab_Notebooks/NLP_Testing/datasets/test2.csv'

In [None]:
# Load the external test set, using the CSV's first column as the index.
# Note: this rebinds `test`, replacing the hold-out split built earlier in the
# notebook from all_data.csv.
test = pd.read_csv(TEST_PATH, index_col=0)

# Number of rows per label in the external test set.
test['label'].value_counts()

In [None]:
# mini_test = test[:100]
# mini_test

In [None]:
# Accuracy of the predictor on the external test set.
#
# Each prediction is compared with the row's own label. The earlier version
# counted a prediction as correct whenever it equalled 0, which is only valid
# if every test row has label 0. For such a file the result is unchanged.
texts = test.iloc[:, 0]    # first data column holds the text to classify
labels = test['label']
n = len(test)

correct = 0
for sample, expected in zip(texts, labels):
    prediction = predictor.predict(sample)
    if prediction == expected:
        correct += 1

# An empty test set has no defined accuracy; report NaN instead of dividing by 0.
acc = correct / n if n else float('nan')
print(f'The testing accuracy is {acc}')