In [None]:
import json
import lzma
import os
# save and load tokenizer
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, confusion_matrix

import pandas as pd
pd.set_option('display.max_colwidth', None)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import metrics

from tqdm import tqdm
tqdm.pandas()
import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
# Every location is resolved relative to the directory the notebook runs from.
cwd = Path.cwd()
notebook_parent = cwd.parent
data_dir = notebook_parent / 'data'
resources_dir = notebook_parent / 'resources'
# Build artefacts live three levels above the working directory.
target_dir = notebook_parent.parent.parent / 'target/ml'

In [None]:
# Print the full path of every file found below the data directory.
for folder, _subfolders, names in os.walk(data_dir):
    for name in names:
        print(os.path.join(folder, name))


## Prepare train and test set

In [None]:
# Read the questions file; the first CSV column becomes the frame's index.
datafile = data_dir / 'questions.csv'
data = pd.read_csv(datafile, index_col=0)

# Preview the first rows.
data.head()

### Set Parameters

In [None]:
# Vocabulary size given to the Embedding layer.
max_features = 10000

# Width of each embedding vector given to the Embedding layer.
embedding_vecor_length = 300

# Options forwarded to pad_sequences: pad at the front, truncate at the back.
meta_parameters = dict(padding='pre', truncating='post')

### Decide maxlen

In [None]:
# Distribution of whitespace-separated token counts per question.
data["question_text"].str.split().str.len().describe()

In [None]:
# Sequence length handed to pad_sequences in preprocessing() and used as the
# Embedding layer's input_length.
maxlen = 50

### Data balance and upsampling

In [None]:
# Hold out 10% of the data for testing, then 10% of the remainder for validation.
train_and_val, test = train_test_split(data, test_size=0.1, random_state=42)
train, val = train_test_split(train_and_val, test_size=0.1, random_state=42)

for template, subset in (
    ("Training dataset: {}", train),
    ("Testing dataset: {}", test),
    ("validate dataset: {}", val),
):
    print(template.format(len(subset)))

In [None]:
# Class balance of the training split before upsampling. The value is the
# number of negatives per positive, so the label reads neg/pos.
num_pos = len(train[train.target == 1])
numNeg = len(train[train.target == 0])
if num_pos == 0:
    raise ValueError("Cannot upsample: the training split has no positive examples")
prep = numNeg / num_pos
print("Training dataset neg/pos: {}\n".format(prep))

# Upsample the positive class (sampling with replacement) until it has as many
# rows as the negative class.
# NOTE: this overwrites `train`, so re-running this cell resamples from the
# already upsampled frame; re-run the split cell first.
pos = train[train.target == 1].sample(numNeg, replace=True, random_state=42).reset_index(drop=True)
neg = train[train.target == 0].reset_index(drop=True)
train = pd.concat([pos, neg]).reset_index(drop=True)

print("Training dataset pos: {}".format(len(train[train.target == 1])))
print("Training dataset neg: {}".format(len(train[train.target == 0])))

### Preprocessing

In [None]:
def preprocessing(question, tokenizer, meta_parameters):
    """Convert the ``question_text`` column into padded token-id sequences.

    Args:
        question: frame with a ``question_text`` column; missing values are
            replaced by the placeholder ``"_na_"`` before tokenising.
        tokenizer: object exposing ``texts_to_sequences``.
        meta_parameters: mapping with the ``'padding'`` and ``'truncating'``
            options forwarded to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences``; the sequence length is taken from the
        notebook-level ``maxlen``.
    """
    texts = question["question_text"].fillna("_na_").values
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        sequences,
        maxlen=maxlen,
        padding=meta_parameters['padding'],
        truncating=meta_parameters['truncating'],
    )

In [None]:
# Restore the previously saved tokenizer.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
tokenizerfile = resources_dir / "tokenizer.pickle"
with tokenizerfile.open('rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Features: padded token sequences for each split.
train_X, test_X, val_X = (
    preprocessing(split, tokenizer, meta_parameters) for split in (train, test, val)
)
# Labels: the matching `target` column of each split.
train_y, test_y, val_y = (split["target"] for split in (train, test, val))

### Load embedding

In [None]:
# The embedding matrix is stored as an LZMA-compressed pickle (`lzma` is
# imported with the other modules in the first cell).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
matrixfile = resources_dir/"embedding_matrix.pkl.lzma"
with lzma.open(matrixfile, 'rb') as handle:
    embedding_matrix = pickle.load(handle)

embedding_matrix.shape

## Train Model

In [None]:
def model_init():
    """Build and compile the LSTM question classifier.

    Layers: Embedding initialised from the notebook-level ``embedding_matrix``
    -> Dropout(0.2) -> LSTM(128) -> Dropout(0.5) -> single sigmoid unit.
    The vocabulary size, embedding width and sequence length come from the
    notebook-level ``max_features``, ``embedding_vecor_length`` and ``maxlen``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer, AUC metric).
    """
    model = Sequential()
    model.add(Embedding(max_features, embedding_vecor_length, input_length=maxlen,  weights=[embedding_matrix]))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[metrics.AUC()])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [None]:
model = model_init()

# With epochs=1 the run ends after the first epoch, so this callback cannot
# shorten it; it only takes effect once epochs is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, min_delta=0.0001)
history = model.fit(
    train_X,
    train_y,
    epochs=1,
    batch_size=64,
    validation_data=(val_X, val_y),
    callbacks=[early_stopping],
)

### Evaluate test set

In [None]:
# Model outputs for the held-out test set, plus the true labels as a plain list.
prediction = model.predict(test_X)
labels = list(test_y)

In [None]:
# precision_recall_curve returns precision and recall with one more entry than
# thresholds (the final point has precision 1, recall 0 and no threshold).
# Pad thresholds with NaN explicitly instead of relying on how pandas handles
# ragged records.
precision, recall, thresholds = precision_recall_curve(labels, np.ravel(prediction))
precision_recall = pd.DataFrame({
    'Precision': precision,
    'Recall': recall,
    'Thresholds': np.append(thresholds, np.nan),
})

# F1 is defined as 0 where precision + recall is 0 (otherwise 0/0 gives NaN).
pr_sum = precision_recall.Precision + precision_recall.Recall
precision_recall["f1_score"] = (
    2 * (precision_recall.Precision * precision_recall.Recall) / pr_sum
).where(pr_sum > 0, 0.0)

# Find best model by max. f1-score. Series.max() skips NaN (the built-in max()
# does not), and the fraction is scaled to match the percent sign in the message.
print("Max f1-Score: %.2f%%" % (100 * precision_recall["f1_score"].max()))

In [None]:
# For each target precision, report the lowest threshold whose precision exceeds it.
thresholds_file = cwd/"output/qc_contact_request_deletion_reason_thresholds.txt"
# The output directory may not exist yet; create it so open() cannot fail on it.
thresholds_file.parent.mkdir(parents=True, exist_ok=True)
with open(thresholds_file, "w") as file_thresholds:
    precisions_thresholds = [0.8, 0.85, 0.9, 0.95]
    for p in precisions_thresholds:
        # Series.min() skips the NaN threshold of the curve's final point.
        t = precision_recall.loc[precision_recall['Precision'] > p, "Thresholds"].min()
        if pd.isna(t):
            msg = "No threshold reaches precision > %.2f" % p
        else:
            # Both values are fractions in [0, 1], so no percent sign is used.
            msg = "Threshold with precision > %.2f: %.4f" % (p, t)
        print(msg)
        # One message per line in the output file.
        file_thresholds.write(msg + "\n")

In [None]:
def _metric_trace(column, color, name):
    """Line trace of one `precision_recall` column against the threshold column."""
    return go.Scatter(
        x=precision_recall.Thresholds,
        y=precision_recall[column],
        marker=dict(size=5, line=dict(width=1), color=color),
        mode="lines",
        name=name,
    )


trace1 = _metric_trace("Precision", "blue", 'Precision')
trace2 = _metric_trace("Recall", "orange", 'Recall')
trace3 = _metric_trace("f1_score", "green", 'f1_score')

# Precision, recall and F1 drawn against the threshold in a single figure.
data1 = [trace1, trace2, trace3]
layout1 = go.Layout(
    title="Threshold vs P, R, F1",
    xaxis={'title': 'Threshold'},
    yaxis={'title': 'Threshold Tradeoff'},
)
figure1 = go.Figure(data=data1, layout=layout1)
iplot(figure1)

In [None]:
# Overlaid histograms of the predicted probability, one per true label.
predLabel = pd.DataFrame(prediction, columns = ["Probability"]).assign(Label=labels)
# Each trace is named after the label it shows; the names were swapped before,
# so the legend described the wrong class.
trace4 = go.Histogram(
    x=predLabel[predLabel.Label == 0].Probability,
    opacity=1,
    name = "0"
)
trace5 = go.Histogram(
    x=predLabel[predLabel.Label == 1].Probability,
    opacity=0.3,
    name = "1"
)

data45 = [trace4, trace5]
layout45 = go.Layout(barmode='overlay')
figure45 = go.Figure(data=data45, layout=layout45)

iplot(figure45, filename='probability overlaid histogram')

In [None]:
# Confusion matrix of the test set at a fixed decision threshold.
threshold = 0.9
y_true = list(test_y)
# Strictly-greater comparison, encoded as 1 (positive) / 0 (negative).
y_pred = np.where(np.ravel(prediction) > threshold, 1, 0).tolist()
confusion_matrix(y_true, y_pred)

In [None]:
# `prediction` is ordered like the rows of `test`, but `test` still carries the
# index it had in `data`. Joining on a fresh 0..n-1 index therefore dropped rows
# and paired the remaining ones with the wrong prediction. Attach by position.
assert len(prediction) == len(test), "expected one prediction per test row"
predictionDF = pd.DataFrame({'prediction': prediction[:, 0]}, index=test.index)
predicted_test = test.assign(prediction=predictionDF['prediction'].values)
# Predicted class at the chosen threshold (strictly greater -> 1, else 0).
predicted_test['y_pred'] = (predicted_test['prediction'] > threshold).astype(int)

In [None]:
# False negatives: predicted 0 although the true target is 1.
predicted_test.loc[predicted_test['y_pred'].eq(0) & predicted_test['target'].eq(1)]

In [None]:
# False positives: predicted 1 although the true target is 0.
predicted_test.loc[predicted_test['y_pred'].eq(1) & predicted_test['target'].eq(0)]

### Save and package

In [None]:
# Create the output directory first: nothing earlier in the notebook creates
# it, and a missing directory would make the save fail after training.
target_dir.mkdir(parents=True, exist_ok=True)

# save model
modelfile = target_dir/"qc_model.h5"
model.save(modelfile)

# save tokenizer for model (pickle and JSON representations)
with open(target_dir/'qc_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(target_dir/'qc_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())

# save meta parameters (the padding / truncating options used in preprocessing)
with open(target_dir/'qc_meta_parameters.json', 'w', encoding='utf-8') as f:
    json.dump(meta_parameters, f)