In [None]:
import warnings
# warnings.filterwarnings('ignore')      # never show warnings _> BIG CHANGE
warnings.filterwarnings(action='once') # show warnings once and never again

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras import optimizers
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, LSTM, CuDNNLSTM
from keras.callbacks import ModelCheckpoint, TensorBoard

import tensorflow as tf

import pyarrow.parquet as pyparquet
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
tqdm.pandas()
import math
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

pd.set_option('display.max_colwidth', -1)

# save and load tokenizer
import pickle 
import json

import nltk
from nltk.corpus import stopwords

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Importing the graph_objs module which contains plotting objects
import plotly.graph_objs as go
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
import matplotlib.pyplot as plt

In [None]:
import os
# Show every file found under the local ``data`` directory, one path per line,
# so the inputs read further down can be checked by eye.
data_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('data')
    for name in names
)
for path in data_file_paths:
    print(path)


In [None]:
# Find threshold for precision
# NOTE(review): `t` is a hard-coded placeholder here, so this cell looks like a
# dry run of the report-writing code rather than a real result — confirm
# whether it is still needed.
with open("docker_save_data.txt", "w") as file_thresholds:
    precisions_thresholds = [0.8, 0.85, 0.9, 0.95]
    for p in precisions_thresholds:
        t = 3333
        msg = "Threshold with precision > %.2f%%: %.2f%%" % (p, t)
        print(msg)
        # Terminate each record; without the newline all four messages were
        # written back-to-back on a single line of the file.
        file_thresholds.write(msg + "\n")

### Set Parameters

In [None]:
# Vocabulary cap: used as num_words for the Tokenizer and as the input
# dimension of the Embedding layer.
max_features = 10000

# Width of each word vector handed to the Embedding layer.
# NOTE(review): the name is misspelled ("vecor"); it is kept as-is because
# other cells refer to it.
embedding_vecor_length = 300


### Prepare train and test set

In [None]:
# Parquet inputs: one file of positive examples, one of negative examples.
posfile = 'data/qc-deletionreason-contactrequest.snappy.parquet'
negfile = 'data/qc-notdeleted.snappy.parquet'

# Only these three columns are used by the rest of the notebook.
wanted_columns = ["label", "decoded_title", "decoded_body"]
pos = pyparquet.read_table(posfile).to_pandas()[wanted_columns]
neg = pyparquet.read_table(negfile).to_pandas()[wanted_columns]

for caption, frame in (("Pos", pos), ("Neg", neg)):
    print("{} dataset: {}".format(caption, len(frame)))

In [None]:
# Reduce pos and neg to at most N rows each.
N = 10000.0

# Ask for an explicit row count capped at the frame size. The previous
# `sample(frac=N/len(df))` raised ValueError whenever a frame held fewer than N
# rows (frac > 1 without replacement) and divided by zero on an empty frame.
pos = pos.sample(n=min(int(N), len(pos))).reset_index(drop=True)
neg = neg.sample(n=min(int(N), len(neg))).reset_index(drop=True)

print("Pos dataset: {}".format(len(pos)))
print("Neg dataset: {}".format(len(neg)))

In [None]:
# Preview the first rows of the positive set.
pos.head()

In [None]:
# Preview the first rows of the negative set.
neg.head()

In [None]:
# Hold out 20% of each class separately (negatives first, then positives) so
# both splits keep the same class ratio as the sampled data.
neg_train, neg_test = train_test_split(neg, test_size=0.2)
pos_train, pos_test = train_test_split(pos, test_size=0.2)

# # upsample legit questions in training dataset
# legit_train = pd.concat([legit_train]*2, ignore_index=True) # Ignores the index

for caption, part in (
    ("Training dataset pos", pos_train),
    ("Testing dataset pos", pos_test),
    ("Training dataset neg", neg_train),
    ("Testing dataset neg", neg_test),
):
    print("{}: {}".format(caption, len(part)))

# Concatenate the positive and negative parts of each split, then shuffle the
# rows (sample(frac=1)) and renumber the index from zero.
train_parts = pd.concat([pos_train, neg_train])
train = train_parts.sample(frac=1).reset_index(drop=True)
test_parts = pd.concat([pos_test, neg_test])
test = test_parts.sample(frac=1).reset_index(drop=True)

for caption, frame in (
    ("Training dataset", train),
    ("Testing dataset", test),
    ("Pos dataset", pos),
    ("Neg dataset", neg),
):
    print("{}: {}".format(caption, len(frame)))

In [None]:
# Build the model input text as "<title> <body>" for both splits.
# NOTE(review): map(str) renders missing values as the literal text "nan" /
# "None", so question_text itself never ends up missing — confirm that is the
# intended handling of empty titles/bodies.
for split_frame in (train, test):
    title_text = split_frame['decoded_title'].map(str)
    body_text = split_frame['decoded_body'].map(str)
    split_frame['question_text'] = title_text + " " + body_text


In [None]:
# Summary statistics of the number of whitespace-separated tokens per training question.
train.question_text.str.split().str.len().describe()

In [None]:
# Summary statistics of the number of whitespace-separated tokens per test question.
test.question_text.str.split().str.len().describe()

- set maxlen parameter

In [None]:
# Fixed sequence length: used as maxlen for pad_sequences and as input_length
# of the Embedding layer.
maxlen = 100

In [None]:
## Raw text of each split, with missing entries replaced by a placeholder
train_text = train["question_text"].fillna("_na_").values
test_text = test["question_text"].fillna("_na_").values

## Vocabulary is fitted on the training text only; no character filtering and
## no lower-casing are applied by the tokenizer.
tokenizer = Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts(list(train_text))

## Convert text to id sequences, then pad/truncate at the end to maxlen ids
pad_options = dict(maxlen=maxlen, padding='post', truncating='post')
train_X = pad_sequences(tokenizer.texts_to_sequences(train_text), **pad_options)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_text), **pad_options)

## Targets are the strings "0" (label "__label__legit") and "1" (any other label)
for split_frame in (train, test):
    split_frame["target"] = split_frame["label"].map(
        lambda lbl: "0" if lbl == "__label__legit" else "1"
    )

train_y = train['target']
test_y = test['target']

### Load embedding

In [None]:
# comment it because of lower word coverage (v4)
# EMBEDDING_FILE = '../input/gf-embeddings-v2/gfmodel_v6.vec' --> memory error
EMBEDDING_FILE = '../data/gfmodel_v4.vec'

# word -> float32 vector of 300 coefficients
embeddings_index = {}

# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding keeps decoding independent of the machine's locale.
# NOTE(review): assumes the .vec file is UTF-8 text — confirm.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split(" ")
        # Only lines that split into exactly 302 fields are used: field 0 is the
        # word and fields 1..300 are the coefficients (field 301 is presumably
        # the line ending — verify). Everything else is skipped silently.
        if len(values) == 302:
            word = values[0]
            coefs = np.asarray(values[1:301], dtype='float32')
            embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
## embedding setup

# np.stack requires a real sequence; a dict view is a non-sequence iterable,
# which newer NumPy releases warn about and then reject.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index

# The matrix is passed as the initial weights of Embedding(max_features, ...)
# in model_init, so it must always have exactly max_features rows. Sizing it by
# len(word_index) produced a too-small matrix (IndexError below, or a weight
# shape mismatch in the model) whenever the vocabulary had fewer than
# max_features words.
nb_words = max_features

# Rows without a pre-trained vector keep a random draw that matches the mean
# and standard deviation of the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

### Model

- The number of memory cells is set to less than 80-400, based on ([link](https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm))

In [None]:
# create the model
from keras.callbacks import EarlyStopping

def model_init():
    """Build and compile a fresh binary text classifier.

    The network is: Embedding (initialised from ``embedding_matrix``) ->
    Dropout(0.8) -> LSTM(100) -> Dropout(0.8) -> single sigmoid unit.
    It reads the module-level ``max_features``, ``embedding_vecor_length``,
    ``maxlen`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    embedding_layer = Embedding(
        max_features,
        embedding_vecor_length,
        input_length=maxlen,
        weights=[embedding_matrix],
    )
    layer_stack = [
        embedding_layer,
        Dropout(0.8),
        LSTM(100),
        Dropout(0.8),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Exploratory run: up to 10 epochs, holding out 10% of the training data for
# validation, stopping once val_loss has not improved for 3 epochs, and logging
# to the "logs" directory for TensorBoard.
model = model_init()
tensorboard_callback = TensorBoard("logs")
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(
    train_X,
    train_y,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, tensorboard_callback],
)

In [None]:
# Per-epoch loss on the training data and on the validation split held out by
# fit() (the latter is the curve labelled 'test').
loss_by_epoch = history.history
plt.title('Loss')
plt.plot(loss_by_epoch['loss'], label='train')
plt.plot(loss_by_epoch['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
# Per-epoch accuracy on the training data and on the validation split held out
# by fit() (the latter is the curve labelled 'test').
# Depending on the Keras release the metric is recorded as 'acc'/'val_acc' or
# as 'accuracy'/'val_accuracy'; accept either instead of failing with KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='test')
plt.legend()
plt.show();

The plots suggest that the model is overfitting slightly. More data may help, but training for more epochs on the current data will not.

In [None]:
# Load the extension and start TensorBoard

# %load_ext tensorboard.notebook
# %tensorboard --logdir logs
# %reload_ext tensorboard.notebook

In [None]:
# Retrain from scratch for a fixed, small number of epochs; epoch_num is reused
# later when the model is trained on all data.
epoch_num = 3

model = model_init()

# NOTE(review): with epochs equal to the early-stopping patience (3) the
# callback does not appear able to trigger before training ends — confirm.
stop_on_plateau = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(
    train_X,
    train_y,
    epochs=epoch_num,
    batch_size=64,
    validation_split=0.1,
    callbacks=[stop_on_plateau, tensorboard_callback],
)

### Evaluate test set

In [None]:
# Score the trained model on the held-out test split. The second entry of the
# result is the accuracy metric the model was compiled with.
scores = model.evaluate(test_X, test_y, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))

In [None]:
# Predicted probabilities for the test split, plus the true targets converted
# from the strings "1"/"0" to the integers 1/0.
prediction = model.predict(test_X)
labels = [int(target == "1") for target in test_y]


In [None]:
# precision_recall_curve yields three arrays (precision, recall, thresholds);
# they are loaded as three records and transposed into one column per array.
curve_arrays = precision_recall_curve(labels, prediction)
precision_recall = pd.DataFrame.from_records(curve_arrays).T
precision_recall.columns = ['Precision', 'Recall', 'Thresholds']

# F1 at each threshold: 2 * P * R / (P + R)
pr_product = precision_recall.Precision * precision_recall.Recall
pr_sum = precision_recall.Precision + precision_recall.Recall
precision_recall["f1_score"] = 2 * pr_product / pr_sum

In [None]:
# Find best model by max. f1-score
# Series.max() skips NaN entries (F1 is 0/0 wherever precision and recall are
# both zero), whereas the builtin max() gives an order-dependent answer when a
# NaN is present. The score is a fraction in [0, 1], so it is scaled by 100 to
# agree with the "%" suffix.
print("Max f1-Score: %.2f%%" % (precision_recall["f1_score"].max() * 100))


In [None]:
# Find threshold for precision
# For each target precision, report the lowest decision threshold among the
# curve points whose precision exceeds it, and save the report to a text file.
# NOTE(review): both numbers are fractions in [0, 1] but are printed with a "%"
# suffix; the message text is left unchanged in case something parses this
# file — confirm and fix the wording if nothing does.
with open("qc_contact_request_deletion_reason_thresholds.txt", "w") as file_thresholds:
    precisions_thresholds = [0.8, 0.85, 0.9, 0.95]
    for p in precisions_thresholds:
        above_target = precision_recall['Precision'] > p
        # Series.min() ignores the NaN threshold of the final curve point; the
        # builtin min() is order-dependent when a NaN is present.
        t = precision_recall.loc[above_target, "Thresholds"].min()
        msg = "Threshold with precision > %.2f%%: %.2f%%" % (p, t)
        print(msg)
        # One record per line; previously all messages ran together on a
        # single line of the file.
        file_thresholds.write(msg + "\n")

In [None]:
# # print precision, recall, f1-score curve
# # precision_recall.iplot(x='Thresholds', xTitle='Decision Threshold',yTitle='Score', title='Precision/Recall Tradeoff')
# precision_recall.plot(x='Thresholds')


def _threshold_trace(values, color, name):
    """Return a line trace of one metric column against the decision threshold."""
    return go.Scatter(
        x=precision_recall.Thresholds,
        y=values,
        marker=dict(size=5, line=dict(width=1), color=color),
        mode="lines",
        name=name,
    )


# One line per metric, all sharing the threshold axis.
trace1 = _threshold_trace(precision_recall.Precision, "blue", 'Precision')
trace2 = _threshold_trace(precision_recall.Recall, "orange", 'Recall')
trace3 = _threshold_trace(precision_recall.f1_score, "green", 'f1_score')

# A plain list of traces replaces the deprecated go.Data wrapper.
data1 = [trace1, trace2, trace3]
layout1 = go.Layout(title="Threshold vs P, R, F1", xaxis={'title': 'Threshold'}, yaxis={'title': 'Threshold Tradeoff'})
figure1 = go.Figure(data=data1, layout=layout1)
iplot(figure1)

In [None]:
# Overlaid histograms of the predicted probability, one per true class.
predLabel = pd.DataFrame(prediction, columns=["Probability"]).assign(Label=labels)

# The legend names now match the class each histogram is filtered on; they
# were swapped before (the Label == 0 rows were shown as "1" and vice versa).
trace4 = go.Histogram(
    x=predLabel[predLabel.Label == 0].Probability,
    opacity=1,
    name="0"
)
trace5 = go.Histogram(
    x=predLabel[predLabel.Label == 1].Probability,
    opacity=0.3,
    name="1"
)

# A plain list of traces replaces the deprecated go.Data wrapper.
data45 = [trace4, trace5]
layout45 = go.Layout(barmode='overlay')
figure45 = go.Figure(data=data45, layout=layout45)

iplot(figure45, filename='probability overlaid histogram')


In [None]:
# Confusion matrix on the test split at a fixed decision threshold: a row is
# predicted positive only when its probability is strictly above the threshold.
threshold = 0.9
y_true = [int(target == "1") for target in test_y]
y_pred = (prediction[:, 0] > threshold).astype(int).tolist()
confusion_matrix(y_true, y_pred)


In [None]:
# Attach each test row's predicted probability (joined on the row index) and
# the 0/1 decision obtained by comparing it with `threshold`.
predictionDF = pd.DataFrame(prediction[:, 0], columns=['prediction'])
predicted_test = test.join(predictionDF, how='inner')
predicted_test['y_pred'] = predicted_test['prediction'].progress_apply(
    lambda score: 1 if score > threshold else 0
)

In [None]:
# false negatives: rows whose target is '1' but which were predicted 0
is_false_negative = (predicted_test['target'] == '1') & (predicted_test['y_pred'] == 0)
predicted_test[is_false_negative]


In [None]:
# false positives: rows whose target is '0' but which were predicted 1
is_false_positive = (predicted_test['target'] == '0') & (predicted_test['y_pred'] == 1)
predicted_test[is_false_positive]

### Evaluate on sample from "allquestion" dataset

### Train model on all data

In [None]:
# concatenate positive and negative samples, then shuffle the rows
data = pd.concat([pos, neg]).sample(frac=1).reset_index(drop=True)

data['question_text'] = data['decoded_title'].map(str) + " " + data['decoded_body'].map(str)

## fill up the missing values
data_X = data["question_text"].fillna("_na_").values

## Tokenize the sentences (vocabulary is re-fitted on all data)
tokenizer = Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts(list(data_X))
data_X = tokenizer.texts_to_sequences(data_X)

## Pad the sentences
data_X = pad_sequences(data_X, maxlen=maxlen, padding='post', truncating='post')

## Rebuild the embedding matrix for the re-fitted vocabulary.
## Re-fitting the tokenizer assigns new word ids, but embedding_matrix was
## still indexed by the ids of the train-only tokenizer, so model_init() would
## have initialised the final model with vectors belonging to other words.
word_index = tokenizer.word_index
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Targets are the strings "0" (label "__label__legit") and "1" (any other label)
data["target"] = ["0" if x == "__label__legit" else "1" for x in data["label"]]

data_y = data['target']


In [None]:
# Persist the fitted tokenizer next to the model in two formats: a pickle of
# the Python object and the tokenizer's own JSON export.
tokenizer_pickle_bytes = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open('qc_contact_request_deletion_reason_tokenizer.pickle', 'wb') as pickle_out:
    pickle_out.write(tokenizer_pickle_bytes)

tokenizer_json_text = tokenizer.to_json()
with open('qc_contact_request_deletion_reason_tokenizer.json', 'w', encoding='utf-8') as json_out:
    json_out.write(tokenizer_json_text)


In [None]:
# Train a fresh model on the combined data and save it to disk.
# Note that validation_split=0.1 still holds 10% of the rows out of training.
modelfile = "qc_contact_request_deletion_reason_model.h5"
model = model_init()

model.fit(
    data_X,
    data_y,
    epochs=epoch_num,
    batch_size=64,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

# save model
model.save(modelfile)

In [None]:
import os
import platform
import subprocess


def res_cmd(cmd):
    """Run ``cmd`` through the shell and return its captured stdout.

    The return value is ``str()`` applied to the raw bytes, i.e. the bytes
    *repr* (``"b'...'"``), not decoded text. That format is kept unchanged for
    any existing callers.

    NOTE(review): ``shell=True`` executes the string as-is; never pass a
    command built from untrusted input.
    """
    return str(subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0])


def str_md5sum(abspath_file):
    """Return the MD5 checksum line for a file, in ``md5sum`` output format.

    The digest is computed in-process with ``hashlib`` instead of shelling out
    to ``md5sum``. This removes the dependency on an external binary, avoids
    interpolating the path into a shell command, and no longer breaks on paths
    that contain quote characters.

    Args:
        abspath_file: Path of the file to hash.

    Returns:
        ``"<hex digest>  <path>"`` (two spaces), matching what ``md5sum``
        prints for an ordinary path.

    Raises:
        OSError: If the file cannot be opened or read (the previous
            implementation returned an empty string in that case).
    """
    import hashlib

    digest = hashlib.md5()
    with open(abspath_file, 'rb') as stream:
        # Read in 1 MiB chunks so large model files are not loaded whole.
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return "%s  %s" % (digest.hexdigest(), abspath_file)

# Print the checksum line of the saved model file.
print(str_md5sum(modelfile))   