## Imports and constants

In [None]:
import requests
import pandas as pd
import numpy as np
import re
import json
import multiprocessing
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import (
    confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
)

from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn import model_selection, linear_model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec, KeyedVectors 

DFS_PATH = '../data/training_set.csv'

SIZE = 100
WINDOW = 10
MIN_COUNT = 1
cores = multiprocessing.cpu_count()

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

## Read dataset

In [None]:
# Load the raw training set and report how many rows it holds before cleaning.
df = pd.read_csv(DFS_PATH)
print(len(df))
# Keep only the text and target columns and drop incomplete rows. The target
# is selected as 'label' here but every later cell reads it as 'vote', so it
# is renamed once at load time to keep the rest of the notebook working.
df = (df[['content', 'label']]
      .dropna()
      .reset_index(drop=True)
      .rename(columns={'label': 'vote'}))

In [None]:
len(df)

In [None]:
def transform_vectors_mean(corpus, model, size):
    """Average the word vectors of every tokenised sentence in *corpus*.

    Args:
        corpus: iterable of sentences, each an iterable of tokens.
        model: object mapping a token to its vector. If it has a ``wv``
            attribute (as gensim ``Word2Vec`` models do) the lookup goes
            through it; otherwise *model* itself is indexed.
        size: dimensionality of the word vectors.

    Returns:
        A list with one vector per sentence. Tokens missing from the model
        contribute a zero vector to the mean, and a sentence without any
        token yields a zero vector of length *size* (instead of the scalar
        NaN that ``np.mean`` returns for an empty list).
    """
    # Indexing a Word2Vec model directly is deprecated in gensim; prefer its
    # `.wv` keyed vectors while still accepting plain mappings.
    vectors = getattr(model, 'wv', model)
    result = []

    for sent in corpus:
        word_vectors = []
        for w in sent:
            try:
                word_vectors.append(vectors[w])
            except KeyError:
                word_vectors.append(np.zeros(size))

        if word_vectors:
            result.append(np.mean(word_vectors, axis=0))
        else:
            # Keep every row the same shape so the output can be stacked
            # into a feature matrix.
            result.append(np.zeros(size))

    return result


def read_w2v_corpus(df):
    """Tokenise every document in *df* with ``gensim.utils.simple_preprocess``.

    Args:
        df: iterable of raw text documents (list, NumPy array or pandas
            Series).

    Returns:
        A list holding one token list per document, in iteration order.
    """
    # Iterate over the values directly: `df[i]` is a label lookup on a pandas
    # Series and fails (or picks the wrong row) when the index is not 0..n-1.
    return [gensim.utils.simple_preprocess(doc) for doc in df]

def clean_text(text):
    """Normalise a raw string for the TF-IDF models.

    Surrounding whitespace is trimmed, line breaks become spaces, the
    punctuation characters listed in the pattern are removed and the
    result is lower-cased.
    """
    flattened = text.strip()
    for line_break in ("\n", "\r"):
        flattened = flattened.replace(line_break, " ")
    without_punctuation = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]', "", flattened)
    return without_punctuation.lower()

## Train/Test split

In [None]:
# Hold out 10% of the rows as the test set; the fixed random_state makes the
# split repeatable. Features are the raw 'content' strings and targets are
# read from the 'vote' column, which df must therefore expose.
X_train, X_test, y_train, y_test = train_test_split(df['content'], 
                                                    df['vote'], 
                                                    test_size=0.1, 
                                                    random_state=42)



In [None]:
training = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)

In [None]:
testing = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

## DeepMoji

In [None]:
from deepmoji.model_def import deepmoji_emojis
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
from deepmoji.sentence_tokenizer import SentenceTokenizer

# Load the vocabulary JSON that ships with DeepMoji.
with open(VOCAB_PATH, 'r') as f:
    VOCAB = json.load(f)

# Length passed to both the tokenizer and the model below.
MAXLEN=30

# Tokenizer and pretrained emoji-prediction model built from the same MAXLEN.
# NOTE(review): `model` is rebound to a Word2Vec model in a later cell, so the
# DeepMoji predictions have to be computed before that cell runs.
st = SentenceTokenizer(VOCAB, MAXLEN)
model = deepmoji_emojis(MAXLEN, PRETRAINED_PATH)

In [None]:
tokenized, _, _ = st.tokenize_sentences(training['content'])

In [None]:
predictions_emojis = model.predict(tokenized)

In [None]:
len(predictions_emojis)

In [None]:
training['deepmoji'] = list(predictions_emojis)

In [None]:
tokenized, _, _ = st.tokenize_sentences(testing['content'])
predictions_emojis = model.predict(tokenized)
test_deepmoji = list(predictions_emojis)

## Clean Text

In [None]:
# Add a normalised copy of the raw text to both frames; it feeds the TF-IDF
# based pipelines further down.
training['clean_text'] = list(map(clean_text, training['content'].values))
testing['clean_text'] = list(map(clean_text, testing['content'].values))

## W2V

In [None]:
training['content'] = read_w2v_corpus(training['content'])
X_test = read_w2v_corpus(X_test.values)

In [None]:
# Build a Word2Vec model on the tokenised training documents using the
# SIZE / WINDOW / MIN_COUNT constants and one worker per CPU core.
# NOTE(review): this rebinds `model`, which previously held the DeepMoji model.
# NOTE(review): the `size=` keyword matches gensim < 4.0 (later releases call
# it `vector_size`) - confirm the pinned gensim version.
model = Word2Vec(training['content'], 
                 size=SIZE, 
                 window=WINDOW,
                 min_count=MIN_COUNT,
                 workers=cores)

# Continue training for 250 epochs over the same corpus.
# NOTE(review): the constructor above already receives the sentences, so this
# presumably trains on top of an initial pass - confirm that is intended.
model.train(training['content'], total_examples=model.corpus_count, epochs=250)

In [None]:
training['w2v'] = transform_vectors_mean(training['content'], model, SIZE)
X_test = transform_vectors_mean(X_test, model, SIZE)

In [None]:
# Collect the rows whose 'w2v' entry is not a vector. Averaging an empty token
# list yields a scalar (NaN) rather than an array, and such rows cannot be
# stacked into a feature matrix. The scalar case is tested explicitly instead
# of relying on a bare `except`, which would also hide unrelated errors.
t = [row_label
     for row_label, vector in zip(training.index, training['w2v'])
     if np.ndim(vector) == 0]

training = training.drop(t).reset_index(drop=True)

In [None]:
train = training['w2v']
train_label = training['vote']

In [None]:
len(train_label)

## Classification Text

In [None]:
# Linear model trained with SGD. The `n_iter` argument is omitted: it was
# deprecated in scikit-learn 0.19 and removed in 0.21 (max_iter / tol replace
# it), and its old default was already None.
clf = linear_model.SGDClassifier(n_jobs=-1, max_iter=1000, tol=1e-4)
# Wrap the classifier in 3-fold sigmoid calibration. The estimator is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2.
model_calibrated = CalibratedClassifierCV(clf, cv=3, method='sigmoid')


In [None]:
model_calibrated.fit(list(train), train_label)

In [None]:
predictions = model_calibrated.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
confusion_matrix(y_test, predictions)

## Classification Deepmojis

In [None]:
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1, class_weight='balanced')
clf.fit(list(training['deepmoji'].values), train_label)

In [None]:
predictions = clf.predict(test_deepmoji)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
confusion_matrix(y_test, predictions)

## Combine Deepmojis and W2V Features

In [None]:
# One flat feature vector per training row: the averaged Word2Vec vector
# followed by the DeepMoji prediction vector.
train_features = [
    np.concatenate((w2v_vector, emoji_vector), axis=None)
    for w2v_vector, emoji_vector in zip(train, training['deepmoji'])
]

In [None]:
# Same layout as the training features: Word2Vec mean vector followed by the
# DeepMoji prediction vector, one entry per test row.
test_features = [
    np.concatenate((w2v_vector, emoji_vector), axis=None)
    for w2v_vector, emoji_vector in zip(X_test, test_deepmoji)
]

## Classification text + Deepmojis 

In [None]:
# Fresh SGD classifier for the combined Word2Vec + DeepMoji features.
# `n_iter` is omitted: it was deprecated in scikit-learn 0.19 and removed in
# 0.21 (max_iter / tol replace it).
clf = linear_model.SGDClassifier(n_jobs=-1, max_iter=1000, tol=1e-4)
# 3-fold sigmoid calibration; the estimator is positional because its keyword
# changed from `base_estimator` to `estimator` in scikit-learn 1.2.
model_calibrated = CalibratedClassifierCV(clf, cv=3, method='sigmoid')


In [None]:
model_calibrated.fit(list(train_features), train_label)

In [None]:
predictions = model_calibrated.predict(test_features)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
confusion_matrix(y_test, predictions)

## Classification text TF-IDF

In [None]:
def content_column(X):
    """Return the values of the 'clean_text' column of *X* as a plain list."""
    cleaned = X['clean_text']
    return [text for text in cleaned.values]

# SGD classifier without the `n_iter` argument (deprecated in scikit-learn
# 0.19, removed in 0.21; max_iter / tol replace it).
clf = linear_model.SGDClassifier(n_jobs=-1, max_iter=1000, tol=1e-4)
# 3-fold sigmoid calibration; the estimator is positional because its keyword
# changed from `base_estimator` to `estimator` in scikit-learn 1.2.
model_calibrated = CalibratedClassifierCV(clf, cv=3, method='sigmoid')

# Pipeline: pick the cleaned text column -> TF-IDF (at most 20000 terms) ->
# calibrated linear classifier.
tfidf_model = make_pipeline(FunctionTransformer(content_column, validate=False),
                      TfidfVectorizer(min_df=0., max_df=1., use_idf=True, max_features=20000),
                      model_calibrated)

In [None]:
tfidf_model.fit(training, training['vote'].values)

In [None]:
predictions = tfidf_model.predict(testing)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
confusion_matrix(y_test, predictions)

## Classification text LSA

In [None]:
svd_model = TruncatedSVD(n_components=500, 
                         algorithm='randomized',
                         n_iter=10, 
                         random_state=42)

In [None]:
# SGD classifier without the `n_iter` argument (deprecated in scikit-learn
# 0.19, removed in 0.21; max_iter / tol replace it).
clf = linear_model.SGDClassifier(n_jobs=-1, max_iter=1000, tol=1e-4)
# 3-fold sigmoid calibration; the estimator is positional because its keyword
# changed from `base_estimator` to `estimator` in scikit-learn 1.2.
model_calibrated = CalibratedClassifierCV(clf, cv=3, method='sigmoid')

# Pipeline: cleaned text -> TF-IDF (at most 20000 terms) -> truncated SVD
# (`svd_model`) -> calibrated linear classifier.
lsa_model = make_pipeline(FunctionTransformer(content_column, validate=False),
                      TfidfVectorizer(min_df=0., max_df=1., use_idf=True, max_features=20000),
                      svd_model,
                      model_calibrated)

In [None]:
lsa_model.fit(training, training['vote'].values)

In [None]:
predictions = lsa_model.predict(testing)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
confusion_matrix(y_test, predictions)

## Stacking

In [None]:
def deepmoji_column(X):
    """Return the values of the 'deepmoji' column of *X* as a plain list."""
    emoji_features = X['deepmoji']
    return [row for row in emoji_features.values]

def w2v_feature(X):
    """Return the values of the 'w2v' column of *X* as a plain list."""
    averaged_vectors = X['w2v'].values
    return list(averaged_vectors)

def content_column(X):
    """Return the values of the 'clean_text' column of *X* as a plain list."""
    texts = X['clean_text'].values
    return [entry for entry in texts]

In [None]:
testing['w2v'] = X_test
testing['deepmoji'] = test_deepmoji

In [None]:
# SGD classifier without the `n_iter` argument (deprecated in scikit-learn
# 0.19, removed in 0.21; max_iter / tol replace it).
clf = linear_model.SGDClassifier(n_jobs=-1, max_iter=1000, tol=1e-4)
# 3-fold sigmoid calibration; the estimator is positional because its keyword
# changed from `base_estimator` to `estimator` in scikit-learn 1.2.
model_calibrated = CalibratedClassifierCV(clf, cv=3, method='sigmoid')

# Each base pipeline first selects its own feature column from the frame.
# Random forest on the DeepMoji prediction vectors.
pipe1 = make_pipeline(FunctionTransformer(deepmoji_column, validate=False),
                      RandomForestClassifier(n_estimators=250, n_jobs=-1))

# Random forest on the averaged Word2Vec vectors.
pipe2 = make_pipeline(FunctionTransformer(w2v_feature, validate=False),
                      RandomForestClassifier(n_estimators=500, n_jobs=-1))

# Calibrated linear model on the averaged Word2Vec vectors.
pipe3 = make_pipeline(FunctionTransformer(w2v_feature, validate=False),
                      model_calibrated)

# Gaussian naive Bayes on the averaged Word2Vec vectors.
pipe4 = make_pipeline(FunctionTransformer(w2v_feature, validate=False),
                      GaussianNB())

# NOTE(review): pipe3, tfidf_model and lsa_model all hold the same
# `model_calibrated` object; this presumably relies on the stacking classifier
# cloning its base classifiers before fitting - confirm for the installed
# mlxtend version.
tfidf_model = make_pipeline(FunctionTransformer(content_column, validate=False),
                      TfidfVectorizer(min_df=0., max_df=1., use_idf=True, max_features=20000),
                      model_calibrated)

lsa_model = make_pipeline(FunctionTransformer(content_column, validate=False),
                      TfidfVectorizer(min_df=0., max_df=1., use_idf=True, max_features=20000),
                      svd_model,
                      model_calibrated)

# Stack the six base models: their class probabilities (kept separate rather
# than averaged) are the inputs of a class-balanced random forest.
sclf = StackingClassifier(classifiers=[pipe1, pipe2, pipe3, pipe4, lsa_model, tfidf_model],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=RandomForestClassifier(n_estimators=50, n_jobs=-1, class_weight='balanced'))

In [None]:
sclf.fit(training, training['vote'].values)

In [None]:
predictions = sclf.predict(testing)

In [None]:
accuracy_score(testing['vote'].values, predictions)

In [None]:
f1_score(y_test, predictions, average=None)

In [None]:
precision_score(y_test, predictions, average=None) 

In [None]:
recall_score(y_test, predictions, average=None)

In [None]:
confusion_matrix(y_test, predictions)

## Simple NN & TF-IDF

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from sklearn.utils import class_weight
from sklearn import preprocessing
from keras.models import Model, load_model
from keras.layers import (
    Input, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, GlobalMaxPool1D, 
    Embedding, Bidirectional, GlobalMaxPooling1D, Dropout
)
from keras.optimizers import Adam
from keras import regularizers
from keras.utils import multi_gpu_model
from keras.utils.np_utils import to_categorical
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Reload the raw training set for the neural-network experiment.
df = pd.read_csv(DFS_PATH)
print(len(df))
# Keep the text and target columns, drop incomplete rows, and expose the
# target as 'vote', the name the split below reads.
df = (df[['content', 'label']]
      .dropna()
      .reset_index(drop=True)
      .rename(columns={'label': 'vote'}))

In [None]:
# One-hot encode the targets. The network below is compiled with
# categorical_crossentropy and sizes its output layer from `labels.shape[1]`,
# so it needs a 2-D one-hot matrix named `labels`, which no cell defined.
# NOTE(review): reconstructed from the otherwise unused `preprocessing` and
# `to_categorical` imports - confirm this matches the intended encoding.
label_encoder = preprocessing.LabelEncoder()
labels = to_categorical(label_encoder.fit_transform(df['vote']))

# 90% train / 10% held out, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(df['content'],
                                                    labels,
                                                    test_size=0.1,
                                                    random_state=42)

# Split the held-out 10% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(X_test,
                                                y_test,
                                                test_size=0.5,
                                                random_state=42)


In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# validation and test texts. X_train / X_val / X_test are already Series of
# documents (the split was made on df['content']), so they are passed
# directly; indexing them with ['content'] would be a label lookup on the
# Series and raise KeyError.
tf = TfidfVectorizer(min_df=0., max_df=1., use_idf=True, max_features=20000)
norm_corpus_matrix_train = tf.fit_transform(X_train)
norm_corpus_matrix_val   = tf.transform(X_val)
norm_corpus_matrix_test  = tf.transform(X_test)

# Convert the sparse matrices to dense arrays for the Keras model.
norm_corpus_matrix_train = norm_corpus_matrix_train.toarray()
norm_corpus_matrix_val = norm_corpus_matrix_val.toarray()
norm_corpus_matrix_test = norm_corpus_matrix_test.toarray()

In [None]:
# The network is fed the dense TF-IDF matrix, so the input width is the number
# of TF-IDF features and the values are floats. The previous definition used
# an undefined MAX_SEQUENCE_LENGTH and an int32 input.
n_features = norm_corpus_matrix_train.shape[1]
sequence_input = Input(shape=(n_features,), dtype='float32')
d1 = Dense(100)(sequence_input)
d2 = Dense(50)(d1)
# One softmax unit per class (`labels` is the one-hot target matrix).
preds = Dense(labels.shape[1], activation='softmax')(d2)

model = Model(sequence_input, preds)
# summary() prints the table itself and returns None, so its result is not
# printed again.
model.summary()

In [None]:
# Capture the model summary as a single string, one summary line per text
# line. Joining with '' (not ' ') avoids a stray leading space on every line
# after the first.
summary_lines = []
model.summary(print_fn=summary_lines.append)
model_summary = ''.join(line + '\n' for line in summary_lines)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['acc'])

In [None]:
import os

filepath = "models/nn.best.hdf5"
# Make sure the checkpoint directory exists before training starts, so the
# first save cannot fail on a missing folder.
checkpoint_dir = os.path.dirname(filepath)
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Keep only the weights with the best validation accuracy seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Stop as soon as the validation loss fails to improve (patience=0).
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')

callbacks_list = [checkpoint, early_stopping]

In [None]:
model.fit(norm_corpus_matrix_train,
          y_train,
          batch_size=1024,
          epochs=20,
          callbacks=callbacks_list,
          validation_data=(norm_corpus_matrix_val, y_val))

In [None]:
#load best model
model.load_weights(filepath)

score = model.evaluate(norm_corpus_matrix_test, y_test, batch_size=1024)

print('Test loss:', score[0])
print('Test accuracy:', score[1])