In [489]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
import functools as ft
from itertools import chain

import tensorflow as tf
import keras as k

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation
from keras.models import Model, Sequential
from keras.layers.recurrent import LSTM
from keras.initializers import Constant

from livelossplot import PlotLossesKeras

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.utils import resample

from yellowbrick.classifier import (ClassBalance, ClassificationReport,
                                    ClassPredictionError, ConfusionMatrix)
from yellowbrick.features.importances import FeatureImportances


from gensim.models.word2vec import Word2Vec
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        title='Normalized confusion matrix'
    else:
        title='Confusion matrix'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
# This class will provide a mean value embedding vectorizer.  That is, for a sentence, it will take each word
# vectorize it and then take the mean of those vectors and return a single vector.
# It defines "fit" and "transform" so it can fit into a python pipeline
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(w2v))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        retVal = []
        for line in X:
            temp = np.array(np.mean([self.word2vec[w] for w in line.split() if w in self.word2vec] 
                                or [np.zeros(self.dim)], axis=0))
            retVal.append(temp)
        return retVal

In [None]:
def openEmbeddings(file):
    embeddings_index = {}
    with open(file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def word2idx(word, word_model):
  return word_model.wv.vocab[word].index
def idx2word(idx, word_model):
  return word_model.wv.index2word[idx]

In [None]:
def modelStats(model):
    pred = model.predict(X_test_k)
    classes = pred.round()# pred.argmax(axis=-1)
    print("Accuracy : "+ str(accuracy_score(y_test_k, classes)))
    print(classification_report(y_test_k, classes, digits=5))  

    cnf_matrix = confusion_matrix(y_test_k, classes)
    print(cnf_matrix)
    plot_confusion_matrix(cnf_matrix, classes=y.unique())

In [None]:
pd.options.display.max_seq_items = 100
pd.options.display.max_columns = 20

In [None]:
TRAIN_SET_PATH = 'federalist.csv'

cols = ['number', 'author', 'text']
target_col = 'author'
data = pd.read_csv(TRAIN_SET_PATH, sep=',')

In [None]:
data = data.mask(data.eq('JAY')).dropna()

In [None]:
classes = data['author'].value_counts(sort=True).index.tolist()
majo = classes[0]
mino = classes[1]
data_maj = data[data[target_col] == majo]
data_min = data[data[target_col] == mino]

len(data_min), len(data_maj)

In [None]:
class_min = len(data_min)
class_max = len(data_maj)
class_min, class_max

In [None]:
class_diff = class_max - class_min 
desired_target = class_min + round(class_diff/2)

# Upsample the minority class
data_min_up = resample(data_min, 
                replace=True,     # sample with replacement
                n_samples=desired_target,    
                random_state=1) # reproducible results


# Upsample the minority class
data_maj_down = resample(data_maj, 
                    replace=True,     # sample with replacement
                    n_samples=desired_target,    
                    random_state=1) # reproducible results
    
# Combine upsampled minority class with downsampled majority class
bal_set = pd.concat([data_min_up, data_maj_down])
 
# Display new class counts
bal_set.author.value_counts()

# use our balanced set
data = bal_set

In [None]:
enc = LabelEncoder()
lab = data[target_col]
enc.fit(lab)

y = pd.Series(enc.transform(lab))

In [None]:
lables_index = dict(zip(lab.unique(), y.unique()))

In [None]:
data = data.drop([target_col], axis=1)
data = data.drop(['number'], axis=1)
data = data.reset_index(drop=True)

In [None]:
data = data.applymap(lambda x: x.lower() if type(x) is str else x)

In [None]:
embeddings_index_glove_100 = openEmbeddings('glove.6B.100d.txt')
embeddings_index_glove_300 = openEmbeddings('glove.6B.300d.txt')

In [None]:
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000

texts = data['text']

# manual tokens, first we split on the periods
sentences = [[sent for sent in d.split('<p>')] for d in texts]

# combine our stentences into a list of lists to be vectorized later
sentences_list = list(chain.from_iterable(sentences))

# w2v training wants a list of list of tokens for each sentence
sentences_train = [l.split() for l in sentences_list]

# keras
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# this will give us a list of lists of our tokens
sequences = tokenizer.texts_to_sequences(texts)

#get our unique words
word_index = tokenizer.word_index

#this will reshape our list of list of tokens into a matrix of <entries> x <maxlen> and pad shorter ones
X_k = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [503]:
tagged_documents = [(TaggedDocument(simple_preprocess(line),[i])) for i,line in enumerate(texts)]

In [510]:
d2v_model = model = Doc2Vec(vector_size=300, min_count=2, epochs=40)
d2v_model.build_vocab(tagged_documents)
d2v = {d: vec for d, vec in zip(d2v_model.wv.index2word, d2v_model.wv.syn0)}

In [535]:
X_doc = [d2v_model.docvecs[i] for i in range(len(d2v_model.docvecs))]
X_doc = pd.DataFrame(data=X_doc)

80

In [None]:
w2v_model = Word2Vec(sentences_train, size=300, window=5, min_count=5, iter=100, workers=-1)
ft_model = FastText(sentences_train, size=300, window=5, min_count=5, iter=100, workers=-1)

In [None]:
w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}
ftm = {w: vec for w, vec in zip(ft_model.wv.index2word, ft_model.wv.syn0)}

In [None]:
w2v['union'].shape, ftm['union'].shape, embeddings_index_glove_100['union'].shape, embeddings_index_glove_300['union'].shape

In [None]:
# create a mean value embedding for each sentences here
sentence_vectors = [MeanEmbeddingVectorizer(d2v).transform(s) for s in sentences]

In [None]:
flat_vectors = [list(chain.from_iterable(vec)) for vec in sentence_vectors]
flat_vectors.sort(key=len)
shortest = len(flat_vectors[0])
trimmed_flat_vectors = [l[:shortest] for l in flat_vectors]
X_m = pd.DataFrame(data=trimmed_flat_vectors)

In [None]:
print(len(sequences), X_k.shape, y.shape)
print(y.unique().shape)
X_m.head(10)

In [None]:
embeddings_index = embeddings_index_glove_300 #can be embeddings_index_glove_300, 100, w2v, ftm
embedding_dim = len(next(iter(embeddings_index.values())))

In [536]:
# keras
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X_k, y, test_size=0.2, random_state=1, stratify=y)

#manual
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y, test_size=0.2, random_state=1, stratify=y)

#doc2vec
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_doc, y, test_size=0.2, random_state=1, stratify=y)

In [None]:
#This is assuming the use of the keras tokenizer to generate word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index[word] if word in embeddings_index else None
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 2, activation='relu')(embedded_sequences)
x = Dropout(0.3)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(X_train_k, y_train_k,
          batch_size=20,
          epochs=50,
          validation_data=(X_test_k, y_test_k),
          callbacks=[PlotLossesKeras()])

score_train = model.evaluate(X_train_k, y_train_k, batch_size=20, verbose=1)
score_test = model.evaluate(X_test_k, y_test_k, batch_size=20, verbose=1)

print(score_train, score_test)

In [None]:
modelStats(model)

In [None]:
pretrained_weights = w2v_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['union', 'constitution', 'life', 'liberty']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) 
                           for similar, dist in w2v_model.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))


In [None]:
# embeddings_index = w2v
# embedding_matrix = np.zeros((MAX_NUM_WORDS, 100))
# for word, index in tokenizer.word_index.items():
#     if index > MAX_NUM_WORDS - 1:
#         break
#     else:
        
#         embedding_vector = embeddings_index.get(word) if (word in embeddings_index) else None
#         if embedding_vector is not None:
#             embedding_matrix[index] = embedding_vector

In [None]:
## create model
model_glove = Sequential()
model_glove.add(Embedding(len(embedding_matrix), embedding_dim, input_length=1000, weights=[embedding_matrix], trainable=False))
#model_glove.add(Dropout(0.2))
#model_glove.add(Conv1D(64, 5, activation='relu'))
#model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model_glove.fit(X_train_k, y_train_k,
          batch_size=20,
          epochs=20,
          validation_data=(X_test_k, y_test_k),
          callbacks=[PlotLossesKeras()])

score_train = model_glove.evaluate(X_train_k, y_train_k, batch_size=20, verbose=1)
score_test = model_glove.evaluate(X_test_k, y_test_k, batch_size=20, verbose=1)

print(score_train, score_test)

In [None]:
modelStats(model_glove)

In [537]:
clf = ExtraTreesClassifier(n_estimators=500, n_jobs=4)

In [538]:
clf.fit(X_train_d, y_train_d)
print(clf.score(X_train_d, y_train_d))
print(clf.score(X_test_d, y_test_d))

1.0
0.625


In [None]:
visualizer = ConfusionMatrix(clf, classes=y.unique())

fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

for label in visualizer.ax.texts:
    label.set_size(12)

visualizer.fit(X_train_k, y_train_k)
visualizer.score(X_test_k, y_test_k)

visualizer.poof()