In case you don't have tokenized books in the `./res` folder,

download this file:
https://drive.google.com/uc?export=download&id=1HZxpWE_T-ZhZyLAB83wO7c7dFR6_xe7T
   
and put its content to the `./res` folder

In [1]:
import sys
sys.path.append('..')

import os
import pickle
import time
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix
import pandas as pd
import gensim
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from IPython.display import display, clear_output
import logging
logger = logging.getLogger().setLevel('WARNING')
from src.utils import pickle_partial, unpickle, load_snippets

import keras
from keras import Sequential, regularizers, Model
from keras.layers import Input, Dense, Dropout, Convolution1D, Embedding, MaxPooling1D, Flatten, Concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords

from matplotlib import pyplot as plt

import collections
import multiprocessing

cores = multiprocessing.cpu_count()

def evaluate_clf(clf_name, results, y_train, y_pred_train, y_test, y_pred_test, training_time):
    f1_score_macro_train = f1_score(y_train, y_pred_train, average='macro')
    accuracy_score_train = accuracy_score(y_train, y_pred_train)
    f1_score_macro_test = f1_score(y_test, y_pred_test, average='macro')
    accuracy_score_test = accuracy_score(y_test, y_pred_test)
    results.loc[clf_name,:] = [f1_score_macro_train, f1_score_macro_test,
                               accuracy_score_train, accuracy_score_test, training_time]
    results.sort_values(['test f1-macro'], ascending=False, inplace=True)
    display(results.loc[[clf_name]])

Using TensorFlow backend.


### Load documents with length 200

In [24]:
vocab_sizes = [1000, 2000, 5000, 10000, 20000, 30000, 40000, 50000]
snippet_length = 200
train_set, test_set, y_train, y_test = load_snippets(doc_size=snippet_length, word_tokenized=False)
classes = sorted(set(y_train))
genre_results = pd.DataFrame(columns=['train f1-macro','test f1-macro','train acc','test acc', 'train time'])

(191363,)
(33771,)


### Create bigrams on train and test set

In [3]:
# you might have to download nltk stop words
stops = set(stopwords.words('english'))

In [4]:
%%time
for t in train_set:
    t.extend(['_'.join(b) for b in nltk.bigrams(t) 
           if b[0] not in stops and b[1] not in stops])

CPU times: user 4.49 s, sys: 13.1 ms, total: 4.5 s
Wall time: 4.51 s


In [5]:
%%time
for t2 in test_set:
    t2.extend(['_'.join(b) for b in nltk.bigrams(t2) 
           if b[0] not in stops and b[1] not in stops])

CPU times: user 852 ms, sys: 5.14 ms, total: 857 ms
Wall time: 857 ms


### Create dictionary (vocabulary) if not stored yet

In [6]:
dictionary_path = '../res/models/bow_dictionary_all_{}.pkl'.format(snippet_length)
if os.path.exists(dictionary_path):
    dictionary = gensim.corpora.Dictionary.load(dictionary_path)
else:
    dictionary = gensim.corpora.Dictionary(train_set)
    dictionary.save(dictionary_path)

### Most common bigrams

In [7]:
top_n = 20
bigram_key_val = {k:v for k,v in dictionary.token2id.items() if len(k.split('_')) > 1}
bigram_key_val_freq = [(k,v,dictionary.dfs[v]) for k,v in bigram_key_val.items()]
bigram_key_val_freq = sorted(bigram_key_val_freq, key=lambda x:x[2])[::-1]
pd.DataFrame(bigram_key_val_freq, columns=['bigram', 'token_id', 'document_frequency']).set_index('token_id').head(20)

Unnamed: 0_level_0,bigram,document_frequency
token_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3774,old_man,997
1616,young_man,844
1851,let_us,760
1622,could_see,683
2048,one_day,600
24525,come_back,487
8394,new_york,476
3476,first_time,473
2536,long_time,446
1953,every_one,434


### Train algorithms for given vocabulary lengths and store the models

In [8]:
vocab_sizes = [1000, 2000, 5000, 10000, 20000, 30000]

In [None]:
for vocab_size in vocab_sizes:
    # process vocabulary and prepare train and test set
    dict_filtered_path = '../res/models/bow_dictionary_{}_{}.pkl'.format(snippet_length, vocab_size)
    if os.path.exists(dict_filtered_path):
            dictionary = gensim.corpora.Dictionary.load(dict_filtered_path)
    else:
        dictionary = gensim.corpora.Dictionary.load(dictionary_path)
        dictionary.filter_extremes(keep_n=vocab_size)
        dictionary.save(dict_filtered_path)
    
    X_train = [dictionary.doc2bow(tokens) for tokens in train_set]
    X_test = [dictionary.doc2bow(tokens) for tokens in test_set]

    X_train = gensim.matutils.corpus2csc(X_train, num_terms=vocab_size).T
    X_test = gensim.matutils.corpus2csc(X_test, num_terms=vocab_size).T
    
    ###################
    ### NAIVE BAYES ###
    ###################
    nb = MultinomialNB()
    t = time.time()
    nb.fit(X_train, y_train)
    t = time.time() - t
    pickle.dump(nb, open('../res/models/mnb_{}_{}.pkl'.format(snippet_length,vocab_size),'wb'))
    y_pred_train = nb.predict(X_train)
    y_pred_test = nb.predict(X_test)
    
    evaluate_clf("MultinomialNB_{}_{}".format(snippet_length,vocab_size),
                 genre_results,
                 y_train, y_pred_train, y_test, y_pred_test, t)
    
    nb = BernoulliNB() 
    t = time.time()
    nb.fit(X_train, y_train)
    t = time.time() - t
    pickle.dump(nb, open('../res/models/bnb_{}_{}.pkl'.format(snippet_length,vocab_size),'wb'))
    y_pred_train = nb.predict(X_train)
    y_pred_test = nb.predict(X_test)
    
    evaluate_clf("BernoulliNB_{}_{}".format(snippet_length,vocab_size),
                 genre_results,
                 y_train, y_pred_train, y_test, y_pred_test, t)
    
    ###########################
    ### LOGISTIC REGRESSION ###
    ###########################
    clf = SGDClassifier(loss='log', max_iter=1000, n_jobs=-1, verbose=0, 
                    class_weight='balanced', random_state=42, tol=1e-6)
    params = {
     'alpha': [1e-3, 3e-4, 1e-4, 3e-5, 1e-5]
    }
    params = {
     'alpha': [1e-3, 5e-4, 2e-4, 1e-4, 5e-5]
    }
    clf_log = GridSearchCV(
        estimator=clf,
        param_grid=params,
        n_jobs=-1,
        verbose=0)
    t = time.time()
    clf_log.fit(X_train, y_train)
    t = time.time() - t
    
    pickle.dump(clf_log, open('../res/models/logreg_{}_{}.pkl'.format(snippet_length,vocab_size),'wb'))
    y_pred_train = clf_log.predict(X_train)
    y_pred_test = clf_log.predict(X_test)
    evaluate_clf("logreg_{}_{}_{}".format(clf_log.best_params_['alpha'], snippet_length, vocab_size),
                 genre_results,
                 y_train, y_pred_train, y_test, y_pred_test, t)
    
    ##############
    ### TF-IDF ###
    ##############
    X_train = [dictionary.doc2bow(tokens) for tokens in train_set]
    X_test = [dictionary.doc2bow(tokens) for tokens in test_set]
    tfidf = gensim.models.TfidfModel(X_train)
    X_train = [tfidf[x] for x in X_train]
    X_test = [tfidf[x] for x in X_test]
    tfidf.save('../res/models/tfidf_{}_{}.pkl'.format(snippet_length,vocab_size))
    X_train = gensim.matutils.corpus2csc(X_train, num_terms=vocab_size).T
    X_test = gensim.matutils.corpus2csc(X_test, num_terms=vocab_size).T
    
    clf = SGDClassifier(loss='log', max_iter=1000, n_jobs=-1, verbose=0, 
                    class_weight='balanced', random_state=42, tol=1e-6)
    params = {
     'alpha': [3e-05, 1e-05, 3e-06, 1e-06, 3e-07]
    }
    params = {
     'alpha': [5e-6, 2e-6, 1e-6, 5e-7, 2e-7]
    }
    clf_tfidf = GridSearchCV(
        estimator=clf,
        param_grid=params,
        n_jobs=-1,
        verbose=0)
    t = time.time()
    clf_tfidf.fit(X_train, y_train)
    t = time.time() - t
    
    pickle.dump(clf_tfidf, open('../res/models/logreg_tfidf_{}_{}.pkl'.format(snippet_length,vocab_size),'wb'))
    y_pred_train = clf_tfidf.predict(X_train)
    y_pred_test = clf_tfidf.predict(X_test)
    evaluate_clf("tfidf_{}_{}_{}".format(clf_tfidf.best_params_['alpha'], snippet_length, vocab_size),
                 genre_results,
                 y_train, y_pred_train, y_test, y_pred_test, t)
display(genre_results)

In [8]:
display(genre_results)

Unnamed: 0,train f1-macro,test f1-macro,train acc,test acc,train time
tfidf_1e-06_200_30000,0.672189,0.395043,0.655492,0.417636,45.1301
tfidf_1e-06_200_20000,0.610806,0.375022,0.597629,0.398893,46.1225
logreg_1e-05_200_30000,0.613191,0.374433,0.612877,0.398004,81.9884
MultinomialNB_200_30000,0.477355,0.360058,0.520341,0.413017,0.722993
MultinomialNB_200_20000,0.458954,0.352365,0.485324,0.395458,0.700714
BernoulliNB_200_30000,0.45119,0.347547,0.519824,0.411596,0.868539
logreg_0.0001_200_20000,0.453735,0.344799,0.467133,0.377424,68.9139
BernoulliNB_200_20000,0.435793,0.341407,0.486259,0.394806,0.875557
MultinomialNB_200_10000,0.416154,0.334896,0.426916,0.366942,0.709456
tfidf_1e-05_200_10000,0.418573,0.33351,0.436605,0.363152,43.056


### NN
with early stopping based on validation

In [62]:
import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelBinarizer

In [63]:
vocab_size = 1000

In [69]:
dict_filtered_path = '../res/models/bow_dictionary_{}_{}.pkl'.format(snippet_length, vocab_size)
if os.path.exists(dict_filtered_path):
        dictionary = gensim.corpora.Dictionary.load(dict_filtered_path)
else:
    dictionary = gensim.corpora.Dictionary.load(dictionary_path)
    dictionary.filter_extremes(keep_n=vocab_size)
    dictionary.save(dict_filtered_path)

X_train_all = [dictionary.doc2bow(tokens) for tokens in train_set]
X_test = [dictionary.doc2bow(tokens) for tokens in test_set]

X_train_all = gensim.matutils.corpus2csc(X_train_all).T
X_test = gensim.matutils.corpus2csc(X_test).T

In [70]:
X_train, X_valid, y_train_, y_val_ = train_test_split(X_train_all, y_train)

mlb = LabelBinarizer()
y_tr = mlb.fit_transform(list(y_train_))
print(y_train.shape)
y_te = mlb.transform(list(y_test))
print(y_test.shape)
y_val = mlb.transform(list(y_val_))
print(y_val.shape)

(191363,)
(33771,)
(47841, 14)


In [71]:
n = X_train.shape[1]
output_shape = mlb.classes_.shape[0]
nn = Sequential()
nn.add(Dropout(0.4,input_shape=(n,)))
nn.add(Dense(200, activation='relu'))
nn.add(Dropout(0.1))
nn.add(Dense(100, activation='relu'))
#nn.add(Dropout(0.25))
nn.add(Dense(output_shape, activation='softmax'))
nn.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
display(nn.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_33 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_49 (Dense)             (None, 200)               200200    
_________________________________________________________________
dropout_34 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_51 (Dense)             (None, 14)                1414      
Total params: 221,714
Trainable params: 221,714
Non-trainable params: 0
_________________________________________________________________


None

In [72]:
genre_results_nn = pd.DataFrame(columns=['train f1-macro','test f1-macro','train_accuracy','test_accuracy', 'training_time'])
clf = nn
epochs = 0
max_epochs = 100
batch_size = 256
epochs_without_improvement = 0

t = time.time()

for i in range(max_epochs):
    clear_output()
    display(genre_results_nn)
    
    clf.fit(X_train, y_tr, batch_size=batch_size, epochs=1)
    clf_name = 'nn_{}'.format(epochs)

    t = time.time() - t
    y_pred_train = clf.predict(X_train)
    y_pred_val = clf.predict(X_valid)
    y_pred_train = mlb.inverse_transform(y_pred_train)
    y_pred_val = mlb.inverse_transform(y_pred_val)
    evaluate_clf(clf_name,
                 genre_results_nn,
                 y_train_, y_pred_train, y_val_, y_pred_val, t)

    new_result = genre_results_nn.loc[clf_name,'test f1-macro']
    genre_results_nn.sort_values(['test f1-macro'], ascending=False, inplace=True)

    if genre_results_nn.iloc[0]['test f1-macro'] > new_result:
        if epochs_without_improvement >= 2:
            break
        else:
            epochs_without_improvement += 1
    else:
        epochs_without_improvement = 0
    
    epochs += 1

Unnamed: 0,train f1-macro,test f1-macro,train_accuracy,test_accuracy,training_time
nn_8,0.30232,0.232526,0.3765,0.29805,77.1549
nn_10,0.312094,0.231048,0.386171,0.297903,97.0202
nn_9,0.307526,0.230854,0.378123,0.29644,1532120000.0
nn_5,0.273738,0.22685,0.352127,0.296837,1532120000.0
nn_4,0.269353,0.226651,0.347793,0.296754,29.2598
nn_6,0.280819,0.226375,0.360523,0.296754,50.3621
nn_7,0.288041,0.224695,0.362774,0.293932,1532120000.0
nn_3,0.257556,0.223849,0.337244,0.296357,1532120000.0
nn_2,0.249216,0.220777,0.329127,0.293869,17.5607
nn_1,0.222849,0.202965,0.313666,0.287828,1532120000.0


Epoch 1/1


Unnamed: 0,train f1-macro,test f1-macro,train_accuracy,test_accuracy,training_time
nn_11,0.329576,0.230176,0.395535,0.294183,1532120000.0


### Output on the test set

In [73]:
y_pred_test = clf.predict(X_test)
y_pred_test = mlb.inverse_transform(y_pred_test)
evaluate_clf(f'nn_{snippet_length}_{vocab_size}',
         genre_results,
         y_train_, y_pred_train, y_test, y_pred_test, 0)

Unnamed: 0,train f1-macro,test f1-macro,train_accuracy,test_accuracy,training_time
nn_200_1000,0.329576,0.23044,0.395535,0.291167,0
