In [None]:
import pandas as pd
import numpy as np
import scipy
import re
from scipy.stats import norm

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import mpl_toolkits.mplot3d as mplt3d
from mpl_toolkits.mplot3d import Axes3D 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

%matplotlib notebook
%load_ext autoreload
%autoreload 2

# shouldn't be enabled when using interactive 3D plots
# %pylab inline
# pylab.rcParams['figure.figsize'] = (10, 7)


### Cleaning & preparing the data

In [None]:
# Load the two source tables (dismissed / not dismissed).
# The variable names were chosen to have the same length so the parallel lines align.
kicked = pd.read_csv('data/dismissed_complete.csv')
stayed = pd.read_csv('data/nodismissed_complete.csv')

In [None]:
# Preview the first rows of the dismissed table
kicked.head()

In [None]:
# Preview the first rows of the not-dismissed table
stayed.head()

In [None]:
# (rows, columns) of each raw table, to compare class sizes before balancing
kicked.shape, stayed.shape

In [None]:
# Just an experiment to try to learn on journal names, not on article names
# kicked['Title paper'] = kicked['Journal']
# stayed['Title paper'] = stayed['Journal']

In [None]:
# Keep only the columns used downstream.
kicked1 = kicked[['Author','Title paper', 'labels']]
stayed1 = stayed[['Author','Title paper', 'labels']]

# Balance the classes by randomly downsampling `stayed1` to the size of `kicked1`.
# Sampling by an explicit row count (instead of a fraction) avoids a division by
# zero when `stayed` is empty and the ValueError pandas raises for frac > 1 when
# `kicked` has more rows than `stayed`; in that case all `stayed` rows are kept.
balanced_rows = min(kicked1.shape[0], stayed1.shape[0])
stayed1 = stayed1.sample(n=balanced_rows) # random_state = 0

In [None]:
# after downsampling `stayed1`, both frames should have the same number of rows
kicked1.shape, stayed1.shape

In [None]:
# Sanity checks on the raw columns: each printed value is expected to be True.
# Order of the output: author prefix (kicked, stayed), label value (kicked, stayed),
# bracketed title (kicked, stayed).
starts_with_au = lambda author: author[:3] == 'AU='
is_bracketed = lambda title: title[0] == '[' and title[-1] == ']'

invariants = [
    (kicked1, 'Author', starts_with_au),
    (stayed1, 'Author', starts_with_au),
    (kicked1, 'labels', lambda label: label == 1),
    (stayed1, 'labels', lambda label: label == 0),
    (kicked1, 'Title paper', is_bracketed),
    (stayed1, 'Title paper', is_bracketed),
]
for frame, column, predicate in invariants:
    print(frame[column].apply(predicate).all())

In [None]:
# Stack the dismissed rows on top of the (downsampled) not-dismissed rows.
# NOTE(review): no ignore_index is passed, so row labels from both frames are kept and may repeat.
df0 = pd.concat([kicked1, stayed1])

In [None]:
# Preview the combined table
df0.head()

In [None]:
# Normalise the raw columns into a fresh frame:
#   - 'Author'      : drop the first three characters (the 'AU=' prefix checked earlier)
#   - 'Label'       : integer copy of 'labels', which is then dropped
#   - 'Title paper' : drop the first and the last character (the surrounding brackets)
df1 = df0.assign(**{
    'Author': df0['Author'].apply(lambda raw_author: raw_author[3:]),
    'Label': df0['labels'].apply(lambda raw_label: int(raw_label)),
    'Title paper': df0['Title paper'].apply(lambda raw_title: raw_title[1:][:-1]),
}).drop(columns=['labels'])
df1.head()

In [None]:
# Drop rows whose author name is shorter than 6 characters (treated as junk),
# printing the names that are about to be removed first.
is_short_author = df1['Author'].apply(lambda name: len(name) < 6)

print('Rows to remove:')
print(df1[is_short_author]['Author'])
df2 = df1[~is_short_author]

In [None]:
def isEnglish(s):
    """Return True when every character of `s` is ASCII, False otherwise.

    The string is encoded to UTF-8 and the resulting bytes are decoded as
    ASCII; a decoding failure means at least one non-ASCII character.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

In [None]:
# Classification is done on "Title paper", so that column is normalised here.
def _clean_title(raw_title):
    """Lower-case a title and reduce it to space-separated words.

    Steps: strip punctuation, turn separator symbols and digits into spaces,
    remove words of one or two characters, collapse whitespace and trim.

    Short words are matched with word boundaries (``\\b``). The earlier pattern
    ``\\W\\w{1,2}\\W`` needed a non-word character on both sides and consumed them,
    so it missed short words at the start or end of a title and every second
    word in a run of short words.
    """
    text = raw_title.lower()
    text = re.sub(r"[.,/()?:'%\";\[\]!\{\}><]", "", text)  # delete punctuation
    text = re.sub(r"[-+=@&*#|]", " ", text)                 # separator symbols -> spaces
    text = re.sub(r"\d", " ", text)                         # digits -> spaces
    text = re.sub(r"\b\w{1,2}\b", " ", text)                # drop 1-2 character words
    # Trim as well, so that e.g. "untitled " still matches the filter below.
    return re.sub(r"\s+", " ", text).strip()

print('Shape of not English words: ', df2[df2['Title paper'].apply(lambda s : not isEnglish(s))].shape)
df3 = df2.copy()
df3['Title paper'] = df2['Title paper'].apply(_clean_title)
# drop some common but not-interesting names
df3 = df3[~df3['Title paper'].isin(['untitled', 'editorial'])]

In [None]:
# Look for leftover characters in "Title paper" that are neither letters nor spaces
# and print the titles' leftovers (only rows that have any).
symbols = df3['Title paper'].apply(
    lambda title: ''.join(ch for ch in title if not (ch.isalpha() or ch == ' '))
)
has_leftovers = symbols != ''
print(symbols[has_leftovers])

In [None]:
# "Title paper" in df3 now holds the cleaned titles used for the analysis.
# df4 is the per-title dataset: the same rows without the 'Author' column.

df4 = df3.drop(columns=['Author'])
df4.head()

In [None]:
# An interesting observation: we now have a dataset of (title, label) pairs, where the label says
# whether the author of the article was fired or not. Such a dataset may be biased, because one author
# can have many different articles, so a single observation (one author) produces many data points.

# Let's also try another dataset of (conc_title, label) pairs, where conc_title
# stands for all the titles of one author concatenated together.

In [None]:
# Preview the cleaned table that still carries the 'Author' column (used for the aggregation below)
df3.head()

In [None]:
# Size of the per-title dataset and its split by label (1 = kicked, 0 = stayed).
is_kicked_title = df4['Label'] == 1
is_stayed_title = df4['Label'] == 0

titles_num = len(df4)
kicked_titles_num = int(is_kicked_title.sum())
stayed_titles_num = int(is_stayed_title.sum())
print('For first dataset we have ' + str(titles_num) + ' titles, from them ' + str(kicked_titles_num) + ' kicked and ' + str(stayed_titles_num) + ' stayed')

# authors_num = len()

In [None]:
# Collapse the per-title rows of df3 into one row per author: all titles of an
# author are concatenated (space-separated) and the author keeps the label of
# their last row.
titles_per_author = {} # author -> concatenated titles
labels_per_author = {} # author -> label

for author, title, label in zip(df3['Author'], df3['Title paper'], df3['Label']):
    titles_per_author[author] = titles_per_author.get(author, '') + ' ' + title # do concatenation
    labels_per_author[author] = int(label)

kicked_cnt = sum(1 for author_label in labels_per_author.values() if author_label == 1)
stayed_cnt = len(titles_per_author) - kicked_cnt

# These lines used to reference an undefined name `cnt` (NameError on a fresh
# kernel); the counter computed above is `kicked_cnt`.
print('After aggregation we got ' + str(len(titles_per_author)) + ' authors, from which ' + str(kicked_cnt) + ' were kicked and ' + str(stayed_cnt) + ' not')
print('Taking prefix of required size for not-kicked')

authors = []
titles = []
labels = []
# Keep at most as many not-kicked authors as there are kicked ones, taking them
# in dict (first-appearance) order.
stayed_limit = kicked_cnt

for author, concatenated_titles in titles_per_author.items():
    author_label = labels_per_author[author]
    if author_label == 0:
        if stayed_limit <= 0:
            continue
        stayed_limit -= 1

    authors.append(author)
    titles.append(concatenated_titles)
    labels.append(author_label)

# aggregated DataFrame
adf4 = pd.DataFrame(data={'Title paper' : titles, 'Label' : labels}) # columns names are ugly, but for backwards-compatibility
print('Got aggregated dataset of size ' + str(len(authors)))


### Naive Bayes classification

In [None]:
# Dataset for the Naive Bayes experiment: per-title rows (df4); switch to the
# per-author rows by using adf4 instead.
df = df4
# df = adf4

# Random 70/30 train/test split of the rows
data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

# Text is the input, the label is the target
X_train, y_train = data_train['Title paper'], data_train['Label']
X_test, y_test = data_test['Title paper'], data_test['Label']

X_train.shape, X_test.shape

In [None]:
# Let's use the following model:
# Features are unique words
# Samples are titles

# 1) Naive : for every sample we have a binary value for every word (present / absent)
# 2) sklearn.CountVectorizer : counts word occurrences
# 3) sklearn.TfidfVectorizer : with plain counting, more weight is given to longer sentences, which is not
#                               really fair; TF-IDF (term frequency _times_ inverse document frequency) also
#                               gives every sample a weight for each present word, but in a more sophisticated way

# We use option (3)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary and weights on the training titles only ...
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
print('Train shape: ', X_train_tfidf.shape)

# ... and reuse the fitted vectorizer for the test titles, so both matrices share the same columns
X_test_tfidf = tfidf_vect.transform(X_test)
print('Test  shape: ', X_test_tfidf.shape)

In [None]:
# (MultinomialNB is also imported in the first cell; this import is redundant but harmless)
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of the training titles
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
# Accuracy on the test set: fraction of predictions equal to the true label
np.mean(clf.predict(X_test_tfidf) == y_test)

### word2vec experiments + catboost

In [None]:
# Okay, we had a look at NB combined with TF-IDF
# Let's now work with word2vec. We need to handle whole sentences, so we have 2 options:
# 1) Simply average the vectors of all the words in the sentence
# 2) Weight every word in the sentence by its TF-IDF score and then add the vectors up
# Then for classification we use catboost (i.e. gradient boosting, i.e. an ensemble of decision trees)

# We will try both

In [None]:
import gensim

# Load Google's pretrained word2vec vectors (binary word2vec format) from the local data directory.
# The embedding helpers below look words up in this model and accumulate them into 300-element vectors.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('./data/pretrained_models/GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [None]:
# Dataset for the word2vec experiment: per-author rows (adf4); switch to the
# per-title rows by using df4 instead.
# df = df4
df = adf4

# Random 50/50 train/test split of the rows
data_train, data_test = train_test_split(df, test_size=0.5) # random_state = 0

# Text is the input, the label is the target
X_train, y_train = data_train['Title paper'], data_train['Label']
X_test, y_test = data_test['Title paper'], data_test['Label']

train_size = X_train.shape[0]
test_size = X_test.shape[0]
print('Train size: ' + str(train_size) + ' vs test size: ' + str(test_size))

In [None]:
# Inspect the first 50 rows of the dataset selected above
df.head(n = 50)

In [None]:
def get_mean_w2v_embeddings(titles, model=None, dim=300, average=False):
    """Embed each title by combining the word vectors of its known words.

    Parameters
    ----------
    titles : iterable of str
        Titles to embed; each is split on single spaces.
    model : mapping-like, optional
        Anything supporting ``word in model`` and ``model[word]`` (returning a
        vector of length `dim`). Defaults to the notebook-level
        ``word2vec_model``.
    dim : int, optional
        Length of the word vectors (300 by default).
    average : bool, optional
        If False (default, the historical behaviour) the word vectors are
        summed. If True the sum is divided by the number of space-separated
        tokens in the title, including tokens missing from the model.

    Returns
    -------
    list of numpy.ndarray
        One vector of length `dim` per title; all zeros when no word of the
        title is present in the model.
    """
    if model is None:
        model = word2vec_model  # pretrained vectors loaded earlier in the notebook

    embs = []
    for title in titles:
        words = title.split(' ')
        title_emb = np.zeros(dim)
        for w in words:
            if w in model:
                title_emb += model[w]
        if average:
            # `words` is never empty: str.split(' ') always yields at least one token
            title_emb /= len(words)
        embs.append(title_emb)
    return embs

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Accepts any re-iterable sequence of title strings (list, pandas Series, ...)
def get_tfidf_w2v_embeddings(titles, model=None, dim=300):
    """Embed each title as a TF-IDF-weighted sum of its word vectors.

    A fresh ``TfidfVectorizer`` is fitted on `titles` itself, so the weights
    depend on the collection passed in.

    Parameters
    ----------
    titles : sequence of str
        Titles to embed. It is iterated twice (once by the vectorizer, once
        here), so it must not be a one-shot generator.
    model : mapping-like, optional
        Anything supporting ``word in model`` and ``model[word]``. Defaults to
        the notebook-level ``word2vec_model``.
    dim : int, optional
        Length of the word vectors (300 by default).

    Returns
    -------
    list of numpy.ndarray
        One vector of length `dim` per title. A word known to the model is
        weighted by its TF-IDF value in that title; if the vectorizer did not
        keep the word in its vocabulary, ``1 / number_of_tokens`` is used
        instead (the weight it would get in a plain mean).
    """
    if model is None:
        model = word2vec_model  # pretrained vectors loaded earlier in the notebook

    tfidf_vect = TfidfVectorizer()

    # rows are titles, columns are vocabulary words
    titles_tfidf_matrix = tfidf_vect.fit_transform(titles)
    column_of_word = tfidf_vect.vocabulary_  # word -> column index

    embs = []
    # Positional iteration: works for lists as well as Series, and avoids
    # reusing one index variable for both the row loop and the row's entries.
    for row_idx, title in enumerate(titles):
        words = title.split(' ')

        # non-zero entries of this title's sparse row as {column index: weight}
        matrix_row = titles_tfidf_matrix[row_idx]
        weight_of_column = dict(zip(matrix_row.indices, matrix_row.data))

        title_emb = np.zeros(dim)
        for w in words:
            if w not in model:
                continue

            if w in column_of_word:
                scalar = weight_of_column.get(column_of_word[w], 0)
            else:
                scalar = 1. / len(words) # take scalar as in mean

            title_emb += scalar * model[w]

        embs.append(title_emb)
    return embs

In [None]:
# Choose how titles are turned into vectors: plain word2vec accumulation (True)
# or the TF-IDF-weighted variant (False). Only the selected function is looked up.
use_plain_w2v = True
embed_titles = get_mean_w2v_embeddings if use_plain_w2v else get_tfidf_w2v_embeddings

X_train_embs = embed_titles(X_train)
X_test_embs  = embed_titles(X_test)

In [None]:
from catboost import CatBoostClassifier

# CatBoost run 1 of 3: 10 iterations, depth 6, log-loss, learning rate 0.01.
# Trained on the title embeddings, then used to predict classes for the test embeddings.
cbc_model = CatBoostClassifier(iterations=10, learning_rate=0.01, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

# Fraction of test predictions equal to the true label
# NOTE(review): assumes preds_class is 1-D and lines up positionally with y_test — confirm
np.mean(preds_class == y_test)

In [None]:
# CatBoost run 2 of 3: same settings as the previous run except learning rate 0.1.
# Rebinds cbc_model and preds_class.
cbc_model = CatBoostClassifier(iterations=10, learning_rate=0.1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

# Fraction of test predictions equal to the true label
np.mean(preds_class == y_test)

In [None]:
# CatBoost run 3 of 3: same settings as the previous runs except learning rate 1.
# Rebinds cbc_model and preds_class.
cbc_model = CatBoostClassifier(iterations=10, learning_rate=1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

# Fraction of test predictions equal to the true label
np.mean(preds_class == y_test)

### sent2vec + catboost

In [None]:
# Random 70/30 train/test split of the per-title dataset for the sent2vec experiment
data_train, data_test = train_test_split(df4, test_size=0.3) # random_state = 0

# Text is the input, the label is the target
X_train, y_train = data_train['Title paper'], data_train['Label']
X_test, y_test = data_test['Title paper'], data_test['Label']

X_train.shape, X_test.shape

In [None]:
import sent2vec # epfl-made 
# Create the sent2vec model object and load the pretrained wiki-unigrams weights from the local data directory
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('./data/pretrained_models/wiki_unigrams.bin') # 600 features

In [None]:
# Embed every train/test title with sent2vec in one batch call each.
# Single-sentence variant, kept for reference:
# emb = model.embed_sentence(X_train.values[0])
X_train_embs = sent2vec_model.embed_sentences(X_train.values)
X_test_embs  = sent2vec_model.embed_sentences(X_test.values)

In [None]:
# Inspect the result: number of embeddings, type and length of one embedding, type of one element
len(X_train_embs), type(X_train_embs[0]), len(X_train_embs[0]), type(X_train_embs[0][0])

In [None]:
# okay, now we have 600-dim vectors for every sentence

In [None]:
# Toy example of the CatBoost API on hand-written data; it does not use the notebook's datasets.
# Note that it binds `model` and rebinds `preds_class`.
from catboost import CatBoostClassifier
# Initialize data
# (column indices 0, 1 and 2 of each row are declared categorical)
cat_features = [0,1,2]
train_data = [["a","b",1,4,5,6],["a","b",4,5,6,7],["c","d",30,40,50,60]]
train_labels = [1,1,-1]
test_data = [["a","b",2,4,6,8],["a","d",1,4,50,60]]
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

In [None]:
# Train CatBoost on the sent2vec title embeddings (20 iterations, learning rate 0.01, depth 6, log-loss).
# Rebinds `model` and `preds_class` from the toy example.
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=20, learning_rate=1e-2, depth=6, loss_function='Logloss')
# Fit model
model.fit(X_train_embs, y_train)
# Get predicted classes
preds_class = model.predict(X_test_embs)

In [None]:
# Accuracy on the test set: fraction of predictions equal to the true label
# NOTE(review): assumes preds_class is 1-D and lines up positionally with y_test — confirm
np.mean(preds_class == y_test)

In [None]:
# todo: averaging