# Initial Setup

## Ignore potential warnings

In [1]:
# Ignore warnings
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

## Plotly template setup

In [2]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Styling shared by the x and y axes: black grid, axis line and ticks.
axis_style = {
    'gridcolor': 'rgb(0,0,0)',
    'linecolor': 'rgb(0,0,0)',
    'tickcolor': 'rgb(0,0,0)',
}

# Layout properties of the custom template, applied in the order listed.
_tg_layout = {
    'colorway': ['#253540', '#E71C24', '#F3C707', '#D4D3D4', '#747A7A'],
    'title': {'font': {'size': 22}},
    'colorscale': {'sequential': [[0.0, '#253540'], [1.0, '#F3C707']]},
    'coloraxis': {'colorbar': {'outlinewidth': 5, 'outlinecolor': 'rgb(255,255,255)'}},
    'font': {'family': 'Microsoft PhagsPa', 'color': 'rgb(0,0,0)', 'size': 15},
    'xaxis': axis_style,
    'yaxis': axis_style,
    'height': 400,
    'width': 600,
    'bargroupgap': 0.15,
    'margin': dict(l=0, r=0, b=0, t=0),
}

template_tg = go.layout.Template()
for _prop, _value in _tg_layout.items():
    setattr(template_tg.layout, _prop, _value)

# Register the custom template and stack it on top of the built-in
# "simple_white" template as the default for every figure.
pio.templates["template_tg"] = template_tg
pio.templates.default = "simple_white+template_tg"

## Reading data

In [3]:
import datetime as dt
import pandas as pd
import numpy as np
import json
import os

# Directory containing the input CSV files ("past.csv" and "present.csv" are joined onto it below)
base_path = "../../assets/data/nips"

# Parameters
# Number of topic label columns; used to build the `past_XX` column names
no_topics = 25

### Past data set + label + vocabulary

In [4]:
# Past corpus
past_csv_path = os.path.abspath(os.path.join(base_path, "past.csv"))
past_df = pd.read_csv(past_csv_path)
display(past_df.head())

# Topic labels of the past corpus
past_labels = pd.read_csv('data/past_labels.csv')
display(past_labels.head())

# Vocabulary of the past corpus (later handed to the vectorizers)
with open('data/past_vocabulary.json', 'r') as f:
    past_vocabulary = json.loads(f.read())

Unnamed: 0,year,id,text
0,1987,1,self organization associative database applica...
1,1987,2,capacity kanerva associative memory exponentia...
2,1987,3,supervise learning probability distribution ne...
3,1987,4,constrained differential optimization constrai...
4,1987,5,towards organize principle layered perceptual ...


Unnamed: 0,id,past_00,past_01,past_02,past_03,past_04,past_05,past_06,past_07,past_08,...,past_15,past_16,past_17,past_18,past_19,past_20,past_21,past_22,past_23,past_24
0,1,1,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,1,0,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,5,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0


### Present data set + label

In [5]:
# Present corpus
pres_csv_path = os.path.abspath(os.path.join(base_path, "present.csv"))
pres_df = pd.read_csv(pres_csv_path)
display(pres_df.head())

# Topic labels of the present corpus
pres_labels = pd.read_csv('data/pres_labels.csv')
display(pres_labels.head())

Unnamed: 0,year,id,text
0,2003,2345,error bound transductive learning compression ...
1,2003,2346,predict speech intelligibility population neur...
2,2003,2347,markov model automated interval analysis marko...
3,2003,2348,perception structure physical world unknown mu...
4,2003,2349,find probable configuration loopy belief propa...


Unnamed: 0,id,pres_00,pres_01,pres_02,pres_03,pres_04,pres_05,pres_06,pres_07,pres_08,...,pres_15,pres_16,pres_17,pres_18,pres_19,pres_20,pres_21,pres_22,pres_23,pres_24
0,2345,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,2346,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2347,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3,2348,0,0,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
4,2349,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Combination

In [6]:
# Loading the previously computed topic combination table;
# it is shown transposed so that each column is one combination.
combine_df = pd.read_csv('data/combination.csv')
combine_df.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,615,616,617,618,619,620,621,622,623,624
past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
pres,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0
count,5.0,0.0,5.0,0.0,3.0,1.0,2.0,6.0,23.0,1.0,...,0.0,1.0,2.0,4.0,3.0,0.0,4.0,0.0,2.0,6.0
score,0.096452,0.085495,0.101099,0.095286,0.107389,0.103759,0.09564,0.098505,0.055031,0.094305,...,0.127274,0.097144,0.111584,0.095678,0.091891,0.101373,0.111027,0.07804,0.10007,0.097415


# Training model with past dataset

## Split train-test data

In [7]:
from sklearn.model_selection import train_test_split

# Label column names: past_00, past_01, ... (one per topic)
topic_cols = [f'past_{i:02d}' for i in range(no_topics)]

# Features (text column only) and targets (one label column per topic).
# NOTE(review): X and y are read from different files and paired purely by
# row position; presumably both are ordered by `id` -- confirm.
X = past_df.loc[:, ['text']]
y = past_labels.loc[:, topic_cols]

# Hold out 30% of the documents; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

## Vectorization

### Bag of Words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer restricted to the precomputed vocabulary
bow_vectorizer = CountVectorizer(vocabulary=past_vocabulary)

# Train / test representations from the hold-out split
bow_corpus_train = bow_vectorizer.fit_transform(X_train['text'])
bow_corpus_test = bow_vectorizer.transform(X_test['text'])

# Whole past corpus, vectorized with its own vectorizer instance;
# this is the matrix handed to the cross-validated evaluation.
bow_vectorizer_cv = CountVectorizer(vocabulary=past_vocabulary)
bow_corpus_cv = bow_vectorizer_cv.fit_transform(X['text'])

In [9]:
# Sanity check: the fitted vectorizer holds as many terms as the supplied vocabulary
assert len(past_vocabulary) == len(bow_vectorizer.vocabulary_)

### Term Frequency - Inverse Document Frequency

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer restricted to the precomputed vocabulary
tfidf_vectorizer = TfidfVectorizer(vocabulary=past_vocabulary)

# Train / test representations from the hold-out split
tfidf_corpus_train = tfidf_vectorizer.fit_transform(X_train['text'])
tfidf_corpus_test = tfidf_vectorizer.transform(X_test['text'])

# Whole past corpus, vectorized with its own vectorizer instance;
# this is the matrix handed to the cross-validated evaluation.
# NOTE(review): the vectorizer is fitted on the full corpus before the folds
# are split, so held-out documents contribute to the fit -- confirm this is
# acceptable for the reported test scores.
tfidf_vectorizer_cv = TfidfVectorizer(vocabulary=past_vocabulary)
tfidf_corpus_cv = tfidf_vectorizer_cv.fit_transform(X['text'])

In [11]:
# Sanity check: the fitted vectorizer holds as many terms as the supplied vocabulary
assert len(past_vocabulary) == len(tfidf_vectorizer.vocabulary_)

### Word2Vec

In [12]:
# from gensim.models import KeyedVectors
# import numpy as np

# word2vec_path = "../../assets/embeddings/GoogleNews-vectors-negative300.bin.gz"
# word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [13]:
# vocabulary = past_vocabulary
# not_in = []

# count = 0
# for word in vocabulary:
#     if word in word2vec:
#         count += 1 #word in word2vec
#     else:
#         not_in.append(word)
# print('Word2vec covers {} / {} words: {:.2%}'.format(
#     count, 
#     len(vocabulary), 
#     count/len(vocabulary))
# )

In [14]:
# def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
#     if len(tokens_list)<1:
#         return np.zeros(k)
    
#     if generate_missing:
#         vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
#     else:
#         vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
        
#     length = len(vectorized)
#     summed = np.sum(vectorized, axis=0)
#     averaged = np.divide(summed, length)
#     return averaged

# def get_word2vec_embeddings(vectors, corpus, generate_missing=False):
#     clean_questions = corpus.apply(lambda x: x.split(' '))
    
#     embeddings = clean_questions.apply(lambda x: get_average_word2vec(x, vectors, 
#                                                                                 generate_missing=generate_missing))
#     return list(embeddings)

In [15]:
# word2vec_corpus_train = get_word2vec_embeddings(word2vec, X_train['text'])
# word2vec_corpus_test = get_word2vec_embeddings(word2vec, X_test['text'])
# word2vec_corpus_cv = get_word2vec_embeddings(word2vec, X['text'])

In [16]:
# del word2vec

## Model Selection

In [17]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from tqdm import tqdm

def evaluate_models(model_func, params, X_dataset, label,
                    labels=None, topics=None, n_splits=10, output_dir='latex'):
    """Cross-validate one classifier per topic and save the averaged scores.

    For every topic column a fresh ``model_func(**params)`` is trained and
    evaluated on each fold of a stratified K-fold split. Accuracy, precision,
    recall and F1 are computed on both the training and the held-out part of
    each fold and averaged over the folds. The per-topic table is written to
    ``{output_dir}/{label}.csv`` and a LaTeX version (precision / recall / F1
    only, rounded to 3 decimals) to ``{output_dir}/{label}.tex``.

    Parameters
    ----------
    model_func : callable
        Estimator class or factory; called as ``model_func(**params)`` and
        expected to provide ``fit`` and ``predict``.
    params : dict
        Keyword arguments passed to ``model_func``.
    X_dataset : matrix supporting row selection with an integer index array
        Feature matrix, row-aligned with ``labels``.
    label : str
        Name of the configuration; used for the progress bar and file names.
    labels : pandas.DataFrame, optional
        One column per topic. Defaults to the notebook-level ``y``.
    topics : sequence of str, optional
        Columns of ``labels`` to evaluate. Defaults to the notebook-level
        ``topic_cols``.
    n_splits : int, default 10
        Number of stratified folds.
    output_dir : str, default 'latex'
        Directory receiving the result files; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per topic; columns ``'<Metric> Train'`` followed by
        ``'<Metric> Test'`` for Accuracy, Precision, Recall and F1.
    """
    import os  # function-scope import keeps this block self-contained

    # Fall back to the notebook-level globals so existing calls keep working.
    if labels is None:
        labels = y
    if topics is None:
        topics = topic_cols
    topics = list(topics)

    # Create the output directory before the (long) evaluation starts, so a
    # missing directory cannot discard the results at the very end.
    os.makedirs(output_dir, exist_ok=True)

    metric_funcs = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
    }

    # The splitter holds no per-topic state, so a single instance is reused.
    skf = StratifiedKFold(n_splits=n_splits)

    scores = {'Train': [], 'Test': []}

    for topic in tqdm(topics, desc=label.ljust(15)):
        # Auxiliary lists collecting the per-fold scores of this topic
        train_scores = []
        test_scores = []

        y_dataset = labels[topic]
        for train_index, test_index in skf.split(X_dataset, y_dataset):
            X_train, X_test = X_dataset[train_index], X_dataset[test_index]
            # The splitter yields positions, so select by position (.iloc);
            # plain [] would look the integers up as index labels.
            y_train = y_dataset.iloc[train_index]
            y_test = y_dataset.iloc[test_index]

            model = model_func(**params)
            model.fit(X_train, y_train)
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            train_scores.append(
                {name: metric(y_train, y_pred_train) for name, metric in metric_funcs.items()}
            )
            test_scores.append(
                {name: metric(y_test, y_pred_test) for name, metric in metric_funcs.items()}
            )

        # Average over the folds
        scores['Train'].append(pd.DataFrame(train_scores).mean().to_dict())
        scores['Test'].append(pd.DataFrame(test_scores).mean().to_dict())

    # One frame per split, metric columns suffixed with ' Train' / ' Test'
    scores_df = pd.concat(
        [
            pd.DataFrame(rows, index=topics).add_suffix(' ' + split)
            for split, rows in scores.items()
        ],
        axis=1,
    )

    # Short row labels for the LaTeX table: last two characters of each topic name
    s = pd.Series([col[-2:] for col in topics])

    # Saving results - csv
    scores_df.to_csv(f'{output_dir}/{label}.csv')

    # Saving results - latex
    latex_cols = ['Precision Train', 'Precision Test', 'Recall Train', 'Recall Test', 'F1 Train', 'F1 Test']
    with open(f'{output_dir}/{label}.tex', 'w') as f:
        f.write(scores_df[latex_cols].set_index(s).round(3).to_latex())

    return scores_df

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Configurations handed to `evaluate_models`; the keys match its parameter names.
#
# Previously evaluated configurations, currently disabled (kept for reference):
#   label            model_func      params                                             X_dataset
#   bow-nb           MultinomialNB   {'alpha': 0.1}                                     bow_corpus_cv
#   tfidf-nb         MultinomialNB   {'alpha': 0.1}                                     tfidf_corpus_cv
#   bow-svm          SVC             {'C':1.5, 'kernel': 'rbf', 'gamma': 'auto'}        bow_corpus_cv
#   tfidf-svm-ok-2   SVC             {'C':1.5, 'kernel': 'rbf', 'gamma': 'auto'}        tfidf_corpus_cv
#   word2vec-svm-2   SVC             {'C':1.0, 'kernel': 'linear', 'gamma': 'auto'}     np.array(word2vec_corpus_cv)
models = [
    {
        'model_func': LogisticRegression,
        'params': {},
        'X_dataset': corpus,
        'label': config_label,
    }
    for config_label, corpus in (
        ('logreg-bow', bow_corpus_cv),
        ('logreg-tfidf', tfidf_corpus_cv),
    )
]

In [19]:
# Evaluate every configuration, keyed by its label. An explicit loop is used so
# that results of already finished configurations remain in `scores` if a
# later one fails or is interrupted.
scores = dict()
for model in models:
    model_label = model['label']
    scores[model_label] = evaluate_models(**model)

logreg-bow     : 100%|██████████| 25/25 [07:19<00:00, 17.57s/it]
logreg-tfidf   : 100%|██████████| 25/25 [02:35<00:00,  6.23s/it]


In [20]:
# Mean of every metric over the topics, one row per evaluated configuration
index = list(scores.keys())
agg_scores = [topic_scores.mean().to_dict() for topic_scores in scores.values()]

df = pd.DataFrame(agg_scores, index=index)

# Persist the full summary as CSV and the test-set metrics as a LaTeX table
df.to_csv('latex/logreg-results.csv')
with open('latex/logreg-results.tex', 'w') as f:
    test_metrics = df[['Precision Test', 'Recall Test', 'F1 Test']]
    f.write(test_metrics.round(3).to_latex())

df

Unnamed: 0,Accuracy Train,Precision Train,Recall Train,F1 Train,Accuracy Test,Precision Test,Recall Test,F1 Test
logreg-bow,0.999898,1.0,0.999557,0.999778,0.943919,0.824625,0.690954,0.741612
logreg-tfidf,0.94414,0.867265,0.472502,0.589117,0.922585,0.783026,0.370281,0.482277
