# Initial Setup

## Ignore potential warnings

In [1]:
# Ignore warnings
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

## Plotly template setup

In [2]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Custom Plotly theme ("template_tg") that is layered on top of the built-in
# "simple_white" template and made the default for every figure.
template_tg = go.layout.Template()

# --- Colours -----------------------------------------------------------------
template_tg.layout.colorway = ['#253540', '#E71C24', '#F3C707', '#0D5E94', '#51814B']
template_tg.layout.colorscale = dict(sequential=[[0.0, '#253540'], [1.0, '#F3C707']])
template_tg.layout.coloraxis = dict(
    colorbar=dict(outlinewidth=5, outlinecolor='rgb(255,255,255)')
)

# --- Typography --------------------------------------------------------------
template_tg.layout.title = dict(font=dict(size=22))
template_tg.layout.font = dict(family='Microsoft PhagsPa', color='rgb(0,0,0)', size=15)

# --- Axes: both axes share the same black grid / line / tick colours ---------
black_axis = dict(gridcolor='rgb(0,0,0)', linecolor='rgb(0,0,0)', tickcolor='rgb(0,0,0)')
template_tg.layout.xaxis = black_axis
template_tg.layout.yaxis = black_axis

# --- Default figure geometry -------------------------------------------------
template_tg.layout.width = 600
template_tg.layout.height = 400
template_tg.layout.margin = {'l': 0, 'r': 0, 'b': 0, 't': 30}
template_tg.layout.bargroupgap = 0.15

# Register the theme and stack it on the base template
pio.templates["template_tg"] = template_tg
pio.templates.default = "simple_white+template_tg"

## Reading data

In [3]:
import datetime as dt
import pandas as pd
import numpy as np
import json
import os

# Directory (relative to this notebook) that holds the corpus CSV files
# read below: past.csv, present.csv and future.csv
base_path = "../../assets/data/nips"

# Parameters
# Number of topics; used to generate the 'past_XX' label column names
no_topics = 25

### Past data set + label + vocabulary

In [4]:
# Past corpus: one row per document with its `year`, `id` and `text`
past_df = pd.read_csv(os.path.abspath(os.path.join(base_path, "past.csv")))
display(past_df.head())

# Topic labels of the past corpus: `id` plus one `past_XX` indicator column
# per topic (0/1 values in the preview)
past_labels = pd.read_csv('data/past_labels.csv')
display(past_labels.head())

# Vocabulary handed to the vectorizers further down. The encoding is stated
# explicitly: without it open() decodes with the platform's locale encoding,
# which can garble or reject non-ASCII terms on systems where that is not UTF-8.
with open('data/past_vocabulary.json', 'r', encoding='utf-8') as f:
    past_vocabulary = json.load(f)

Unnamed: 0,year,id,text
0,1987,1,self organization associative database applica...
1,1987,2,capacity kanerva associative memory exponentia...
2,1987,3,supervise learning probability distribution ne...
3,1987,4,constrained differential optimization constrai...
4,1987,5,towards organize principle layered perceptual ...


Unnamed: 0,id,past_00,past_01,past_02,past_03,past_04,past_05,past_06,past_07,past_08,...,past_15,past_16,past_17,past_18,past_19,past_20,past_21,past_22,past_23,past_24
0,1,1,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,1,0,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,5,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0


### Present data set + label

In [5]:
# Present corpus: resolve the CSV location, then load and preview it
pres_csv_path = os.path.abspath(os.path.join(base_path, "present.csv"))
pres_df = pd.read_csv(pres_csv_path)
display(pres_df.head())

# Topic labels of the present corpus (`id` plus the `pres_XX` columns)
pres_labels = pd.read_csv('data/pres_labels.csv')
display(pres_labels.head())

Unnamed: 0,year,id,text
0,2003,2345,error bound transductive learning compression ...
1,2003,2346,predict speech intelligibility population neur...
2,2003,2347,markov model automated interval analysis marko...
3,2003,2348,perception structure physical world unknown mu...
4,2003,2349,find probable configuration loopy belief propa...


Unnamed: 0,id,pres_00,pres_01,pres_02,pres_03,pres_04,pres_05,pres_06,pres_07,pres_08,...,pres_15,pres_16,pres_17,pres_18,pres_19,pres_20,pres_21,pres_22,pres_23,pres_24
0,2345,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,2346,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2347,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3,2348,0,0,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
4,2349,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Future data set

In [6]:
# Future corpus: resolve the CSV location, then load and preview it
future_csv_path = os.path.abspath(os.path.join(base_path, "future.csv"))
future_df = pd.read_csv(future_csv_path)
display(future_df.head())

Unnamed: 0,year,id,text
0,2016,6036,optimistic gittin index start thomspon sample ...
1,2016,6037,sample newton method uniform sampling consider...
2,2016,6038,budget stream base active learning adaptive su...
3,2016,6039,sequential neural model stochastic layer effic...
4,2016,6040,stochastic gradient method distributionally ro...


### Combination

In [7]:
# Load the table of past/present topic pairings (with their `count` and
# `score`) and preview it transposed so the wide table fits on screen
combine_df = pd.read_csv('data/combination.csv')
combine_df.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,615,616,617,618,619,620,621,622,623,624
past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
pres,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0
count,5.0,0.0,5.0,0.0,3.0,1.0,2.0,6.0,23.0,1.0,...,0.0,1.0,2.0,4.0,3.0,0.0,4.0,0.0,2.0,6.0
score,0.096452,0.085495,0.101099,0.095286,0.107389,0.103759,0.09564,0.098505,0.055031,0.094305,...,0.127274,0.097144,0.111584,0.095678,0.091891,0.101373,0.111027,0.07804,0.10007,0.097415


# Training model with past dataset

## Split train-test data

In [8]:
from sklearn.model_selection import train_test_split

# Label column names: 'past_00', 'past_01', ... one per topic
topic_cols = list(map('past_{:02d}'.format, range(no_topics)))

# Features (the document text, kept as a one-column frame) and the
# per-topic label columns
X = past_df.loc[:, ['text']]
y = past_labels.loc[:, topic_cols]

# 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.3
)

## Vectorization

### Bag of Words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based document-term matrix restricted to the pre-computed vocabulary.
# The vectorizer is run over the complete past corpus rather than over a
# training split only.
past_texts = X['text']
bow_vectorizer = CountVectorizer(vocabulary=past_vocabulary)
bow_corpus_cv = bow_vectorizer.fit_transform(past_texts)

In [10]:
# Sanity check: the fitted vectorizer exposes as many terms as the vocabulary it was given
assert len(past_vocabulary) == len(bow_vectorizer.vocabulary_)

### Word2Vec

### Term Frequency - Inverse Document Frequency

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix restricted to the pre-computed vocabulary.
# NOTE(review): the vectorizer is fit on the complete past corpus and the
# resulting matrix is only split into train/test afterwards, so the IDF
# weights are estimated from documents that later end up in the hold-out set.
past_texts = X['text']
tfidf_vectorizer = TfidfVectorizer(vocabulary=past_vocabulary)
tfidf_corpus_cv = tfidf_vectorizer.fit_transform(past_texts)

In [12]:
# Sanity check: the fitted vectorizer exposes as many terms as the vocabulary it was given
assert len(past_vocabulary) == len(tfidf_vectorizer.vocabulary_)

## Train with past data

In [13]:
import numpy as np

def compute_log_loss(actual, predicted, eps=1e-14):
    """Return the mean binary cross-entropy between labels and probabilities.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels (0 or 1). Lists, tuples, NumPy arrays and pandas
        Series are all accepted.
    predicted : array-like
        Predicted probability of the positive class, same shape as ``actual``.
    eps : float, default 1e-14
        Probabilities are clipped to ``[eps, 1 - eps]`` so that the logarithm
        is never evaluated at 0.

    Returns
    -------
    float
        ``-mean(actual * log(p) + (1 - actual) * log(1 - p))`` with ``p`` the
        clipped probabilities.
    """
    # Coerce both inputs to float arrays: a plain list would otherwise fail
    # on ``1 - actual``. The arithmetic below is therefore purely positional.
    actual = np.asarray(actual, dtype=float)
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)

    per_sample = actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted)
    return -1 * np.mean(per_sample)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

# One binary linear SVM per past topic, evaluated on a stratified 70/30
# split of the TF-IDF matrix. Fitted models are kept in `models`; the
# train/hold-out metrics end up in the `full_metrics` table.
models = {}
metric_rows = []

# (key prefix, scorer) pairs. Their order, combined with train-before-test,
# fixes the column order of `full_metrics`.
scorers = (
    ('acc', accuracy_score),
    ('pre', precision_score),
    ('rcl', recall_score),
    ('f1', f1_score),
)

progress = tqdm(topic_cols)
for topic in progress:
    progress.set_description(topic)

    # Stratified split on this topic's labels (this rebinds the X_train /
    # X_test / y_train / y_test names created by the earlier split cell)
    topic_labels = y[topic]
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_corpus_cv, topic_labels,
        test_size=0.3, random_state=100, stratify=topic_labels)

    # probability=True is kept so predict_proba stays available, although
    # only the hard predictions are scored here
    model = SVC(C=1.0, kernel='linear', gamma='auto', random_state=1452, probability=True)
    model.fit(X_train, y_train)
    models[topic] = model

    # Hard predictions on both partitions
    evaluated = (
        ('train', y_train, model.predict(X_train)),
        ('test', y_test, model.predict(X_test)),
    )

    # Keys come out as acc_train, acc_test, pre_train, pre_test, ...
    metric_rows.append({
        '{}_{}'.format(prefix, split): scorer(truth, predicted)
        for prefix, scorer in scorers
        for split, truth, predicted in evaluated
    })

full_metrics = pd.DataFrame(metric_rows, index=topic_cols)

past_24: 100%|██████████| 25/25 [10:59<00:00, 26.40s/it]


In [15]:
# from sklearn.metrics import classification_report, roc_auc_score, plot_roc_curve, cohen_kappa_score

# # models['past_24'].predict_proba(X_test)[:,1]

# print('roc_auc_score: {:.3f}'.format(roc_auc_score(y_test, models['past_24'].predict_proba(X_test)[:,1])))
# print('cohen_kappa_score: {:.3f}'.format(cohen_kappa_score(y_test, models['past_24'].predict(X_test))))

# print(classification_report(y_test, models['past_24'].predict(X_test)))

In [16]:
# Average of every metric column across all topics
full_metrics.mean(axis=0)

acc_train    0.986279
acc_test     0.942395
pre_train    0.995892
pre_test     0.951272
rcl_train    0.900845
rcl_test     0.593379
f1_train     0.944305
f1_test      0.715008
dtype: float64

# Predicting present dataset

In [17]:
# Containers for the per-topic predictions on the present corpus
bow_present_prediction, proba_present_prediction = {}, {}

# NOTE: despite the `bow_` prefix, this matrix is produced by the TF-IDF
# vectorizer, i.e. the same feature space the per-topic models were fit on.
present_texts = pres_df['text']
bow_corpus_pres = tfidf_vectorizer.transform(present_texts)

In [18]:
# Run every per-topic classifier over the present corpus and gather the hard
# predictions into a frame with one column per past topic.
predicted_by_topic = {}

progress = tqdm(topic_cols)
for topic in progress:
    progress.set_description(topic)
    predicted_by_topic[topic] = models[topic].predict(bow_corpus_pres)

bow_present_prediction = pd.DataFrame(predicted_by_topic)

past_24: 100%|██████████| 25/25 [03:58<00:00,  9.54s/it]


## Baseline evaluation

In [19]:
# Rank the pairings by count / score and keep the ten highest-ranked ones
top_combinations = (
    combine_df
    .assign(agg=lambda frame: frame['count'] / frame['score'])
    .nlargest(10, 'agg')
)
top_combinations

Unnamed: 0,past,pres,count,score,agg
281,11,6,29,0.0259,1119.706067
255,10,5,28,0.030722,911.386297
366,14,16,22,0.039605,555.484281
509,20,9,19,0.03597,528.219018
461,18,11,20,0.042773,467.587166
187,7,12,17,0.038147,445.644278
102,4,2,27,0.061233,440.940648
8,0,8,23,0.055031,417.942587
128,5,3,16,0.046831,341.651936
557,22,7,18,0.057376,313.72128


In [40]:
# Baseline: for each shortlisted pairing, score the predictions of the
# past-topic classifier against the labels of the paired present topic.
# The F1 scores are collected in ranking order in `rec_list`, which the
# scatter plot in the next cell uses as its y values (previously the list
# was created but never filled).
rec_list = []
for _, row in top_combinations.iterrows():
    past_topic = 'past_{:02d}'.format(int(row['past']))
    pres_topic = 'pres_{:02d}'.format(int(row['pres']))

    score = f1_score(pres_labels[pres_topic], bow_present_prediction[past_topic])
    rec_list.append(score)

    print('{} -> {} = {:.3f}'.format(past_topic, pres_topic, score))

past_11 -> pres_06 = 0.714
past_10 -> pres_05 = 0.807
past_14 -> pres_16 = 0.667
past_20 -> pres_09 = 0.614
past_18 -> pres_11 = 0.651
past_07 -> pres_12 = 0.379
past_04 -> pres_02 = 0.398
past_00 -> pres_08 = 0.533
past_05 -> pres_03 = 0.464
past_22 -> pres_07 = 0.608


In [32]:
# F1 score against the rank position (1..10) of each pairing, with an OLS
# trend line fitted by plotly express
positions = list(range(1, 11))

fig = px.scatter(x=positions, y=rec_list, trendline="ols")

fig.update_layout(
    xaxis_title='Position of Combination',
    yaxis_title='F1 Score',
    margin={'t': 30},
    showlegend=True,
    # Horizontal legend anchored just above the top-right corner of the plot
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "right",
        'x': 1,
    },
)

fig.update_yaxes(range=[0, 0.9])
fig.update_xaxes(tickvals=positions)

fig.show()

## Yearly evaluation

In [22]:
# Per-year F1 of every shortlisted pairing.
# `array` maps "past_XX - pres_YY" to a NumPy array holding one F1 score per
# publication year, in ascending year order.
array = {}

# The set of years does not depend on the pairing, so compute it once
years = sorted(pres_df['year'].unique())

for _, row in top_combinations.iterrows():
    past_topic = 'past_{:02d}'.format(int(row['past']))
    pres_topic = 'pres_{:02d}'.format(int(row['pres']))

    # Work on an explicit copy: adding columns to a bare column selection of
    # pres_df is the chained-assignment pattern pandas warns about.
    aux = pres_df[['year', 'id']].copy()
    aux['prediction'] = bow_present_prediction[past_topic]
    aux['correct'] = np.array(pres_labels[pres_topic])

    yearly_scores = []
    for year in years:
        year_subset = aux[aux['year'] == year]
        yearly_scores.append(f1_score(year_subset['correct'], year_subset['prediction']))

    association = '{} - {}'.format(past_topic, pres_topic)
    array[association] = np.array(yearly_scores)

In [30]:
# Yearly F1 table: one row per year, one column per "past_XX - pres_YY" pairing
score_years = sorted(aux.year.unique())
topic_score_dfs = pd.DataFrame(array, index=score_years)

# Worst yearly F1 of every pairing
topic_score_dfs.min()

past_11 - pres_06    0.594595
past_10 - pres_05    0.714286
past_14 - pres_16    0.619048
past_20 - pres_09    0.428571
past_18 - pres_11    0.561798
past_07 - pres_12    0.285714
past_04 - pres_02    0.206897
past_00 - pres_08    0.406250
past_05 - pres_03    0.368932
past_22 - pres_07    0.378378
dtype: float64

## Visualization

In [24]:
# Yearly F1 curves of the first five pairings (the columns follow the
# ranking order of the shortlisted combinations)
fig = go.Figure()

columns = topic_score_dfs.columns[:5]
for topic in columns:
    # Legend label "<past id>-<present id>", e.g. "11-06"
    trace_label = '{}-{}'.format(topic[5:7], topic[-2:])
    fig.add_trace(
        go.Scatter(
            x=topic_score_dfs.index,
            y=topic_score_dfs[topic],
            name=trace_label,
            mode='lines',
        )
    )

fig.update_layout(
    xaxis_title='Year',
    yaxis_title='F1 Score',
    margin={'t': 30},
    showlegend=True,
    # Horizontal legend anchored just above the top-right corner of the plot
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "right",
        'x': 1,
    },
)

fig.update_yaxes(range=[0, 0.9])

fig.show()

In [45]:
# For every pairing: print the Pearson correlation between year and yearly
# F1 score, then show a scatter plot with a LOWESS trend line.
# Iterating over the columns directly replaces the previous
# `topic_score_dfs.T.apply(lambda row: np.mean(row[i]))`, which depended on
# integer keys being treated as positions on a string-labelled Series
# (deprecated pandas behaviour) and on there being exactly ten pairings.
for association, yearly_f1 in topic_score_dfs.items():
    years = yearly_f1.index
    scores = yearly_f1.values

    print(association, '=>', np.corrcoef(years, scores)[0, 1])
    fig = px.scatter(x=years, y=scores, trendline="lowess")  # alternative trendline: "ols"

    fig.update_layout(
        xaxis_title='Year',
        yaxis_title='F1 Score',
        margin=dict(t=30),
        showlegend=False
    )
    fig.update_yaxes(range=[0, 0.91])
    display(fig)
# go.Figure(data=[go.Scatter(x=data_frame.index, y=data_frame.values)])

past_11 - pres_06 => 0.33661229458554454


past_10 - pres_05 => -0.4812740924562936


past_14 - pres_16 => -0.8623381675670492


past_20 - pres_09 => 0.3558015595901742


past_18 - pres_11 => 0.7415516920385069


past_07 - pres_12 => -0.10325308666374049


past_04 - pres_02 => -0.2503047289564409


past_00 - pres_08 => 0.3560957871618569


past_05 - pres_03 => 0.6409794402055082


past_22 - pres_07 => 0.542177613211876
