In [315]:
import os

# Run from the project root — presumably so that the `src` package imported
# below resolves (TODO confirm). Set the DL_PROJECT_ROOT environment variable
# to use a checkout in a different location; the default is the original path.
PROJECT_ROOT = os.environ.get("DL_PROJECT_ROOT",
                              "/Users/pascalweiss/dev/python/dl_project_gl_pw/")
os.chdir(PROJECT_ROOT)
from src.tools.mongo import collection_to_df
from src.tools.config import Config
import pandas as pd
from pymongo import MongoClient
import numpy as np
import math
from plotly.offline import init_notebook_mode, iplot
from plotly import tools, graph_objs as go



In [233]:
# Connection settings. They are used by every call below, so changing them
# here is enough to point the notebook at another database.
db = 'session_db'

port = Config.logging.port
host = 'localhost'

# NOTE(review): `client` is not used by any cell visible here; it is kept
# because other cells may rely on it — confirm before removing.
client = MongoClient(host, port)

# One row per logged training step (fold / epoch metrics).
df_log = collection_to_df(db, 'log', host, port)
# One row per training session with its flattened argument tree.
# NOTE(review): the '$gt' filter compares `_id` against a plain string;
# presumably collection_to_df handles the ObjectId conversion — confirm.
df_args = collection_to_df(db, 'session_args', host, port,
                           filter={'_id': {'$gt': '5a9067649233a2051495fea9'}},
                           flatten=True)

# Attach the session arguments to every log row of that session.
df_merge = pd.merge(df_args, df_log, left_on='_id', right_on='_session_id', how='inner')

# Columns kept for the analysis, mapped to their short names. Selecting and
# renaming through one mapping keeps each old/new pair together instead of
# relying on two parallel lists staying in the same order.
column_names = {
    '_session_id': '_session_id',
    'data_factory.args.pp_params.vectorization.max_features': 'pp_max_features',
    'data_factory.args.pp_params.vectorization.min_df': 'pp_min_df',
    'data_factory.args.pp_params.vectorization.ngram_range': 'pp_ngram_range',
    'data_factory.args.pp_params.vectorization.tfidf': 'pp_tfidf',
    'model.args.post_layer_size': 'post_layer_size',
    'model.args.reply_layer_size': 'reply_layer_size',
    'fold': 'fold',
    'epoch': 'epoch',
    'val_acc': 'val_acc',
    'val_loss': 'val_loss',
    'train_acc': 'train_acc',
    'train_loss': 'train_loss',
}
df_base = df_merge[list(column_names)].rename(columns=column_names)

feature_cols = ['pp_max_features', 'pp_min_df', 'pp_ngram_range', 'pp_tfidf',
                'post_layer_size', 'reply_layer_size']

# Work on a copy in which every hyperparameter (and the session id) is a
# string; `df_base` keeps the original values.
# NOTE(review): presumably needed so that list-valued settings such as
# ngram_range can be passed to unique()/groupby — confirm.
df = df_base.copy(deep=True)
for col in feature_cols + ['_session_id']:
    df[col] = df[col].map(str)

# Best validation accuracy first.
df = df.sort_values('val_acc', ascending=False)
df.head(5)

In [283]:
def plot_subplots(df, session_id, fold, col, n_cols=1):
    """Plot one curve per session, each in its own subplot of a grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``_session_id``, ``fold``, ``epoch`` and
        the column named by ``col``.
    session_id : sequence
        Session ids to plot. One subplot is used per entry, filled row by
        row from left to right.
    fold
        Value compared against the ``fold`` column to select rows.
    col : str
        Column plotted on the y axis (cast to float) against ``epoch``.
    n_cols : int, optional
        Number of subplot columns in the grid (default 1).
    """
    init_notebook_mode(connected=True)

    # Keep at least one row so an empty `session_id` still gives a valid,
    # empty grid instead of a request for zero rows.
    rows = max(1, math.ceil(len(session_id) / n_cols))
    fig = tools.make_subplots(rows=rows, cols=n_cols)

    for i, s_id in enumerate(session_id):
        # Row-major placement derived from the running index. The previous
        # column formula used len(session_id) instead of `i`, which put
        # every trace into the same column.
        r, c = divmod(i, n_cols)
        df_sub = df[(df['_session_id'] == s_id) & (df['fold'] == fold)]
        df_sub = df_sub.sort_values('epoch')
        trace = go.Scatter(
            x=df_sub['epoch'],
            y=df_sub[col].astype(float),
            name=s_id)  # label by session, as plot_single does
        # Grid positions are 1-based.
        fig.append_trace(trace, row=r + 1, col=c + 1)
    iplot(fig)


def plot_single(df, session_id, fold, col):
    """Overlay the curves of several sessions in a single plot.

    For every id in ``session_id`` the rows of ``df`` belonging to that
    session and to ``fold`` are ordered by ``epoch`` and drawn as one
    trace of ``col`` (cast to float) over ``epoch``, labelled with the
    session id.
    """
    init_notebook_mode(connected=True)
    figure = tools.make_subplots(rows=1, cols=1)

    # The fold condition is the same for every session, so build it once.
    in_fold = df['fold'] == fold
    for sid in session_id:
        history = df[(df['_session_id'] == sid) & in_fold].sort_values('epoch')
        curve = go.Scatter(x=history['epoch'],
                           y=history[col].astype(float),
                           name=sid)
        figure.append_trace(curve, row=1, col=1)

    iplot(figure)

# Number of training sessions

In [130]:
# Number of distinct session ids present in the merged data.
len(df_base['_session_id'].unique())

# Varied Hyperparams
The hyperparameters that were varied across these training sessions, together with the distinct values each one took.

In [131]:
# Distinct values observed for each hyperparameter column of `df`.
varied_hyperparams = {
    name: values.unique()
    for name, values in df[feature_cols].items()
}
varied_hyperparams

# random

In [316]:
# Every session id occurring in `df`, in order of first appearance.
session_ids = df['_session_id'].unique().tolist()

### Fold 1

In [318]:
# Validation accuracy per epoch for all sessions, fold 1.
plot_single(df=df, session_id=session_ids, fold=np.int64(1), col='val_acc')

### Fold 2

In [317]:
# Validation accuracy per epoch for all sessions, fold 2.
plot_single(df=df, session_id=session_ids, fold=np.int64(2), col='val_acc')

# By highest val_acc

In [291]:
# Highest val_acc recorded for each session (across all of its rows),
# reduced to the eight sessions with the largest maximum.
# NOTE(review): the plotting helpers cast val_acc to float before use;
# if the column is not numeric here the ordering may be off — confirm dtype.
winners = (
    df.groupby('_session_id')['val_acc']
    .max()
    .sort_values(ascending=False)
    .head(8)
)
winners

In [301]:
# Validation accuracy per epoch for the selected winner sessions, fold 2.
plot_single(df=df, session_id=winners.index.tolist(), fold=np.int64(2), col='val_acc')

In [326]:
# Show every logged row of one specific session.
df.loc[df['_session_id'] == '5a915c659233a29441341616']