Sklearn Training (& Testing)
===



In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
from pathlib import Path
# IPython shell capture: run git and collect the lines it prints.
git_root_dir = !git rev-parse --show-toplevel
# The first captured line is the repository's top-level directory; wrap it in a Path.
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
# Project configuration object; its metadata_filepath and feature_db_filepath
# attributes are read by the cells below.
config = cbrec.genconfig.Config()
# Uncomment to append "_old" to the two paths
# (presumably to read an earlier copy of the data -- TODO confirm).
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.reccontext
import cbrec.evaluation

In [None]:
# Load the metadata entries from the configured metadata file; the number of
# entries is shown as the cell output.
md_list = cbrec.utils.get_metadata_list(config.metadata_filepath)
len(md_list)

In [None]:
# Build a dataframe from the (still unfiltered) metadata list; later cells read
# its `type` column.
df = cbrec.utils.create_metadata_dataframe(md_list)
len(df)

In [None]:
# Tabulate how many metadata entries there are of each type.
type_counts = df.type.value_counts()
pd.DataFrame(type_counts.rename("Total metadata count by type"))

In [None]:
# Keep only the metadata entries whose type is not 'ineligible'.
md_list = [entry for entry in md_list if entry['type'] != 'ineligible']
len(md_list)

In [None]:
def get_triples():
    """Read every triple from the feature database as pointwise examples.

    Each row streamed by ``cbrec.featuredb.stream_triples`` contributes two
    examples, in this order:

    * the target candidate, labelled ``1``
    * the alt candidate, labelled ``0``

    An example's vector is the concatenation of the candidate's own feature
    array, the source feature array minus the candidate's, and the row's
    ``source_<candidate>_feature_arr``.

    Uses the module-level ``config`` to locate the database, which is closed
    before returning even if streaming raises.

    Returns:
        ``(arrs, ys)``: a list of feature arrays and the parallel list of
        integer labels.
    """
    def pointwise_features(row, candidate_key, pair_key):
        # candidate features | source - candidate | source/candidate features
        candidate = row[candidate_key]
        return np.concatenate([
            candidate,
            row['source_feature_arr'] - candidate,
            row[pair_key],
        ])

    features, labels = [], []
    db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
    try:
        for row in cbrec.featuredb.stream_triples(db):
            positive = pointwise_features(row, 'target_feature_arr', 'source_target_feature_arr')
            negative = pointwise_features(row, 'alt_feature_arr', 'source_alt_feature_arr')
            features.extend([positive, negative])
            labels.extend([1, 0])
    finally:
        db.close()
    return features, labels


feature_arrs, ys = get_triples()

In [None]:
# Labels as an array, and the per-example vectors stacked row-wise into a
# 2-D design matrix; the shapes are shown as the cell output.
y_true = np.asarray(ys)
X = np.vstack(feature_arrs)
X.shape, y_true.shape

In [None]:
# Standardize the features, then fit a logistic-regression model with SGD.
#
# SGDClassifier's logistic loss was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old spelling was removed in 1.3, so pick the name
# the installed version accepts.
import re

sk_major, sk_minor = map(int, re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
logistic_loss = 'log_loss' if (sk_major, sk_minor) >= (1, 1) else 'log'

clf = Pipeline([
    ('scaler', sklearn.preprocessing.StandardScaler()),
    ('clf', sklearn.linear_model.SGDClassifier(loss=logistic_loss)),
])
clf.fit(X, y_true)

In [None]:
# Hard class predictions for the same matrix the pipeline was fit on
# (displayed as the cell output).
clf.predict(X)

In [None]:
# Distribution of the predicted class-1 probability over all rows of X.
preds = clf.predict_proba(X)[:, 1]
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(preds, bins=20)
plt.show()

In [None]:
# Predicted class-1 probability, split by true label and overlaid on shared bins.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
preds = clf.predict_proba(X)[:, 1]
# 21 edges give 20 equal-width bins (0.05 each) on [0, 1]; 20 edges would
# give only 19 bins with awkward boundaries.
bins = np.linspace(0, 1, 21)
ax.hist(preds[y_true == 1], bins=bins, alpha=0.5, label='y_true == 1')
ax.hist(preds[y_true == 0], bins=bins, alpha=0.5, label='y_true == 0')
ax.set_xlabel('predicted probability of class 1')
ax.set_ylabel('count')
# Without a legend the two translucent histograms cannot be told apart.
ax.legend()
plt.show()

In [None]:
# Accuracy: fraction of rows whose hard prediction equals the label.
# X is the matrix the pipeline was fit on, so this is training accuracy.
np.mean(clf.predict(X) == y_true)

In [None]:
# Print the fitted linear coefficients in three column groups:
# columns 0-11, 12-23, and 24 onward.
coef = clf['clf'].coef_
for cols in (slice(0, 12), slice(12, 24), slice(24, None)):
    print(coef[:, cols])

In [None]:
# Select the test entries from the filtered metadata list; the number selected
# is shown as the cell output.
test_md_list = cbrec.utils.get_test_metadata(md_list)
len(test_md_list)

In [None]:
# Rows of the metadata dataframe whose type is 'test'.
is_test = df.type == 'test'
tdf = df[is_test]
len(tdf)

In [None]:
def get_test_contexts(config, test_md_list, clf, model_name="PointwiseLogreg"):
    """Score every test context with ``clf`` and record the metrics in place.

    Despite the name, nothing is returned. For each entry of ``test_md_list``
    the test context is fetched from the feature database, wrapped in a
    ``RecContext``, scored with ``SklearnModelScorer.score_proba()``, and the
    resulting metrics are stored at ``md['baseline_metrics'][model_name]``.

    Args:
        config: Project config; ``config.feature_db_filepath`` locates the
            feature database.
        test_md_list: Metadata entries to score. Each must provide
            ``'metadata_id'`` and an existing ``'baseline_metrics'`` mapping;
            entries are mutated.
        clf: Fitted model handed to ``SklearnModelScorer``.
        model_name: Name passed to the scorer and used as the key under
            ``'baseline_metrics'``. One parameter keeps the two uses in sync.

    Raises:
        KeyError: If an entry lacks ``'metadata_id'`` or
            ``'baseline_metrics'`` (assuming entries are plain dicts).
    """
    db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
    try:
        for md in test_md_list:
            test_context = cbrec.featuredb.get_test_context_by_metadata_id(db, md['metadata_id'], config)
            rc = cbrec.reccontext.RecContext.create_from_test_context(config, md, test_context)
            scorer = cbrec.evaluation.SklearnModelScorer(config, rc, clf, model_name)
            md['baseline_metrics'][model_name] = scorer.score_proba()
    finally:
        # Close the database even if scoring fails part-way through.
        db.close()


get_test_contexts(config, test_md_list, clf)

In [None]:
# Build one dataframe of per-context metrics for every model listed in the
# first test entry's 'baseline_metrics'.
models = test_md_list[0]['baseline_metrics'].keys()
print(models)

model_df_dict = {}
for model in tqdm(models):
    rows = []
    for md in test_md_list:
        row = md['baseline_metrics'][model]
        # Tags the stored metrics dict itself (in place) with its metadata id.
        row['metadata_id'] = md['metadata_id']
        rows.append(row)

    mdf = pd.DataFrame(rows)
    mdf['reciprocal_rank'] = 1 / mdf['target_rank']
    model_df_dict[model] = mdf
    print(model, len(mdf))

len(model_df_dict)

In [None]:
# One summary row per model: the mean of each metric column, preceded by the
# model name and the share of test contexts whose target ranked 5 or better.
# Note that '% <= rank 5' holds a fraction in [0, 1], not a percentage.
metric_columns = ['target_raw_score', 'target_rank', 'reciprocal_rank', 'ndcg_1', 'ndcg_5', 'ndcg_10', 'ndcg_50']
column_renames = {'target_rank': 'mean_rank', 'reciprocal_rank': 'mrr', 'target_raw_score': 'mean_raw_score'}

scores = []
for model in models:
    mdf = model_df_dict[model][metric_columns]
    top5_fraction = np.sum(mdf.target_rank <= 5) / len(mdf)
    name_part = pd.Series([model], index=['model'])
    top5_part = pd.Series([top5_fraction], index=['% <= rank 5'])
    # Numeric parts are joined first, then the model name is prepended.
    means = pd.concat([name_part, pd.concat([top5_part, mdf.mean()])])
    scores.append(means)

score_df = pd.DataFrame(scores).rename(columns=column_renames).sort_values(by='mean_rank')
score_df