In [None]:
import pandas as pd
import numpy as np
import metrics

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
%matplotlib inline

import preprocessing

In [None]:
# Build step: compile every C++ source under src/ (C++11, all warnings, -O3) into the
# executable ./bin/sgd. `time` reports how long the compilation takes.
!time g++ -Wall -std=c++11 -O3 src/*.cpp -o ./bin/sgd

In [None]:
# Run the compiled binary with two arguments (`data/mrh` and `nbsinDDE`) and time the run.
# NOTE(review): presumably the dataset directory and a model/configuration name, and
# presumably the step that writes the CSV files read further down - confirm against src/.
!time ./bin/sgd data/mrh nbsinDDE

In [None]:
# Read the scored rows; the columns used below are prediction, target and exposure.
df = pd.read_csv("./data/results.csv")

# Total prediction and total target, each divided by total exposure.
print(
    'mean:',
    round(df['prediction'].sum() / df['exposure'].sum(), 6),
    round(df['target'].sum() / df['exposure'].sum(), 6),
)

# Fit metrics from the project's `metrics` module.
rmse = metrics.root_mean_square_error(df['target'], df['prediction'], df['exposure'])
print('rmse:', round(rmse, 6))
print('gini:', metrics.gini_emblem_fast(df['target'], df['prediction'], df['exposure']))

# The value printed as 'deviance' is the natural log of what poisson_deviance returns.
print('deviance:', np.log(metrics.poisson_deviance(df['target'], df['prediction'], df['exposure'])))

# Lift curve over 20 bands.
metrics.plot_lift_curve(df['target'], df['prediction'], df['exposure'], n_band=20)

In [None]:
# Create the project's Metadata object for ("data", "mrh") and load it.
# NOTE(review): what load() reads is not visible here - presumably the feature names,
# modalities and sizes queried later in this notebook; confirm in preprocessing.Metadata.
metadata = preprocessing.Metadata("data", "mrh")
metadata.load()

In [None]:
# n and p are used below as the (rows, columns) shape of the memory-mapped feature matrix.
n, p = metadata.size, metadata.count_features()
def load_data(file_path, dtype='int32', shape=None):
    """Memory-map a raw binary file as a numpy array.

    Args:
        file_path: path of the binary file to map.
        dtype: element type of the file contents (anything np.memmap accepts).
        shape: desired array shape; None maps the whole file as a 1-D array.

    Returns:
        The np.memmap array backed by the file. No mode is passed, so numpy's
        default ('r+', read-write on an existing file) applies.
    """
    memmap_options = {'dtype': dtype, 'shape': shape}
    return np.memmap(file_path, **memmap_options)

# Map the feature file as an (n, p) matrix of unsigned bytes, one byte per cell.
data = load_data(
    metadata.get_feature_filename(),
    dtype=np.uint8,
    shape=(n, p),
)

In [None]:
# Read the coefficient file and exponentiate every value.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0; `.values`
# returns the same 2-D ndarray and exists in both old and current pandas releases.
# Despite its name, `df_coeffs` is a plain numpy array, not a DataFrame.
# NOTE(review): presumably these are log-scale model coefficients, so exp() turns them
# into multiplicative factors - confirm against the code that writes coeffs.csv.
df_coeffs = np.exp(pd.read_csv('data/mrh/coeffs.csv').values)

def get_coeffs(feature_range):
    """Return the rows of the module-level `df_coeffs` array for one feature.

    Args:
        feature_range: sequence of integer positions for the feature's modalities.

    Returns:
        `df_coeffs` indexed at each position shifted by one.
        NOTE(review): the +1 presumably skips a leading intercept row - confirm.
    """
    shifted_rows = np.array(feature_range) + 1
    return df_coeffs[shifted_rows]

In [None]:
# Select from the feature matrix the rows listed in the results frame's `row` column,
# keeping every feature column.
test_data = data[df['row'].values, :]

In [None]:
def _coerce_modalities(modalities):
    """Return modalities as ints or floats when every label is numeric, else unchanged."""
    try:
        as_floats = [float(label) for label in modalities]
    except (TypeError, ValueError, OverflowError):
        # At least one label is not numeric: keep the labels exactly as given.
        return modalities
    # is_integer() tests each value on its own. Summing fractional parts would let
    # values such as -0.5 and 0.5 cancel out and be truncated to 0. NaN and
    # infinities are not integers, so they keep the float representation.
    if all(value.is_integer() for value in as_floats):
        return [int(value) for value in as_floats]
    return as_floats


def plot_relativities(df, feature, idx, modalities):
    """Plot exposure and relativities per modality of one feature.

    The chart shows exposure per modality as bars and, on a second axis, three lines:
    mean prediction and mean target (both divided by the overall mean target of `df`)
    and the coefficients returned by `get_coeffs` for the feature.

    Relies on the module-level names `test_data`, `metadata`, `get_coeffs` and `plt`.

    Args:
        df: results frame with `exposure`, `target` and `prediction` columns, row-aligned
            with `test_data`. It is not modified.
        feature: feature name, used for the title and to look up the coefficient range.
        idx: column of `test_data` holding this feature's encoded values.
        modalities: labels of the feature's modalities. They are assigned positionally
            to the sorted distinct encoded values, so their count must match the
            number of distinct values present in the column.

    Returns:
        None. Nothing is drawn when the feature's coefficients sum to their count
        (that is, they are all 1).

    NOTE(review): a count mismatch between `modalities` (or the coefficients) and the
    observed groups is expected to raise from pandas column assignment - confirm.
    """
    modalities = _coerce_modalities(modalities)

    # Group on a narrow copy so the caller's frame does not gain a scratch 'f' column.
    scored = df[['exposure', 'target', 'prediction']].assign(f=test_data[:, idx])

    overall_mean = df.target.mean()
    relativity = scored.groupby(['f']).agg({'exposure': 'sum', 'target': 'mean', 'prediction': 'mean'})
    relativity['target'] = relativity['target'] / overall_mean
    relativity['prediction'] = relativity['prediction'] / overall_mean
    relativity['modalities'] = modalities
    relativity['coeffs'] = get_coeffs(metadata.get_feature_range(feature))

    # Coefficients summing to their count means they are all 1: nothing worth plotting.
    if relativity['coeffs'].sum() == relativity['coeffs'].count():
        return

    relativity = relativity.sort_values('modalities')
    positions = np.arange(relativity.prediction.size)

    fig, ax1 = plt.subplots(figsize=(10, 8))

    # Exposure bars use the lower third of the axis so the lines stay readable above them.
    ax1.bar(positions, relativity.exposure, color='#fffca0', edgecolor='grey')
    ax1.set_ylim(ymax=relativity.exposure.max() * 3)
    ax1.set_xticks(positions)
    ax1.set_xticklabels(labels=relativity.modalities)
    ax1.set_ylabel('exposure')

    ax2 = ax1.twinx()
    ax2.set_title(feature)
    ax2.plot(positions, relativity.prediction, color="#0f600e", marker=".", label='prediction')
    ax2.plot(positions, relativity.target, color="#c242f4", marker=".", label='target')
    ax2.plot(positions, relativity.coeffs, color="#93ff9e", marker="^", label='coeffs')
    # Reference line at relativity 1 (group mean equal to the overall mean).
    ax2.axhline(y=1, color='black', linewidth=1, linestyle="dotted")
    ax2.set_ylim(ymin=0)
    ax2.set_ylabel('relativity')
    ax2.legend(loc='upper right')

    plt.show()
    # Release the figure; this function is called once per feature in a loop.
    plt.close(fig)
    
# Draw one relativity chart per feature. A failure on one feature is reported and the
# loop moves on to the next.
for f in metadata.features:
    try:
        plot_relativities(df, f, metadata.get_feature_index(f), metadata.get_modalities(f))
    except Exception as err:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt still
        # stops the loop, and print the cause so the failure can be diagnosed.
        print('Error ploting relativity chart for ', f, ':', repr(err))