In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri
import re

In [None]:
# Switch on rpy2's numpy and pandas converters so that the arrays / frames passed
# to R through the %R magics further down are converted without explicit calls.
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

In [None]:
# (Re)load the rpy2 IPython extension; it provides the %R / %%R magics used below.
%reload_ext rpy2.ipython

In [None]:
%%R
# Move R's working directory one level up, then load the project's R helper
# scripts (their relative paths are resolved against that new directory).
setwd("..")
source("dssFunctionLibrary.R")
source("dssDataSetSpecificHacks.R")
source("dssPerformanceEvaluation.R")
source("mimicUsefulFunction.R")
# Attach third-party packages. require() only warns (returns FALSE) when a
# package is missing, whereas library() stops with an error.
require(gridExtra)
require(ROCR)
require(ISLR)
require(caret)
require(magrittr)
library(foreach)
library(doMC)
# Register the doMC parallel backend for foreach with 11 workers.
registerDoMC(11)

In [None]:
# Read passiveTrend_30s.csv via a relative path.
# NOTE(review): the %%R cell above calls setwd(".."); because R runs inside this
# process, that presumably also changes the directory this path resolves
# against -- confirm where the CSV is expected to live.
source_csv = os.path.join("", "passiveTrend_30s.csv")
dataset = pd.read_csv(source_csv)

# Rotate the column order by one position: the last column moves to the front.
cols = dataset.columns.tolist()
cols = cols[-1:] + cols[:-1]
dataset = dataset[cols]

# Re-type the `tsp` column as second-resolution datetimes.
tsp_values = dataset['tsp'].values
dataset['tsp'] = tsp_values.astype('datetime64[s]')
dataset.head()

In [None]:
# Row counts per id; the ids that appear here are the ones kept below.
ids_all = dataset['id'].value_counts()
ids = ids_all.index.values
has_counted_id = dataset['id'].isin(ids)
dataset = dataset[has_counted_id]

# Rows alternate: even positions (0, 2, 4, ...) go to `dataset_before`,
# odd positions (1, 3, 5, ...) go to `dataset_after`.
dataset_before = dataset.iloc[0::2]
dataset_after = dataset.iloc[1::2]

In [None]:
# Candidate values for lam0; widen or narrow this sweep to find the right
# regularization penalty.
LO = [1e7, 1e6, 1e5, 1e4, 1e3, 1e2, 1e1, 1, 1e-1, 1e-2]
# Candidate values for lambda-S, swept for every choice of lam0.
LS = [1e7, 1e6, 1e5, 1e4, 1e3, 1e2, 1e1, 1, 1e-1, 1e-2]

# Flatten the LO x LS grid into two parallel lists: position k holds the
# (lam0, lambda-S) pair lo[k], ls[k], with lambda-S varying fastest.
lo = [lam0 for lam0 in LO for _ in LS]
ls = [lam_s for _ in LO for lam_s in LS]

In [None]:
def train_model_in_R():
    """Run the R routine ``dssTrain.Linear`` over the whole hyperparameter grid.

    Takes no arguments. The first ``%R -i`` line hands ``XTrainScaled``,
    ``XTrainTimeVector``, ``lo``, ``ls``, ``sm_on_idx`` and ``sm_off_idx`` to R
    by name, so all of them must already be defined when this is called.

    Returns:
        ``res``: the object exported from R by ``%R -o res`` (the value of the
        ``dssTrain.Linear`` call).
        NOTE(review): ``res`` is never assigned inside this function, so the
        ``return`` presumably relies on ``-o`` publishing it to the notebook
        namespace -- confirm.
    """
    # Copy the Python-side inputs into R under the same names.
    %R -i XTrainScaled,XTrainTimeVector,lo,ls,sm_on_idx,sm_off_idx
    # Flatten the two penalty lists into plain numeric vectors.
    %R lo <- as.numeric(unlist(lo))
    %R ls <- as.numeric(unlist(ls))
    # Ordering pairs: row k of the first half of XTrainScaled is paired with
    # row k of the second half (1-based R indices).
    %R OrderPairs <- data.frame(onIdx = seq(1, nrow(XTrainScaled)/2, by = 1),\
                                offIdx = seq(nrow(XTrainScaled)/2+1, nrow(XTrainScaled), by = 1))
    # Smoothness pairs come straight from the index vectors prepared in Python.
    %R SmoothnessPairs <- data.frame(onIdx = sm_on_idx, offIdx = sm_off_idx)
    # Train and export the result back to Python as `res`.
    %R -o res res <- dssTrain.Linear(XTrainScaled, OrderPairs, SmoothnessPairs,\
                                     XTrainTimeVector, lo, ls,"dssl_train", doParallel = 1)
    return res

def get_accuracy(scores):
    """Return the share of pairs whose first-half score is >= its second-half score.

    ``scores`` is cut at ``len(scores) / 2`` (rounded down); element k of the
    first half is compared with element k of the second half.
    """
    half = len(scores) // 2
    first_half, second_half = scores[:half], scores[half:]
    diff = first_half - second_half
    # Count the non-negative differences and divide by the number of pairs.
    return np.sum(diff >= 0) * 1.0 / len(diff)

def evaluate(pairs, weights):
    """Score ``pairs`` with ``weights`` and report the pairwise accuracy.

    Returns a dict with ``'scores'`` (``pairs.dot(weights)``) and ``'accuracy'``
    (``get_accuracy`` applied to those scores).
    """
    projected = pairs.dot(weights)
    return dict(scores=projected, accuracy=get_accuracy(projected))

def weight_analysis(models, names):
    """Average the weight vectors of several models and tabulate them per feature.

    Args:
        models: non-empty sequence of dicts, each holding a ``'weights'`` entry
            (array-like, one value per feature, same shape in every model).
        names: labels used as the index of the result, one per weight.

    Returns:
        DataFrame indexed by ``names`` with two columns: ``weight`` (the
        element-wise mean across models) and ``scaled`` (``weight`` divided by
        the largest absolute weight).

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("weight_analysis needs at least one model")
    # Stack into a fresh array before averaging: accumulating with += on
    # np.asarray(m['weights']) would overwrite the first model's stored
    # weights, because asarray does not copy an existing ndarray.
    stacked = np.stack([np.asarray(m['weights'], dtype=float) for m in models])
    weights = stacked.mean(axis=0)
    df = pd.DataFrame(weights, index=names, columns=['weight'])
    df["scaled"] = df.weight / df.weight.abs().max()
    return df

def get_fold_test_scores(test_pt, test_scores):
    """Attach a fold's test scores to the (id, tsp) keys of its test rows.

    Args:
        test_pt: ids that make up the fold's test set.
        test_scores: mapping as returned by ``evaluate``; every entry whose key
            starts with ``'scores'`` becomes a column of the result.

    Returns:
        DataFrame with ``id`` and ``tsp`` for the matching rows of the global
        ``dataset_before`` followed by those of ``dataset_after``, plus one
        column per selected entry of ``test_scores``. Scores are assigned as
        given, so they must be in that same row order.
    """
    key_cols = ['id', 'tsp']
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed; row order
    # and index labels are the same as with append.
    df = pd.concat([
        dataset_before.loc[dataset_before.id.isin(test_pt), key_cols],
        dataset_after.loc[dataset_after.id.isin(test_pt), key_cols],
    ])
    for item in test_scores:
        if item.startswith('scores'):
            df[item] = test_scores[item]
    return df

In [None]:
# Distinct ids in `dataset_before` (np.unique returns them sorted); these are
# the units the cross-validation below splits on.
pts = np.unique(dataset_before['id'])
pts

In [None]:
# 3-fold cross-validation over ids. For each fold the loop below trains every
# (lo, ls) grid point and records weights and accuracies together.
# random_state pins the shuffle so that "Restart & Run All" reproduces the same
# folds; without it every run produced a different split.
kf = KFold(n_splits=3, shuffle=True, random_state=0)

# One entry per fold, appended in lockstep by the loop below.
fold_accuracy = []
fold_weights = []
fold_train_pt = []
fold_test_pt = []
fold_scaler = []
fold_res_train = []
fold_res_test = []

# Key columns that are dropped before the features are scaled.
index_cols = ['id', 'tsp']
for train, test in kf.split(pts):

    train_pt = pts[train]
    test_pt = pts[test]
    
    # set aside training set
    dataset_train = dataset_before[dataset_before.id.isin(train_pt)].\
        append(dataset_after[dataset_after.id.isin(train_pt)])
    XTrain = dataset_train.drop(index_cols, axis=1)
    XTrainTsp = pd.to_datetime(dataset_train.tsp, format='%Y-%m-%d %H:%M:%S')
    XTrainTimeVector = (XTrainTsp - np.datetime64('1970-01-01 00:00:00')) / np.timedelta64(1, 's')
    XTrainTimeVector = XTrainTimeVector.astype(int)
    scaler = RobustScaler()
    XTrainScaled = scaler.fit_transform(XTrain)
    train_ids = dataset_train[['id','tsp']].sort_values(by=['id','tsp']).id.values
    train_ids_idx = dataset_train[['id','tsp']].sort_values(by=['id','tsp']).index.values
    sequential_idx_1 = train_ids[:-1] == train_ids[1:]
    sequential_idx_2 = np.insert(sequential_idx_1, 0, False)
    sequential_idx_1 = np.append(sequential_idx_1, False)
    sm_on_idx = []
    sm_off_idx = []

    for i, idx_1 in enumerate(train_ids_idx[sequential_idx_1]):
        idx_2 = train_ids_idx[sequential_idx_2][i]
        row_i_1 = dataset_train.index.get_loc(idx_1)
        row_i_2 = dataset_train.index.get_loc(idx_2)
        sm_on_idx.append(row_i_2)
        sm_off_idx.append(row_i_1)

    sm_on_idx = np.add(sm_on_idx,1)
    sm_off_idx = np.add(sm_off_idx,1)
    
    ### Call Learning Procedure ###
    res = train_model_in_R()
    print("split:", train_pt, test_pt)
    
    # test dataset
    dataset_test = dataset_before[dataset_before.id.isin(test_pt)]\
        .append(dataset_after[dataset_after.id.isin(test_pt)])
    XTest = dataset_test.drop(index_cols, axis=1)
    XTestScaled = scaler.transform(XTest)        

    acc = []
    weights = []
    res_train_list = []
    res_test_list = []
    for i,la in enumerate(lo):
        %R -i i -o w w = res[[i+1]]$estimate
        res_train = evaluate(XTrainScaled, w)
        res_test = evaluate(XTestScaled, w)
        acc.append([(i+1), lo[i], ls[i], res_train['accuracy'], res_test['accuracy']])
        weights.append(w)
        res_train_list.append(res_train)
        res_test_list.append(res_test)
    fold_accuracy.append(acc)
    fold_weights.append(weights)
    fold_train_pt.append(train_pt)
    fold_test_pt.append(test_pt)
    fold_scaler.append(scaler)
    fold_res_train.append(res_train_list)
    fold_res_test.append(res_test_list)

In [None]:
# For every fold, pick the grid point with the highest training accuracy
# (first one on ties) and record how it does on that fold's test set.
# NOTE(review): the selection uses the training accuracy column (index 3);
# there is no separate validation split in this notebook.
fold_models = []
best_acc = []
for i, acc in enumerate(fold_accuracy):
    # Each row of `acc`: [grid position, lo, ls, train accuracy, test accuracy]
    train_acc = [row[3] for row in acc]
    test_acc = [row[4] for row in acc]
    best_wi = train_acc.index(max(train_acc))
    model = {
        'weights': fold_weights[i][best_wi],
        'scaler': fold_scaler[i],
        'wi': best_wi,
    }
    fold_models.append(model)
    best_acc.append(test_acc[best_wi])
    print(best_wi, train_acc[best_wi], test_acc[best_wi], max(test_acc))
# Mean test accuracy of the selected models across folds.
np.mean(best_acc)

In [None]:
# Gather the test-set scores of each fold's selected model into one frame.
# The per-fold frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0.
fold_score_frames = []
for i, model in enumerate(fold_models):
    wi = model['wi']
    print(i, wi)
    fold_score_frames.append(
        get_fold_test_scores(fold_test_pt[i], fold_res_test[i][wi])
    )
scores = pd.concat(fold_score_frames)

scores['tsp'] = pd.to_datetime(scores['tsp'])

In [None]:
# Average the selected per-fold weights. `XTrain` is whatever the last
# iteration of the cross-validation loop left behind; only its column names
# are used here.
df = weight_analysis(fold_models, XTrain.columns.values)
df["abs_w"] = df.weight.abs()

# Min-max rescale |weight| onto 0..100. The denominator needs its own
# parentheses: `100*(x - min)/max - min` divided by max alone and then
# subtracted min from the result.
# NOTE: if every |weight| is identical the denominator is 0 and the column
# becomes NaN.
abs_w_min = df.abs_w.min()
abs_w_max = df.abs_w.max()
df["abs_w_scale"] = 100 * (df.abs_w - abs_w_min) / (abs_w_max - abs_w_min)
df.abs_w_scale.sort_values(ascending=False).to_csv('sorted_weight_060118.csv',index=True)

# Only the last expression of a cell is rendered; the two expressions before
# it are evaluated but not shown.
df.abs_w_scale[df.abs_w_scale > 30].sort_values(ascending=False)
df.abs_w_scale[df.abs_w_scale < 1].count()
df.abs_w_scale[df.abs_w_scale < 1].sort_values(ascending=False)

In [None]:
# For every id keep the first half of its score rows.
# Built as a list and concatenated once: DataFrame.append was removed in
# pandas 2.0 and growing a frame row-block by row-block is quadratic.
# The loop variable is `pt_id` so the builtin `id` is no longer shadowed.
before_parts = []
for pt_id in scores.id.unique():
    pt_scores = scores[scores.id == pt_id]
    before_parts.append(pt_scores.iloc[:len(pt_scores) // 2])
# With no ids at all, fall back to the empty frame the loop used to start from.
scores_before = pd.concat(before_parts) if before_parts else pd.DataFrame()

scores.to_csv('pt_scores_060118.csv', index=False)
scores_before.to_csv('pt_scores_before_060118.csv', index=False)

In [None]:
# Work on an independent copy so the rescaling below leaves `scores` untouched.
scaled_scores = scores.copy(deep=True)
scaled_scores.head()

In [None]:
# Map raw scores onto a 0..100 scale: RobustScaler centres on the median and
# divides by the 15th-85th percentile range; 50 * value + 50 then puts the
# median at 50.
scaler100 = RobustScaler(quantile_range=(15.0, 85.0))
raw_scores = scaled_scores['scores'].values.reshape(-1, 1)
scaler100.fit(raw_scores)
scaled_scores['scores_scaled'] = 50 * scaler100.transform(raw_scores) + 50

# Cap to [0, 100]. Uses .loc rather than `frame.col[mask] = value`: the chained
# form triggers SettingWithCopyWarning and does not write through at all when
# pandas Copy-on-Write is active.
scaled_scores.loc[scaled_scores['scores_scaled'] > 100, 'scores_scaled'] = 100
scaled_scores.loc[scaled_scores['scores_scaled'] < 0, 'scores_scaled'] = 0

# Summary of the capped values before rounding: mean, std, 25th and 75th percentile.
print(
    scaled_scores['scores_scaled'].mean(),
    scaled_scores['scores_scaled'].std(),
    scaled_scores['scores_scaled'].quantile(.25),
    scaled_scores['scores_scaled'].quantile(.75),
)
scaled_scores['scores_scaled'] = scaled_scores['scores_scaled'].round()
scaled_scores.head()

In [None]:
# Left-join the scores onto the full dataset; with no `on=` given, pandas joins
# on every column name the two frames have in common. Dataset rows without a
# score are kept, with missing values in the score columns.
scaled_scores = dataset.merge(scaled_scores, how='left')
scaled_scores.head()

In [None]:
# Write the merged dataset + scores table to CSV without the index column.
# NOTE(review): this file is suffixed 060318 while the other outputs of this
# notebook use 060118 -- confirm the difference is intended.
scaled_scores.to_csv('scores_scaled_060318.csv', index=False)