In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
import sys
import pickle
sys.path.append('/Users/henriquelopes/Projects/cartolaAPI')

In [3]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from data_science_utils.result_analysis.evaluation_metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
TARGET = 'target'

BASIC_FEATURES = ['Pontos', 'PontosMedia', 'Preco', 'PrecoVariacao', 
                  'FS', 'PE', 'A', 'FT', 'FD', 'FF', 'G', 'I', 'PP', 
                  'RB', 'FC', 'GC', 'CA', 'CV', 'SG', 'DD', 'DP', 'GS',
                 'home_score', 'away_score', 'result', 'game_points', 'Home']

CATEGORICAL_FEATURES = ['mei', 'missing', 'ata', 'lat', 'zag', 'gol',
                        'GRE', 'SAO', 'FLA', 'COR', 'PAL', 'CFC', 'CRU',
                        'CAP', 'SAN', 'SPT', 'FLU', 'CHA', 'CAM', 'INT',
                        'FIG', 'VIT', 'BOT', 'PON', 'GOI', 'BAH', 'VAS',
                        'AVA', 'JEC', 'SCZ', 'CRI', 'AME', 'ATL-GO', 'True', 'False']

PLAYER_FEATURES = ['FS_avg_last_20', 'PE_avg_last_20', 'A_avg_last_20', 'FT_avg_last_20',
                   'FD_avg_last_20', 'FF_avg_last_20', 'G_avg_last_20', 'I_avg_last_20',
                   'PP_avg_last_20', 'RB_avg_last_20', 'FC_avg_last_20', 'GC_avg_last_20',
                   'CA_avg_last_20', 'CV_avg_last_20', 'SG_avg_last_20', 'DD_avg_last_20',
                   'DP_avg_last_20', 'GS_avg_last_20', 'Pontos_avg_last_20', 'Preco_avg_last_20',
                   'game_points_avg_last_20', 'FS_avg_last_10', 'PE_avg_last_10', 'A_avg_last_10',
                   'FT_avg_last_10', 'FD_avg_last_10', 'FF_avg_last_10', 'G_avg_last_10', 'I_avg_last_10',
                   'PP_avg_last_10', 'RB_avg_last_10', 'FC_avg_last_10', 'GC_avg_last_10', 'CA_avg_last_10',
                   'CV_avg_last_10', 'SG_avg_last_10', 'DD_avg_last_10', 'DP_avg_last_10', 'GS_avg_last_10',
                   'Pontos_avg_last_10', 'Preco_avg_last_10', 'game_points_avg_last_10', 'FS_avg_last_5', 
                   'PE_avg_last_5', 'A_avg_last_5', 'FT_avg_last_5', 'FD_avg_last_5', 'FF_avg_last_5', 
                   'G_avg_last_5', 'I_avg_last_5', 'PP_avg_last_5', 'RB_avg_last_5', 'FC_avg_last_5',
                   'GC_avg_last_5', 'CA_avg_last_5', 'CV_avg_last_5', 'SG_avg_last_5', 'DD_avg_last_5',
                   'DP_avg_last_5', 'GS_avg_last_5', 'Pontos_avg_last_5', 'Preco_avg_last_5', 
                   'game_points_avg_last_5', 'FS_avg_home_last_20', 'PE_avg_home_last_20', 'A_avg_home_last_20',
                   'FT_avg_home_last_20', 'FD_avg_home_last_20', 'FF_avg_home_last_20', 'G_avg_home_last_20',
                   'I_avg_home_last_20', 'PP_avg_home_last_20', 'RB_avg_home_last_20', 'FC_avg_home_last_20',
                   'GC_avg_home_last_20', 'CA_avg_home_last_20', 'CV_avg_home_last_20', 'SG_avg_home_last_20',
                   'DD_avg_home_last_20', 'DP_avg_home_last_20', 'GS_avg_home_last_20', 'Pontos_avg_home_last_20',
                   'Preco_avg_home_last_20', 'game_points_avg_home_last_20', 'FS_avg_home_last_10', 
                   'PE_avg_home_last_10', 'A_avg_home_last_10', 'FT_avg_home_last_10', 'FD_avg_home_last_10',
                   'FF_avg_home_last_10', 'G_avg_home_last_10', 'I_avg_home_last_10', 'PP_avg_home_last_10', 
                   'RB_avg_home_last_10', 'FC_avg_home_last_10', 'GC_avg_home_last_10', 'CA_avg_home_last_10',
                   'CV_avg_home_last_10', 'SG_avg_home_last_10', 'DD_avg_home_last_10', 'DP_avg_home_last_10',
                   'GS_avg_home_last_10', 'Pontos_avg_home_last_10', 'Preco_avg_home_last_10', 
                   'game_points_avg_home_last_10', 'FS_avg_home_last_5', 'PE_avg_home_last_5',
                   'A_avg_home_last_5', 'FT_avg_home_last_5', 'FD_avg_home_last_5', 'FF_avg_home_last_5',
                   'G_avg_home_last_5', 'I_avg_home_last_5', 'PP_avg_home_last_5', 'RB_avg_home_last_5', 
                   'FC_avg_home_last_5', 'GC_avg_home_last_5', 'CA_avg_home_last_5', 'CV_avg_home_last_5', 
                   'SG_avg_home_last_5', 'DD_avg_home_last_5', 'DP_avg_home_last_5', 'GS_avg_home_last_5',
                   'Pontos_avg_home_last_5', 'Preco_avg_home_last_5', 'game_points_avg_home_last_5',]

TEAM_FEATURES = ['FS_avg_team_home_last_20', 'PE_avg_team_home_last_20', 'A_avg_team_home_last_20',
                 'FT_avg_team_home_last_20', 'FD_avg_team_home_last_20', 'FF_avg_team_home_last_20',
                 'G_avg_team_home_last_20', 'I_avg_team_home_last_20', 'PP_avg_team_home_last_20',
                 'RB_avg_team_home_last_20', 'FC_avg_team_home_last_20', 'GC_avg_team_home_last_20',
                 'CA_avg_team_home_last_20', 'CV_avg_team_home_last_20', 'SG_avg_team_home_last_20',
                 'DD_avg_team_home_last_20', 'DP_avg_team_home_last_20', 'GS_avg_team_home_last_20',
                 'Pontos_avg_team_home_last_20', 'Preco_avg_team_home_last_20', 'game_points_avg_team_home_last_20', 
                 'FS_avg_team_home_last_10', 'PE_avg_team_home_last_10', 'A_avg_team_home_last_10',
                 'FT_avg_team_home_last_10', 'FD_avg_team_home_last_10', 'FF_avg_team_home_last_10',
                 'G_avg_team_home_last_10', 'I_avg_team_home_last_10', 'PP_avg_team_home_last_10',
                 'RB_avg_team_home_last_10', 'FC_avg_team_home_last_10', 'GC_avg_team_home_last_10', 
                 'CA_avg_team_home_last_10', 'CV_avg_team_home_last_10', 'SG_avg_team_home_last_10',
                 'DD_avg_team_home_last_10', 'DP_avg_team_home_last_10', 'GS_avg_team_home_last_10', 
                 'Pontos_avg_team_home_last_10', 'Preco_avg_team_home_last_10', 'game_points_avg_team_home_last_10',
                 'FS_avg_team_home_last_5', 'PE_avg_team_home_last_5', 'A_avg_team_home_last_5', 
                 'FT_avg_team_home_last_5', 'FD_avg_team_home_last_5', 'FF_avg_team_home_last_5',
                 'G_avg_team_home_last_5', 'I_avg_team_home_last_5', 'PP_avg_team_home_last_5',
                 'RB_avg_team_home_last_5', 'FC_avg_team_home_last_5', 'GC_avg_team_home_last_5',
                 'CA_avg_team_home_last_5', 'CV_avg_team_home_last_5', 'SG_avg_team_home_last_5',
                 'DD_avg_team_home_last_5', 'DP_avg_team_home_last_5', 'GS_avg_team_home_last_5',
                 'Pontos_avg_team_home_last_5', 'Preco_avg_team_home_last_5', 'game_points_avg_team_home_last_5']

ENEMY_FEATURES = ['FS_avg_enemy_home_last_20', 'PE_avg_enemy_home_last_20', 'A_avg_enemy_home_last_20', 
                  'FT_avg_enemy_home_last_20', 'FD_avg_enemy_home_last_20', 'FF_avg_enemy_home_last_20',
                  'G_avg_enemy_home_last_20', 'I_avg_enemy_home_last_20', 'PP_avg_enemy_home_last_20',
                  'RB_avg_enemy_home_last_20', 'FC_avg_enemy_home_last_20', 'GC_avg_enemy_home_last_20',
                  'CA_avg_enemy_home_last_20', 'CV_avg_enemy_home_last_20', 'SG_avg_enemy_home_last_20',
                  'DD_avg_enemy_home_last_20', 'DP_avg_enemy_home_last_20', 'GS_avg_enemy_home_last_20',
                  'Pontos_avg_enemy_home_last_20', 'Preco_avg_enemy_home_last_20', 
                  'game_points_avg_enemy_home_last_20', 'FS_avg_enemy_home_last_10',
                  'PE_avg_enemy_home_last_10', 'A_avg_enemy_home_last_10', 'FT_avg_enemy_home_last_10',
                  'FD_avg_enemy_home_last_10', 'FF_avg_enemy_home_last_10', 'G_avg_enemy_home_last_10',
                  'I_avg_enemy_home_last_10', 'PP_avg_enemy_home_last_10', 'RB_avg_enemy_home_last_10',
                  'FC_avg_enemy_home_last_10', 'GC_avg_enemy_home_last_10', 'CA_avg_enemy_home_last_10',
                  'CV_avg_enemy_home_last_10', 'SG_avg_enemy_home_last_10', 'DD_avg_enemy_home_last_10',
                  'DP_avg_enemy_home_last_10', 'GS_avg_enemy_home_last_10', 'Pontos_avg_enemy_home_last_10',
                  'Preco_avg_enemy_home_last_10', 'game_points_avg_enemy_home_last_10', 'FS_avg_enemy_home_last_5', 
                  'PE_avg_enemy_home_last_5', 'A_avg_enemy_home_last_5', 'FT_avg_enemy_home_last_5', 
                  'FD_avg_enemy_home_last_5', 'FF_avg_enemy_home_last_5', 'G_avg_enemy_home_last_5',
                  'I_avg_enemy_home_last_5', 'PP_avg_enemy_home_last_5', 'RB_avg_enemy_home_last_5', 
                  'FC_avg_enemy_home_last_5', 'GC_avg_enemy_home_last_5', 'CA_avg_enemy_home_last_5',
                  'CV_avg_enemy_home_last_5', 'SG_avg_enemy_home_last_5', 'DD_avg_enemy_home_last_5',
                  'DP_avg_enemy_home_last_5', 'GS_avg_enemy_home_last_5', 'Pontos_avg_enemy_home_last_5',
                  'Preco_avg_enemy_home_last_5', 'game_points_avg_enemy_home_last_5']

POSITION_FEATURES = ['FS_avg_pos_home_last_20', 'PE_avg_pos_home_last_20', 'A_avg_pos_home_last_20', 
                     'FT_avg_pos_home_last_20', 'FD_avg_pos_home_last_20', 'FF_avg_pos_home_last_20',
                     'G_avg_pos_home_last_20', 'I_avg_pos_home_last_20', 'PP_avg_pos_home_last_20',
                     'RB_avg_pos_home_last_20', 'FC_avg_pos_home_last_20', 'GC_avg_pos_home_last_20',
                     'CA_avg_pos_home_last_20', 'CV_avg_pos_home_last_20', 'SG_avg_pos_home_last_20',
                     'DD_avg_pos_home_last_20', 'DP_avg_pos_home_last_20', 'GS_avg_pos_home_last_20',
                     'Pontos_avg_pos_home_last_20', 'Preco_avg_pos_home_last_20', 'game_points_avg_pos_home_last_20',
                     'FS_avg_pos_home_last_10', 'PE_avg_pos_home_last_10', 'A_avg_pos_home_last_10', 
                     'FT_avg_pos_home_last_10', 'FD_avg_pos_home_last_10', 'FF_avg_pos_home_last_10', 
                     'G_avg_pos_home_last_10', 'I_avg_pos_home_last_10', 'PP_avg_pos_home_last_10', 
                     'RB_avg_pos_home_last_10', 'FC_avg_pos_home_last_10', 'GC_avg_pos_home_last_10', 
                     'CA_avg_pos_home_last_10', 'CV_avg_pos_home_last_10', 'SG_avg_pos_home_last_10',
                     'DD_avg_pos_home_last_10', 'DP_avg_pos_home_last_10', 'GS_avg_pos_home_last_10',
                     'Pontos_avg_pos_home_last_10', 'Preco_avg_pos_home_last_10', 'game_points_avg_pos_home_last_10', 
                     'FS_avg_pos_home_last_5', 'PE_avg_pos_home_last_5', 'A_avg_pos_home_last_5', 
                     'FT_avg_pos_home_last_5', 'FD_avg_pos_home_last_5', 'FF_avg_pos_home_last_5',
                     'G_avg_pos_home_last_5', 'I_avg_pos_home_last_5', 'PP_avg_pos_home_last_5', 
                     'RB_avg_pos_home_last_5', 'FC_avg_pos_home_last_5', 'GC_avg_pos_home_last_5',
                     'CA_avg_pos_home_last_5', 'CV_avg_pos_home_last_5', 'SG_avg_pos_home_last_5',
                     'DD_avg_pos_home_last_5', 'DP_avg_pos_home_last_5', 'GS_avg_pos_home_last_5', 
                     'Pontos_avg_pos_home_last_5', 'Preco_avg_pos_home_last_5', 'game_points_avg_pos_home_last_5']
FEATURES = BASIC_FEATURES + PLAYER_FEATURES + TEAM_FEATURES + POSITION_FEATURES + CATEGORICAL_FEATURES
XGB_NUM_ESTIMATORS = 3000
XGB_LEARNING_RATE = 0.01

GRID_XGB_EXTRA_PARAMS = {"max_depth": [5,6,7], "subsample": [0.6, 0.7, 0.8] , "colsample_bytree": [0.6, 0.7, 0.8],
                    "colsample_by_level": [0.6, 0.7, 0.8], "seed": 43}


XGB_EXTRA_PARAMS = {"max_depth": 7, "subsample": 0.7 , "colsample_bytree": 0.5,
                    "colsample_by_level": 0.5, "seed": 43}

In [5]:
team_dict = {
'265':'BAH',
'288':'CRI',
'267':'VAS',
'290':'GOI',
'314':'AVA',
'317':'JEC',
'262':'FLA',
'263':'BOT',
'264':'COR',
'266':'FLU',
'275':'PAL',
'276':'SAO',
'277':'SAN',
'282':'CAM',
'283':'CRU',
'284':'GRE',
'285':'INT',
'287':'VIT',
'292':'SPT',
'293':'CAP',
'294':'CFC',
'303':'PON',
'315':'CHA',
'316':'FIG',
'327':'AME',
'344':'SCZ',
'373':'ATL-GO'
}

In [6]:
df = pd.read_csv('../db/final_features_calculated-2018-05-12.csv')

In [7]:
df.Posicao.fillna('missing', inplace=True)

In [8]:
df.fillna(0, inplace=True)

In [9]:
df[['ClubeID', 'Jogos', 'ScoutID', 'Rodada', 'AtletaID']] = df[['ClubeID', 'Jogos', 'ScoutID', 'Rodada', 'AtletaID']].astype('Int64')

In [10]:
df.dtypes

Unnamed: 0                              int64
Unnamed: 0.1                            int64
Unnamed: 0.1.1                          int64
Unnamed: 0.1.1.1                        int64
ScoutID                                 int64
AtletaID                                int64
Rodada                                  int64
ClubeID                                 int64
Participou                               bool
Posicao                                object
Jogos                                   int64
Pontos                                float64
PontosMedia                           float64
Preco                                 float64
PrecoVariacao                         float64
FS                                    float64
PE                                    float64
A                                     float64
FT                                    float64
FD                                    float64
FF                                    float64
G                                 

In [11]:
def hot_encode(df, col):
    names = list(map(lambda x: str(x) , list(df[col].value_counts().index)))
    values = np.array(df[col], dtype=np.str)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    
    return pd.concat([df, pd.DataFrame(onehot_encoded, columns=names)], axis=1)

In [12]:
df = hot_encode(df, 'Posicao')
df = hot_encode(df, 'ClubeID')
df = hot_encode(df, 'Home')

In [13]:
df.columns = list(map(lambda name: team_dict.get(name, name), df.columns))

In [14]:
df = df.loc[~(df.target == -999)]

In [15]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'ClubeID.1'], axis=1, inplace=True)

In [16]:
print(list(df.columns))

['ScoutID', 'AtletaID', 'Rodada', 'ClubeID', 'Participou', 'Posicao', 'Jogos', 'Pontos', 'PontosMedia', 'Preco', 'PrecoVariacao', 'FS', 'PE', 'A', 'FT', 'FD', 'FF', 'G', 'I', 'PP', 'RB', 'FC', 'GC', 'CA', 'CV', 'SG', 'DD', 'DP', 'GS', 'ano', 'Apelido', 'Status', 'mes', 'dia', 'home_score', 'away_score', 'ClubeCasa', 'ClubeVisitante', 'Home', 'result', 'is_next_same', 'target', 'game_points', 'FS_avg_last_20', 'PE_avg_last_20', 'A_avg_last_20', 'FT_avg_last_20', 'FD_avg_last_20', 'FF_avg_last_20', 'G_avg_last_20', 'I_avg_last_20', 'PP_avg_last_20', 'RB_avg_last_20', 'FC_avg_last_20', 'GC_avg_last_20', 'CA_avg_last_20', 'CV_avg_last_20', 'SG_avg_last_20', 'DD_avg_last_20', 'DP_avg_last_20', 'GS_avg_last_20', 'Pontos_avg_last_20', 'Preco_avg_last_20', 'game_points_avg_last_20', 'FS_avg_last_10', 'PE_avg_last_10', 'A_avg_last_10', 'FT_avg_last_10', 'FD_avg_last_10', 'FF_avg_last_10', 'G_avg_last_10', 'I_avg_last_10', 'PP_avg_last_10', 'RB_avg_last_10', 'FC_avg_last_10', 'GC_avg_last_10', '

In [17]:
df_train = df.loc[(~((df.ano == 2017) & (df.Rodada > 15)))]
df_test = df.loc[(df.ano == 2017) & (df.Rodada > 15)]

In [18]:
from sklearn.model_selection import KFold

def KFold_CrossValidation(num_splits, data, model, target, features):
    X = data.drop([target])
    y = data[target].values
    kf = KFold(n_splits=num_splits)
    eval_cv = pd.DataFrame()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index][features], X[test_index][features]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train.shape, X_test.shape)
        print('Starting fit')
        predict_fn = model.fit(X_train, y_train).predict
        predicted = predict_fn(X_test)
        eval_df = pd.concat([df_test.reset_index(drop=True), pd.DataFrame(predicted, dtype=np.float, columns=['prediction'])])
        eval_metric = get_average_metrics(eval_df, target,'prediction')
        eval_cv = pd.concat([eval_cv, eval_metric])
    return eval_cv

In [38]:
def grid_search(data, holdout_data, model, params_dict, scoring, target, features, num_folds=3):
    clf = GridSearchCV(model, params_dict, scoring=scoring, cv=num_folds, verbose=3)
    print('Starting fit')
    clf.fit(data[features], data[target].values)
    estimator = clf.best_estimator_
    params = clf.best_params_
    return clf, estimator, params
#     predicted = predict_fn(holdout_data[features])
#     eval_df = pd.concat([df_test.reset_index(drop=True), pd.DataFrame(predicted, dtype=np.float, columns=['prediction'])], axis=1)
#     eval_metric = get_average_metrics(eval_df, target, 'prediction')
    
#     return eval_df, predict_fn, params, eval_metric

In [20]:
def final_train(data, model, target, features):
    df_train = df.loc[(~((df.ano == 2017) & (df.Rodada > 15)))]
    df_test = df.loc[(df.ano == 2017) & (df.Rodada > 15)]
    X_train, y_train = df_train.drop([target], axis=1)[features], df_train[target].values
    X_test, y_test = df_test.drop([target],axis=1)[features], df_test[target].values
    print('Starting fit')
    predict_fn = model.fit(X_train, y_train).predict
    predicted = predict_fn(X_test)
    eval_df = pd.concat([df_test.reset_index(drop=True), pd.DataFrame(predicted, dtype=np.float, columns=['prediction'])], axis=1)
    eval_metric = get_average_metrics(eval_df, target, 'prediction')
    return eval_df, predict_fn, eval_metric

## XGBoost

In [52]:
# Initialize XGB and GridSearch
xgb = XGBRegressor(nthread=-1, learning_rate=XGB_LEARNING_RATE, n_estimators=XGB_NUM_ESTIMATORS, **XGB_EXTRA_PARAMS) 


In [53]:
xgb.get_xgb_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_by_level': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 3000,
 'objective': 'reg:linear',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 43,
 'silent': 1,
 'subsample': 0.7}

In [54]:
eval_xgb, xgb_predict, _ = final_train(df, xgb, TARGET, FEATURES)

Starting fit


In [55]:
get_average_metrics(eval_xgb, 'target', 'prediction')

Unnamed: 0,pred_sum,pred_avg,pred_dev,desired_sum,desired_avg,desired_dev,explained_variance,mean_abs_error,root_mean_sqrt_error,median_abs_error,r2_score
0,4692.278994,2.875171,1.149714,4909.8,3.008456,4.205884,-0.009577,3.244436,4.228077,2.626557,-0.010581


## Linear Regression

In [56]:
lin_reg = LinearRegression(normalize=True)

In [57]:
eval_lin, lin_reg__predict, _ = final_train(df, lin_reg, TARGET, FEATURES)

Starting fit


In [58]:
get_average_metrics(eval_lin, 'target', 'prediction')

Unnamed: 0,pred_sum,pred_avg,pred_dev,desired_sum,desired_avg,desired_dev,explained_variance,mean_abs_error,root_mean_sqrt_error,median_abs_error,r2_score
0,4493.624512,2.753446,0.909069,4909.8,3.008456,4.205884,0.014333,3.210082,4.183413,2.656274,0.010657


## KNN

In [42]:
knn_grid_params = {"n_neighbors" : [10, 50, 100, 150, 200], "weights" : ["uniform", "distance"], "p": [1,2,3], "n_jobs" : [-1]}

In [50]:
knn = KNeighborsRegressor(n_neighbors=200 , p=1, weights="distance")

In [44]:
clf, estimator, params = grid_search(df_train, df_test, knn, knn_grid_params, 'r2', TARGET, FEATURES, num_folds=3)

Starting fit
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=uniform .................
[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=uniform, score=-0.07433085465570288, total=  25.8s
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=uniform .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=uniform, score=-0.040470495938676576, total=  24.8s
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=uniform .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.4min remaining:    0.0s


[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=uniform, score=-0.05860049781707999, total=  26.4s
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=distance ................
[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=distance, score=-0.07440158205257386, total=  23.6s
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=distance ................
[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=distance, score=-0.04157999565176573, total=  23.5s
[CV] n_jobs=-1, n_neighbors=10, p=1, weights=distance ................
[CV]  n_jobs=-1, n_neighbors=10, p=1, weights=distance, score=-0.059071168006296526, total=  24.6s
[CV] n_jobs=-1, n_neighbors=10, p=2, weights=uniform .................
[CV]  n_jobs=-1, n_neighbors=10, p=2, weights=uniform, score=-0.0788643277739034, total=  11.6s
[CV] n_jobs=-1, n_neighbors=10, p=2, weights=uniform .................
[CV]  n_jobs=-1, n_neighbors=10, p=2, weights=uniform, score=-0.05500967728306039, total=  11.5s
[CV] n_jobs=-1, n_neighbors=10, p=2, weights=uniform .......

[CV]  n_jobs=-1, n_neighbors=100, p=3, weights=distance, score=0.0066141663668241035, total= 1.5min
[CV] n_jobs=-1, n_neighbors=100, p=3, weights=distance ...............
[CV]  n_jobs=-1, n_neighbors=100, p=3, weights=distance, score=0.0194474436315758, total= 1.5min
[CV] n_jobs=-1, n_neighbors=100, p=3, weights=distance ...............
[CV]  n_jobs=-1, n_neighbors=100, p=3, weights=distance, score=-0.002401390314642038, total= 1.6min
[CV] n_jobs=-1, n_neighbors=150, p=1, weights=uniform ................
[CV]  n_jobs=-1, n_neighbors=150, p=1, weights=uniform, score=0.01317125577561451, total=  25.9s
[CV] n_jobs=-1, n_neighbors=150, p=1, weights=uniform ................
[CV]  n_jobs=-1, n_neighbors=150, p=1, weights=uniform, score=0.028811177178894476, total=  25.8s
[CV] n_jobs=-1, n_neighbors=150, p=1, weights=uniform ................
[CV]  n_jobs=-1, n_neighbors=150, p=1, weights=uniform, score=0.004128983212799198, total=  25.9s
[CV] n_jobs=-1, n_neighbors=150, p=1, weights=distance 

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 494.2min finished


In [59]:
df.to_csv('../db/preprocessed_final_df_2018-05-21.csv')

In [51]:
eval_knn, knn_predict, knn_metric = final_train(df, knn, TARGET, FEATURES)

Starting fit


In [64]:
def write_pkl(obj, file_name):
    with open(file_name, 'wb') as f:
        pickle.dump(obj, f)

In [66]:
import pickle
write_pkl(knn_predict, '../model_pkls/knn.pkl')
write_pkl(xgb_predict, '../model_pkls/xgb.pkl')
write_pkl(lin_reg__predict, '../model_pkls/lin_reg.pkl')

In [23]:
get_average_metrics(eval_knn, 'target', 'prediction')

Unnamed: 0,pred_sum,pred_avg,pred_dev,desired_sum,desired_avg,desired_dev,explained_variance,mean_abs_error,root_mean_sqrt_error,median_abs_error,r2_score
0,4347.662036,2.664009,2.067048,4909.8,3.008456,4.205884,-0.181565,3.551835,4.584746,2.959482,-0.188272
