# Averaging out Outliers and without outliers

In [1]:
import pandas as panda
import numpy as np
import datetime, time
from matplotlib.pyplot import plot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import seaborn as sns
from sklearn.linear_model import LinearRegression, RANSACRegressor, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge,BayesianRidge,ElasticNet, Lasso
from xgboost import XGBRegressor
from math import sqrt

In [4]:

classifiers = [
    LinearRegression(),
    RANSACRegressor(),
    DecisionTreeRegressor(random_state = 1, criterion = 'mse'),
    RandomForestRegressor(random_state = 1, criterion = 'mse'),
    SGDRegressor(),
#     SVR( kernel = 'rbf'),
#     KernelRidge(),
    Ridge(solver='auto'),
    BayesianRidge(),
    ElasticNet(),
    Lasso(),
#     GradientBoostingRegressor(loss='huber')
]


classifier_names = [
            'linear_regression',
            'ransac_regression',
            'decisiontree_regression',
            'randomforest_regression',
            'gradient_descent_regression',
#             'svr',  
#             'kernel_ridge',
            'ridge',
            'bayesian_ridge',
            'elastic_net',
            'lasso',
#             'gbr',
    
]

classifier_param_grid = [
            
            {},
            {'ransac_regression__min_samples':[50, 75, 125, 200], 'ransac_regression__max_trials':[50, 125, 200], 'ransac_regression__residual_threshold':[5, 10, 14]},
            {'decisiontree_regression__max_depth':[6,7,8,9,10,11]},
            {'randomforest_regression__n_estimators':[1,2,3,5,6]} ,
            {'gradient_descent_regression__max_iter' : [100, 200, 300]},
#             {'svr__C':[1, 5,10]},
#             {'kernel_ridge__alpha':[0.01,0.04,1]},
            {'ridge__alpha':[0.01,0.04,1]},
            {'bayesian_ridge__n_iter':[200,500,600]},
            {'elastic_net__alpha' : [0.01,0.04,1,1.2], 'elastic_net__l1_ratio' :[0.2,0.4,0.5]},
            {'lasso__alpha' : [0.2,0.4,0.6,1],'lasso__max_iter':[200,400,600]},
#             {'gbr__n_estimators' :[1000,2000],'gbr__max_depth':[12,16,8]}
    
    
]

In [2]:

from math import sqrt

def root_mean_square_error(y, y_predicted):
    
    return sqrt(mean_squared_error(y,y_predicted))
    
scorer = make_scorer(root_mean_square_error, greater_is_better=False)

In [3]:
train_data_path = 'all/train_new_details_added.csv'
test_data_path = 'all/test_new_details_added.csv'

train_data = panda.read_csv(train_data_path)
test_data = panda.read_csv(test_data_path)
# train_data['max_cat_1'] = train_data.max_cat_1.apply(lambda x: 1 if x=='N' else 0)

train_data  = train_data[[i for i in train_data.columns.tolist() if i !='Unnamed: 0']]

without_outliers = train_data[train_data['target'] > -29]

outliers = train_data[train_data['target'] < -29]

round1_x = without_outliers[[i for i in without_outliers.columns.tolist() if i not in ['target','card_id']]]

round1_y = without_outliers[['target']]

In [4]:


class CodeTimer:
    
    """
        Utility custom contextual class for calculating the time 
        taken for a certain code block to execute
    
    """
    def __init__(self, name=None):
        self.name = " '"  + name + "'" if name else ''

    def __enter__(self):
        self.start = time.clock()

    def __exit__(self, exc_type, exc_value, traceback):
        self.took = (time.clock() - self.start) * 1000.0
        time_taken = datetime.timedelta(milliseconds = self.took)
        print('Code block' + self.name + ' took(HH:MM:SS): ' + str(time_taken))



def runGridSearchAndPredict(pipeline, x_train, y_train, x_test, y_test, param_grid, n_jobs = 1, cv = 10, score = 'neg_mean_squared_error'):
    
    response = {}
    training_timer       = CodeTimer('training')
    testing_timer        = CodeTimer('testing')

    with training_timer:

        gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, cv = cv, n_jobs = n_jobs, scoring = score)

        search = gridsearch.fit(x_train,y_train)

        print("Grid Search Best parameters ", search.best_params_)
        print("Grid Search Best score ", search.best_score_)
            
    with testing_timer:
        y_prediction = gridsearch.predict(x_test)
            
    print("Mean squared error score %s" %mean_squared_error(y_test,y_prediction))
    
    response['testing_time'] = testing_timer.took
    response['_y_prediction'] = y_prediction
    response['training_time'] = training_timer.took    
    response['mean_squared_error'] = mean_squared_error(y_test,y_prediction)
    response['root_mean_squared_error'] = search.best_score_
    response['r2_score'] = r2_score(y_test,y_prediction)
    response['best_estimator'] = search.best_estimator_
    
    return response
    


def analyzeRegressionModelWithOutliers(X,y, outliers):

    
    _x_train, _x_test, _y_train, _y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
    
    
    ## simply ignoring the outliers
    outlier_x = outliers[[i for i in outliers.columns.tolist() if i not in ['target','card_id']]]
    outlier_y =  outliers[['target']]
    
    outlier_x_train, outlier_x_test, outlier_y_train, outlier_y_test = train_test_split(outlier_x, outlier_y, test_size = 0.2, random_state = 2)
    
    _x_train = panda.concat([_x_train, outlier_x_train])
    
    _y_train = panda.concat([_y_train, outlier_y_train])
    
    _x_test = panda.concat([_x_test, outlier_x_test])
    
    _y_test = panda.concat([_y_test, outlier_y_test])
    
    model_metrics = {}

    for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):

            pipeline = Pipeline([
                    ('scaler', RobustScaler()),
                    (model_name, model)
            ])

            cross_validator = KFold(n_splits = 10, random_state = 12)    
            result = runGridSearchAndPredict(pipeline, _x_train, _y_train, _x_test, _y_test, model_param_grid, cv =cross_validator,score = scorer)
#             result = runGridSearchAndPredict(pipeline, _x_train, _y_train, _x_test, _y_test, model_param_grid,score = scorer)

            _y_prediction = result['_y_prediction']

            model_metrics[model_name] = {}
            model_metrics[model_name]['training_time'] = result['training_time']
            model_metrics[model_name]['testing_time'] = result['testing_time']
            model_metrics[model_name]['r2_score'] = result['r2_score']
            model_metrics[model_name]['mean_squared_error'] = result['mean_squared_error']
            model_metrics[model_name]['root_mean_squared_error'] = result['root_mean_squared_error']
            model_metrics[model_name]['best_estimator'] = result['best_estimator']
            
    return model_metrics
    print('Model metrics are \n :', model_metrics)

    
    

def analyzeRegressionModelRemovingOutliers(X,y):

    
    _x_train, _x_test, _y_train, _y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)
    
    
#     ## simply ignoring the outliers
#     outlier_x = outliers[[i for i in outliers.columns.tolist() if i not in ['target','card_id']]]
#     outlier_y =  outliers[['target']]
    
#     outlier_x_train, outlier_x_test, outlier_y_train, outlier_y_test = train_test_split(outlier_x, outlier_y, test_size = 0.2, random_state = 2)
    
#     _x_train = panda.concat([_x_train, outlier_x_train])
    
#     _y_train = panda.concat([_y_train, outlier_y_train])
    
#     _x_test = panda.concat([_x_test, outlier_x_test])
    
#     _y_test = panda.concat([_y_test, outlier_y_test])
    
    model_metrics = {}

    for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):

            pipeline = Pipeline([
                    ('scaler', RobustScaler()),
                    (model_name, model)
            ])

            cross_validator = KFold(n_splits = 10, random_state = 12)    
            result = runGridSearchAndPredict(pipeline, _x_train, _y_train, _x_test, _y_test, model_param_grid, cv =cross_validator,score = scorer)
#             result = runGridSearchAndPredict(pipeline, _x_train, _y_train, _x_test, _y_test, model_param_grid,score = scorer)

            _y_prediction = result['_y_prediction']

            model_metrics[model_name] = {}
            model_metrics[model_name]['training_time'] = result['training_time']
            model_metrics[model_name]['testing_time'] = result['testing_time']
            model_metrics[model_name]['r2_score'] = result['r2_score']
            model_metrics[model_name]['mean_squared_error'] = result['mean_squared_error']
            model_metrics[model_name]['root_mean_squared_error'] = result['root_mean_squared_error']
            model_metrics[model_name]['best_estimator'] = result['best_estimator']
            
    return model_metrics
    print('Model metrics are \n :', model_metrics)


In [12]:
model_metrics = analyzeRegressionModelWithOutliers(round1_x, round1_y, outliers)

Grid Search Best parameters  {}
Grid Search Best score  -2.8108663448443174
Code block 'training' took(HH:MM:SS): 0:00:05.865152
Code block 'testing' took(HH:MM:SS): 0:00:00.049800
Mean squared error score 11.009776012126553
Grid Search Best parameters  {'ransac_regression__max_trials': 200, 'ransac_regression__min_samples': 125, 'ransac_regression__residual_threshold': 5}
Grid Search Best score  -2.7222741748033608
Code block 'training' took(HH:MM:SS): 0:06:23.091727
Code block 'testing' took(HH:MM:SS): 0:00:00.036892
Mean squared error score 11.022418052838042
Grid Search Best parameters  {'decisiontree_regression__max_depth': 6}
Grid Search Best score  -2.8695478225814384
Code block 'training' took(HH:MM:SS): 0:01:35.230287
Code block 'testing' took(HH:MM:SS): 0:00:00.039873
Mean squared error score 11.099476486100517


  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estima

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


Grid Search Best parameters  {'randomforest_regression__n_estimators': 6}
Grid Search Best score  -3.7695809929023896
Code block 'training' took(HH:MM:SS): 0:07:56.701168
Code block 'testing' took(HH:MM:SS): 0:00:00.331414
Mean squared error score 15.095327353198398


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Grid Search Best parameters  {'gradient_descent_regression__max_iter': 300}
Grid Search Best score  -2.807549767929972
Code block 'training' took(HH:MM:SS): 0:05:06.604872
Code block 'testing' took(HH:MM:SS): 0:00:00.039986
Mean squared error score 10.987601674586296
Grid Search Best parameters  {'ridge__alpha': 1}
Grid Search Best score  -2.8107908348383357
Code block 'training' took(HH:MM:SS): 0:00:13.844649
Code block 'testing' took(HH:MM:SS): 0:00:00.038514
Mean squared error score 11.009557848429498


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Grid Search Best parameters  {'bayesian_ridge__n_iter': 200}
Grid Search Best score  -2.809894922827961
Code block 'training' took(HH:MM:SS): 0:00:17.012478
Code block 'testing' took(HH:MM:SS): 0:00:00.051313
Mean squared error score 11.008003321006534
Grid Search Best parameters  {'elastic_net__alpha': 1, 'elastic_net__l1_ratio': 0.2}
Grid Search Best score  -2.7785801309537907
Code block 'training' took(HH:MM:SS): 0:01:00.075329
Code block 'testing' took(HH:MM:SS): 0:00:00.040047
Mean squared error score 11.000028671080395
Grid Search Best parameters  {'lasso__alpha': 0.4, 'lasso__max_iter': 200}
Grid Search Best score  -2.7786033361201357
Code block 'training' took(HH:MM:SS): 0:00:57.158006
Code block 'testing' took(HH:MM:SS): 0:00:00.030366
Mean squared error score 11.000825369698667


In [14]:
model_metrics

{'linear_regression': {'training_time': 5865.151677198699,
  'testing_time': 49.79997860977203,
  'r2_score': -0.003907567747305274,
  'mean_squared_error': 11.009776012126553,
  'root_mean_squared_error': -2.8108663448443174,
  'best_estimator': Pipeline(memory=None,
       steps=[('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
         with_scaling=True)), ('linear_regression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])},
 'ransac_regression': {'training_time': 383091.72653275775,
  'testing_time': 36.8918860492613,
  'r2_score': -0.005060310575865534,
  'mean_squared_error': 11.022418052838042,
  'root_mean_squared_error': -2.7222741748033608,
  'best_estimator': Pipeline(memory=None,
       steps=[('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
         with_scaling=True)), ('ransac_regression', RANSACRegressor(base_estimator=None, is_data_valid=None, is_model_valid=None,


In [15]:
model_metrics1 = analyzeRegressionModelRemovingOutliers(round1_x, round1_y)

Grid Search Best parameters  {}
Grid Search Best score  -1.7125158273637575
Code block 'training' took(HH:MM:SS): 0:00:05.348909
Code block 'testing' took(HH:MM:SS): 0:00:00.026597
Mean squared error score 2.9715958367591755
Grid Search Best parameters  {'ransac_regression__max_trials': 200, 'ransac_regression__min_samples': 75, 'ransac_regression__residual_threshold': 14}
Grid Search Best score  -1.7124494794575216
Code block 'training' took(HH:MM:SS): 0:04:23.525218
Code block 'testing' took(HH:MM:SS): 0:00:00.033686
Mean squared error score 2.9714413166036087
Grid Search Best parameters  {'decisiontree_regression__max_depth': 6}
Grid Search Best score  -1.7160496726344232
Code block 'training' took(HH:MM:SS): 0:01:34.288139
Code block 'testing' took(HH:MM:SS): 0:00:00.038712
Mean squared error score 2.980486358035781


  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estima

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


Grid Search Best parameters  {'randomforest_regression__n_estimators': 6}
Grid Search Best score  -1.8925284710498596
Code block 'training' took(HH:MM:SS): 0:08:08.101666
Code block 'testing' took(HH:MM:SS): 0:00:00.374431
Mean squared error score 3.6373947022310893


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Grid Search Best parameters  {'gradient_descent_regression__max_iter': 200}
Grid Search Best score  -1.712862499230382
Code block 'training' took(HH:MM:SS): 0:04:56.227869
Code block 'testing' took(HH:MM:SS): 0:00:00.040461
Mean squared error score 2.9735812130325074
Grid Search Best parameters  {'ridge__alpha': 1}
Grid Search Best score  -1.7125079901575275
Code block 'training' took(HH:MM:SS): 0:00:13.983701
Code block 'testing' took(HH:MM:SS): 0:00:00.033667
Mean squared error score 2.9714445846044235


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Grid Search Best parameters  {'bayesian_ridge__n_iter': 200}
Grid Search Best score  -1.712509605467046
Code block 'training' took(HH:MM:SS): 0:00:16.648998
Code block 'testing' took(HH:MM:SS): 0:00:00.036113
Mean squared error score 2.971440607509131
Grid Search Best parameters  {'elastic_net__alpha': 0.01, 'elastic_net__l1_ratio': 0.2}
Grid Search Best score  -1.7125417285901334
Code block 'training' took(HH:MM:SS): 0:00:57.776077
Code block 'testing' took(HH:MM:SS): 0:00:00.047655
Mean squared error score 2.971431335207786
Grid Search Best parameters  {'lasso__alpha': 0.2, 'lasso__max_iter': 200}
Grid Search Best score  -1.7143952484589149
Code block 'training' took(HH:MM:SS): 0:00:56.125466
Code block 'testing' took(HH:MM:SS): 0:00:00.035088
Mean squared error score 2.976979737810643


In [16]:
panda.DataFrame(model_metrics)

Unnamed: 0,linear_regression,ransac_regression,decisiontree_regression,randomforest_regression,gradient_descent_regression,ridge,bayesian_ridge,elastic_net,lasso
best_estimator,"Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ..."
mean_squared_error,11.0098,11.0224,11.0995,15.0953,10.9876,11.0096,11.008,11,11.0008
r2_score,-0.00390757,-0.00506031,-0.0120868,-0.376442,-0.00188564,-0.00388767,-0.00374593,-0.00301877,-0.00309142
root_mean_squared_error,-2.81087,-2.72227,-2.86955,-3.76958,-2.80755,-2.81079,-2.80989,-2.77858,-2.7786
testing_time,49.8,36.8919,39.8729,331.414,39.9861,38.5144,51.3127,40.0469,30.3662
training_time,5865.15,383092,95230.3,476701,306605,13844.6,17012.5,60075.3,57158


In [17]:
panda.DataFrame(model_metrics1)

Unnamed: 0,linear_regression,ransac_regression,decisiontree_regression,randomforest_regression,gradient_descent_regression,ridge,bayesian_ridge,elastic_net,lasso
best_estimator,"Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ...","Pipeline(memory=None,\n steps=[('scaler', ..."
mean_squared_error,2.9716,2.97144,2.98049,3.63739,2.97358,2.97144,2.97144,2.97143,2.97698
r2_score,0.00174736,0.00179927,-0.00123924,-0.221915,0.00108041,0.00179817,0.00179951,0.00180262,-6.12602e-05
root_mean_squared_error,-1.71252,-1.71245,-1.71605,-1.89253,-1.71286,-1.71251,-1.71251,-1.71254,-1.7144
testing_time,26.5973,33.6862,38.7118,374.431,40.4605,33.6674,36.1131,47.6547,35.0879
training_time,5348.91,263525,94288.1,488102,296228,13983.7,16649,57776.1,56125.5


In [19]:
test_data = test_data[['feature_1','feature_2','feature_3','months_passed','total_no_of_transaction','total','total_accepted', \
    'total_rejects','mean_installments','max_cat_1','max_cat_2','cat_3','most_appearing_mc_id','mean_month_lag']]

test_target = model_metrics['gradient_descent_regression']['best_estimator'].predict(test_data)
   
test_target1 = model_metrics1['linear_regression']['best_estimator'].predict(test_data)


array([-0.75315504, -0.77767966, -0.36307084, ..., -0.66009676,
       -0.29623616, -0.62802782])

In [23]:

test_target2= np.reshape(test_target1, (len(test_target1)))
test_target2

array([ 1.05926514e-01,  1.52587891e-05, -6.67266846e-02, ...,
       -1.41326904e-01, -6.79931641e-02, -1.04125977e-01])

In [24]:
avg_target = np.average([test_target, test_target2], axis = 0)

avg_target

array([-0.32361426, -0.3888322 , -0.21489876, ..., -0.40071183,
       -0.18211466, -0.3660769 ])

In [25]:
temp = panda.read_csv('all/test.csv')
sample_submission_round_1 = temp[['card_id']]
sample_submission_round_1['target'] = avg_target


sample_submission_round_1[['card_id','target']].to_csv('round_avg_new_details_submission.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
classifiers = [
    XGBRegressor(),
]


classifier_names = [
            'xgb',               
    
]

classifier_param_grid = [
            # {'boost_regressor__max_depth':[3]},
            {'xgb__max_depth':[3,5,6, 12],\
                #  'boost_regressor__learning_rate':[0.1,0.05, 0.02], \
                #     'boost_regressor__reg_alpha':[0.1,0.2,0.3], \
                #         'boost_regressor__reg_lambda':[3,0.5,0.6], \
                'xgb__n_estimators':[1000,2000]
            },
    
]

In [None]:

model_metrics = analyzeRegressionModelWithOutliers(round1_x, round1_y, outliers)
model_metrics1 = analyzeRegressionModelRemovingOutliers(round1_x, round1_y)

print(panda.DataFrame(model_metrics), panda.DataFrame(model_metrics1))

In [None]:
test_target = model_metrics['gradient_descent_regression']['best_estimator'].predict(test_data)
   
test_target1 = model_metrics1['linear_regression']['best_estimator'].predict(test_data)

test_target1 = np.reshape(test_target1, len(test_target1))

In [None]:
temp = panda.read_csv('all/test.csv')
sample_submission_round_1 = temp[['card_id']]
sample_submission_round_1['target'] = np.average([test_target, test_target1], axis = 0)


sample_submission_round_1.to_csv('round_avg_with_xgb_new_details_submission.csv')