# Regression model comparison: with outliers and without outliers

In [3]:
import pandas as panda
import numpy as np
import datetime, time
from matplotlib.pyplot import plot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import seaborn as sns
from sklearn.linear_model import LinearRegression, RANSACRegressor, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge,BayesianRidge,ElasticNet, Lasso

from math import sqrt

In [4]:

# Candidate regressors compared by the grid search. The list is consumed
# positionally (zipped with `classifier_names` and `classifier_param_grid`),
# so entries must stay in the same order as those two lists.
classifiers = [
    LinearRegression(),
    RANSACRegressor(),
    # `criterion` is deliberately left at its default (mean squared error):
    # the literal 'mse' was renamed to 'squared_error' in newer scikit-learn
    # releases and later removed, so spelling it out breaks on one side or
    # the other, while the default means the same thing on every version.
    DecisionTreeRegressor(random_state=1),
    RandomForestRegressor(random_state=1),
    SGDRegressor(),
    # Disabled candidates (their names/grids are commented out as well):
    # SVR(kernel='rbf'),
    # KernelRidge(),
    Ridge(solver='auto'),
    BayesianRidge(),
    ElasticNet(),
    Lasso(),
    # GradientBoostingRegressor(loss='huber'),
]


# Pipeline step name for each estimator, aligned by position with
# `classifiers`. These names are also the prefixes of the keys used in
# `classifier_param_grid` ('<name>__<parameter>').
classifier_names = [
    "linear_regression",            # LinearRegression
    "ransac_regression",            # RANSACRegressor
    "decisiontree_regression",      # DecisionTreeRegressor
    "randomforest_regression",      # RandomForestRegressor
    "gradient_descent_regression",  # SGDRegressor
    # "svr",
    # "kernel_ridge",
    "ridge",                        # Ridge
    "bayesian_ridge",               # BayesianRidge
    "elastic_net",                  # ElasticNet
    "lasso",                        # Lasso
    # "gbr",
]

# BayesianRidge's iteration-cap parameter is called `n_iter` in older
# scikit-learn releases and `max_iter` in newer ones. Use whichever name the
# installed estimator actually exposes so the grid key below stays valid.
_bayesian_ridge_iter_param = (
    'max_iter' if 'max_iter' in BayesianRidge().get_params() else 'n_iter'
)

# One hyper-parameter grid per estimator, aligned by position with
# `classifiers` / `classifier_names`. Keys follow the Pipeline convention
# '<step name>__<parameter>'; an empty dict means "fit with defaults only".
classifier_param_grid = [
    {},
    {
        'ransac_regression__min_samples': [50, 75, 125, 200],
        'ransac_regression__max_trials': [50, 125, 200],
        'ransac_regression__residual_threshold': [5, 10, 14],
    },
    {'decisiontree_regression__max_depth': [6, 7, 8, 9, 10, 11]},
    {'randomforest_regression__n_estimators': [1, 2, 3, 5, 6]},
    {'gradient_descent_regression__max_iter': [100, 200, 300]},
    # {'svr__C': [1, 5, 10]},
    # {'kernel_ridge__alpha': [0.01, 0.04, 1]},
    {'ridge__alpha': [0.01, 0.04, 1]},
    {'bayesian_ridge__' + _bayesian_ridge_iter_param: [200, 500, 600]},
    {
        'elastic_net__alpha': [0.01, 0.04, 1, 1.2],
        'elastic_net__l1_ratio': [0.2, 0.4, 0.5],
    },
    {
        'lasso__alpha': [0.2, 0.4, 0.6, 1],
        'lasso__max_iter': [200, 400, 600],
    },
    # {'gbr__n_estimators': [1000, 2000], 'gbr__max_depth': [12, 16, 8]},
]

In [6]:

from math import sqrt

def root_mean_square_error(y, y_predicted):
    """Return the square root of the mean squared error between the inputs.

    Both arguments are passed straight to sklearn's ``mean_squared_error``
    (true values first, predictions second).
    """
    mse = mean_squared_error(y, y_predicted)
    return sqrt(mse)


# Scorer form of the metric for use as a `scoring=` argument. Because RMSE is
# a loss, greater_is_better=False makes scikit-learn sign-flip it, so scores
# reported by a search using this scorer are negative RMSE values.
scorer = make_scorer(
    root_mean_square_error,
    greater_is_better=False,
)

In [7]:
train_data_path = 'all/train_new_details_added.csv'
test_data_path = 'all/test_new_details_added.csv'

# Rows whose target falls below this value are treated as outliers.
OUTLIER_TARGET_THRESHOLD = -29

train_data = panda.read_csv(train_data_path)
test_data = panda.read_csv(test_data_path)
# train_data['max_cat_1'] = train_data.max_cat_1.apply(lambda x: 1 if x=='N' else 0)

# Drop the unnamed column if present (presumably an index column written by
# an earlier to_csv call - verify against the code that produced the file).
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Partition the training rows around the threshold. `>=` on the inlier side
# ensures a target exactly equal to the threshold is kept rather than
# silently falling into neither frame.
without_outliers = train_data[train_data['target'] >= OUTLIER_TARGET_THRESHOLD]

outliers = train_data[train_data['target'] < OUTLIER_TARGET_THRESHOLD]

# Features: every column except the label and the card identifier.
round1_x = without_outliers.drop(columns=['target', 'card_id'], errors='ignore')

# Kept as a single-column DataFrame (not a Series), as the downstream
# functions index outliers the same way.
round1_y = without_outliers[['target']]

In [9]:


class CodeTimer:
    """Context manager that measures how long the enclosed block takes.

    On exit the elapsed time is stored on ``took`` (in milliseconds) and a
    human-readable line is printed. Exceptions raised inside the block are
    not suppressed; the elapsed time is still recorded and printed.

    Attributes:
        name: Label included in the printed message (empty when no name
            was given).
        start: ``time.perf_counter()`` reading taken on entry, or ``None``
            before the timer has been entered.
        took: Elapsed milliseconds of the most recent run, or ``None``
            before the timer has completed a run.
    """

    def __init__(self, name=None):
        self.name = " '" + name + "'" if name else ''
        self.start = None
        self.took = None

    def __enter__(self):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed wall-clock time.
        self.start = time.perf_counter()
        # Return self so `with CodeTimer(...) as t:` gives access to `t.took`.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.took = (time.perf_counter() - self.start) * 1000.0
        time_taken = datetime.timedelta(milliseconds=self.took)
        print('Code block' + self.name + ' took(HH:MM:SS): ' + str(time_taken))



def runGridSearchAndPredict(pipeline, x_train, y_train, x_test, y_test, param_grid, n_jobs=1, cv=10, score='neg_mean_squared_error'):
    """Grid-search ``pipeline`` on the training data and score it on the test data.

    Args:
        pipeline: Estimator (here a sklearn Pipeline) to tune.
        x_train, y_train: Data the grid search is fitted on.
        x_test, y_test: Held-out data used for the reported metrics.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        n_jobs: Parallel jobs for the search.
        cv: Fold count or cross-validation splitter.
        score: ``scoring`` argument for the search.

    Returns:
        dict with keys:
            'training_time', 'testing_time': elapsed times as recorded by
                ``CodeTimer``.
            '_y_prediction': predictions for ``x_test``.
            'mean_squared_error', 'root_mean_squared_error', 'r2_score':
                metrics computed on the held-out test data.
            'best_cv_score': ``best_score_`` of the search, in whatever
                units/sign the ``score`` scorer produces.
            'best_estimator': the refitted best estimator.
    """
    training_timer = CodeTimer('training')
    testing_timer = CodeTimer('testing')

    with training_timer:
        gridsearch = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=cv,
            n_jobs=n_jobs,
            scoring=score,
        )
        search = gridsearch.fit(x_train, y_train)

        print("Grid Search Best parameters ", search.best_params_)
        print("Grid Search Best score ", search.best_score_)

    with testing_timer:
        y_prediction = search.predict(x_test)

    # Computed once and reused for the message, the MSE entry and the RMSE.
    mse = mean_squared_error(y_test, y_prediction)
    print("Mean squared error score %s" % mse)

    return {
        'testing_time': testing_timer.took,
        '_y_prediction': y_prediction,
        'training_time': training_timer.took,
        'mean_squared_error': mse,
        # Previously this held search.best_score_, which is a (possibly
        # sign-flipped) cross-validation score rather than the test RMSE.
        'root_mean_squared_error': sqrt(mse),
        'best_cv_score': search.best_score_,
        'r2_score': r2_score(y_test, y_prediction),
        'best_estimator': search.best_estimator_,
    }
    


def _fitAndScoreModels(x_train, y_train, x_test, y_test):
    """Grid-search every configured regressor and collect its metrics.

    Iterates `classifiers`, `classifier_names` and `classifier_param_grid`
    in lockstep, wraps each estimator in a RobustScaler pipeline and runs it
    through `runGridSearchAndPredict` with the module-level `scorer`.

    Returns:
        dict mapping each model name to a dict with 'training_time',
        'testing_time', 'r2_score', 'mean_squared_error',
        'root_mean_squared_error' and 'best_estimator'.
    """
    metric_keys = (
        'training_time',
        'testing_time',
        'r2_score',
        'mean_squared_error',
        'root_mean_squared_error',
        'best_estimator',
    )
    model_metrics = {}

    for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
        pipeline = Pipeline([
            ('scaler', RobustScaler()),
            (model_name, model),
        ])

        # random_state only has meaning when shuffling, and recent
        # scikit-learn raises if it is set while shuffle is False. Shuffling
        # also spreads rows appended at the end of the training frame
        # (the outliers) across folds instead of leaving them in the last one.
        cross_validator = KFold(n_splits=10, shuffle=True, random_state=12)
        result = runGridSearchAndPredict(
            pipeline, x_train, y_train, x_test, y_test,
            model_param_grid, cv=cross_validator, score=scorer,
        )

        model_metrics[model_name] = {key: result[key] for key in metric_keys}

    return model_metrics


def analyzeRegressionModelWithOutliers(X, y, outliers):
    """Evaluate all regressors on inlier data augmented with outlier rows.

    The inliers (``X``, ``y``) are split 70/30 and the outliers 80/20; the
    outlier train/test parts are appended to the matching inlier parts so
    both the training and the test set contain outliers.

    Args:
        X: Feature frame of the inlier rows.
        y: Single-column target frame of the inlier rows.
        outliers: Frame of outlier rows, still containing 'target' (and
            possibly 'card_id'), from which features and target are derived.

    Returns:
        dict of per-model metrics (see ``_fitAndScoreModels``).
    """
    _x_train, _x_test, _y_train, _y_test = train_test_split(X, y, test_size=0.3, random_state=2)

    # Derive the outlier features/target the same way the inlier ones were
    # built: everything except the label and the card identifier.
    outlier_x = outliers.drop(columns=['target', 'card_id'], errors='ignore')
    outlier_y = outliers[['target']]

    outlier_x_train, outlier_x_test, outlier_y_train, outlier_y_test = train_test_split(
        outlier_x, outlier_y, test_size=0.2, random_state=2
    )

    # Add the outliers back into both splits.
    _x_train = panda.concat([_x_train, outlier_x_train])
    _y_train = panda.concat([_y_train, outlier_y_train])
    _x_test = panda.concat([_x_test, outlier_x_test])
    _y_test = panda.concat([_y_test, outlier_y_test])

    model_metrics = _fitAndScoreModels(_x_train, _y_train, _x_test, _y_test)
    # Printed before returning; the original statement sat after `return`
    # and could never run.
    print('Model metrics are \n :', model_metrics)
    return model_metrics


def analyzeRegressionModelRemovingOutliers(X, y):
    """Evaluate all regressors on the inlier data only.

    Args:
        X: Feature frame of the inlier rows.
        y: Single-column target frame of the inlier rows.

    Returns:
        dict of per-model metrics (see ``_fitAndScoreModels``).
    """
    _x_train, _x_test, _y_train, _y_test = train_test_split(X, y, test_size=0.3, random_state=2)

    model_metrics = _fitAndScoreModels(_x_train, _y_train, _x_test, _y_test)
    # Printed before returning; the original statement sat after `return`
    # and could never run.
    print('Model metrics are \n :', model_metrics)
    return model_metrics


In [10]:
# Compare all regressors on the inlier data with the outlier rows mixed back
# into both the training and the test split.
model_metrics = analyzeRegressionModelWithOutliers(
    X=round1_x,
    y=round1_y,
    outliers=outliers,
)

NameError: name 'outliers' is not defined   [stale cell output: `outliers` is assigned in cell In [7], which must be run before this call]

In [None]:
# Bare expression: as the last line of a notebook cell it displays the
# per-model metrics dict collected above.
model_metrics