In [None]:
%run -i bestestimator_v2.py

In [1]:
import pandas as pd
import numpy as np
import operator

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, \
    RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelBinarizer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_auc_score



def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return (roc_auc_score(y_test, y_pred, average=average))


class BestEstimator(object):

    def __init__(self, 
                 type_esti = 'classifier', 
                 cv = 3, 
                 grid = True, 
                 hard_grid = False,
                 cv_grid = 3):
        
        self.type_esti = type_esti
        #self.params = params
        self.cv = cv
        self.grid = grid
        self.hard_grid = hard_grid
        self.cv_grid = cv_grid
        
        self.AUC = make_scorer(multiclass_roc_auc_score)
        
        self.Decision_Function = None
        self.gr = None
        self.estim = None
        self.Target = None
        self.Data = None
        self.le = None
        self.cat = None
        #self.list_cat = None
        

    def fit(self, data, target,
            ID = 'ID',
            target_ID = True,
            n = 1000,
            n_grid = 1000,
            value_nan = 0,
            view_nan = True,
           params = False,
           value = 0,
           scoring = 'roc_auc'):
        
        loss = scoring 
        self.Data = data.copy()
        self.Target = target.copy()

        self.Data.drop([ID], axis=1, inplace=True)
        if target_ID:
            self.Target.drop([ID], axis=1, inplace=True)
        

        if view_nan:
            print("Missing Values :\n")

            total = self.Data.isnull().sum().sort_values(ascending=False)
            percent = (self.Data.isnull().sum() / self.Data.isnull().count()).sort_values(ascending=False) * 100
            missing_data = pd.concat([total, percent], axis=1, keys=['Total', '%'])
            print("{} \n".format(missing_data[(percent > 0)]))

        if type(value) == int:
            self.Data.fillna(value, inplace=True)
            # self.Test.fillna(value, inplace = True)
            # self.Missing_values()

        elif value == 'bfill':
            self.Data.fillna('bfill', inplace=True)
            # self.Test.fillna('bfill', inplace = True)
            # self.Missing_values()

        elif value == 'ffill':
            self.Data.fillna('ffill', inplace=True)
            # self.Test.fillna('ffill', inplace = True)
            # self.Missing_values()

        if self.Data.isnull().any().any() == False:
            print('NaN data filled by {} \n'.format(value))
        else:
            print('Fail to fill NaN data')

        for i in self.Data.columns:  ###########

            if self.Data[i].dtype == object:
                encoder = LabelEncoder()
                encoder.fit(list(self.Data[i]))
                self.Data[i] = encoder.transform(list(self.Data[i]))

            if self.Data[i].dtype == float:
                self.Data[i] = self.Data[i].astype('int')

        for i in self.Target.columns:
            if self.Target[i].dtype == object:
                self.cat = True
                self.le = LabelEncoder()
                self.le.fit(list(self.Target[i]))
                self.Target[i] = self.le.transform(list(self.Target[i]))

        X_tr, X_te, Y_tr, Y_te = train_test_split(self.Data, self.Target, random_state=0, test_size=1 / 3)

        print('Searching for the best regressor on {} data using {} loss... \n'.format(n, scoring))

        if self.type_esti == 'classifier':

            # print('\n Searching for the best classifier on {} data... \n'.format(n))

            clfs = {}
            clfs['Bagging'] = {'clf': BaggingClassifier(), 'name': 'Bagging'}
            clfs['Gradient Boosting'] = {'clf': GradientBoostingClassifier(), 'name': 'Gradient Boosting'}
            clfs['XGBoost'] = {'clf': XGBClassifier(), 'name': 'XGBoost'}
            clfs['Random Forest'] = {'clf': RandomForestClassifier(n_estimators=100, n_jobs=-1),
                                     'name': 'Random Forest'}
            clfs['Decision Tree'] = {'clf': DecisionTreeClassifier(), 'name': 'Decision Tree'}
            clfs['Extra Tree'] = {'clf': ExtraTreesClassifier(n_jobs=-1), 'name': 'Extra Tree'}

            clfs['KNN'] = {'clf': KNeighborsClassifier(n_jobs=-1), 'name': 'KNN'}
            # clfs['NN'] = {'clf': MLPClassifier(), 'name': 'MLPClassifier'
            # clfs['LR'] = {'clf': LogisticClassifier(), 'name': 'LR'}
            clfs['SVM'] = {'clf': SVC(gamma='auto'), 'name': 'SVM'}

            
           # if scoring == 'AUC' and np.unique(self.Target).shape[0] > 2:
            #    scoring = self.AUC
             #   score = 'AUC'
            #elif scoring == 'AUC':                #########################################
             #   score = 'AUC'     ##########################################
              #  scoring = 'roc_auc' #########################################
            
            for item in clfs:
                
                Score = cross_val_score(clfs[item]['clf'], np.asarray(X_tr[0:n]), np.ravel(Y_tr[0:n]),
                                        cv=self.cv, scoring=scoring)
               
                Score_mean = Score.mean()
                STD2 = Score.std() * 2

                clfs[item]['score'] = Score  # roc_auc
                clfs[item]['mean'] = Score_mean
                clfs[item]['std2'] = STD2

                print("\n {}".format(item + ": %0.4f (+/- %0.4f)" % (clfs[item]['score'].mean(),
                                                                     clfs[item]['score'].std() * 2)))

            Best_clf = clfs[max(clfs.keys(), key=(lambda k: clfs[k]['mean']))]['name']
            


        elif self.type_esti == 'regressor':

            clfs = {}
            clfs['Bagging'] = {'clf': BaggingRegressor(), 'name': 'Bagging'}
            clfs['Gradient Boosting'] = {'clf': GradientBoostingRegressor(), 'name': 'Gradient Boosting'}
            clfs['XGBoost'] = {'clf': XGBRegressor(), 'name': 'XGBoost'}
            clfs['Random Forest'] = {'clf': RandomForestRegressor(n_estimators=100, n_jobs=-1),
                                     'name': 'Random Forest'}
            clfs['Decision Tree'] = {'clf': DecisionTreeRegressor(), 'name': 'Decision Tree'}
            clfs['Extra Tree'] = {'clf': ExtraTreesRegressor(n_jobs=-1), 'name': 'Extra Tree'}
            clfs['KNN'] = {'clf': KNeighborsRegressor(n_jobs=-1), 'name': 'KNN'}
            # clfs['NN'] = {'clf': MLPClassifier(), 'name': 'MLPClassifier'
            # clfs['LR'] = {'clf': LogisticClassifier(), 'name': 'LR'}
            clfs['SVM'] = {'clf': SVR(gamma='auto'), 'name': 'SVM'}

            for item in clfs:
                # print(Y_tr[0:30])
                Score = cross_val_score(clfs[item]['clf'], np.asarray(X_tr[0:n]), np.array(np.ravel(Y_tr[0:n])),
                               cv=self.cv, scoring=scoring)
                Score_mean = Score.mean()
                STD2 = Score.std() * 2

                clfs[item]['score'] = Score  # roc_auc
                clfs[item]['mean'] = Score_mean
                clfs[item]['std2'] = STD2

                print("\n {}".format(item + ": %0.4f (+/- %0.4f)" % (clfs[item]['score'].mean(),
                                                                     clfs[item]['score'].std() * 2)))

            Best_clf = clfs[max(clfs.keys(), key=(lambda k: clfs[k]['mean']))]['name']

        if self.grid:
            # print('grid = True')

            if params == False:

                if self.hard_grid == False:

                    if Best_clf == 'Extra Tree':

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [100, 300, 600],
                                      'criterion': ['mse', 'mae'],
                                      'max_depth': [None, 5, 10]}

                        else:

                            params = {'n_estimators': [100, 300, 600],
                                      'criterion': ['gini', 'entropy'],
                                      'max_depth': [None, 5, 10]}

                    if Best_clf == 'Gradient Boosting':

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [100, 300, 600],
                                      'max_depth': [5, 10, None],
                                      'learning_rate': [.001, .01, .1],
                                      'loss': ['ls', 'lad']}
                        else:

                            params = {'n_estimators': [100, 300, 600],
                                      'max_depth': [5, 10, None],
                                      'learning_rate': [.001, .01, .1],
                                      'loss': ['deviance', 'exponential']}


                    elif Best_clf == 'Random Forest':
                        #  print('Best_clf = dt ou rf')

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [10, 100, 300],
                                      'max_depth': [5, 10, None],
                                      'criterion': ['mse', 'mae']}

                        else:

                            params = {'n_estimators': [10, 100, 300],
                                      'max_depth': [5, 10, None],
                                      'criterion': ['gini', 'entropy']}

                    elif Best_clf == 'Decision Tree':

                        if self.type_esti == 'regressor':

                            params = {'max_depth': [5, 10, 50, None],
                                      'criterion': ['mse', 'friedman_mse', 'mae']}

                        else:

                            params = {'max_depth': [5, 10, 50, None],
                                      'criterion': ['gini', 'entropy']}


                    elif Best_clf == 'XGBoost':
                        # print('Best_clf = xgb')

                        params = {'eta': [.01, .1, .3],
                                  'max_depth': [5, 10, 15],
                                  'gamma': [0, .1, .01]}

                    elif Best_clf == 'Bagging':
                        # print('best_clf = bag)')

                        params = {'n_estimators': [100, 300, 600]}

                    elif Best_clf == 'KNN':

                        params = {'n_neighbors': [2, 5, 10, 30, 40],
                                  'p': [1, 2]}

                    elif Best_clf == 'SVM':

                        params = {'C': {1, .5, .1, 5},
                                  'tol': [.01, .001, .1, .0001]}



                else:

                    if Best_clf == 'Extra Tree':

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [10, 100, 300, 600, 1000, 1200],
                                      'criterion': ['mae', 'mse'],
                                      'max_depth': [None, 5, 10, 15, 20, 25]}

                        else:

                            params = {'n_estimators': [10, 100, 300, 600, 1000, 1200],
                                      'criterion': ['gini', 'entropy'],
                                      'max_depth': [None, 5, 10, 15, 20, 25]}

                    if Best_clf == 'Gradient Boosting':

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [100, 300, 600, 1000, 1200],
                                      'max_depth': [5, 10, 15, 25, None],
                                      'learning_rate': [.001, .01, .1],
                                      'loss': ['ls', 'lad', 'huber', 'quantile'],
                                      'criterion': ['mse', 'friedman_mse']}
                        else:

                            params = {'n_estimators': [100, 300, 600, 1000, 1200],
                                      'max_depth': [5, 10, 15, 25, None],
                                      'learning_rate': [.001, .01, .1],
                                      'loss': ['deviance', 'exponential'],
                                      'criterion': ['mse', 'friedman_mse']}


                    elif Best_clf == 'Random Forest':
                        #  print('Best_clf = dt ou rf')

                        if self.type_esti == 'regressor':

                            params = {'n_estimators': [10, 100, 300, 600, 1000, 1200],
                                      'max_depth': [5, 10, 15, 20, 25, None],
                                      'criterion': ['mse', 'mae']}

                        else:

                            params = {'n_estimators': [10, 100, 300, 600, 1000, 1200],
                                      'max_depth': [5, 10, 15, 20, 25],
                                      'criterion': ['gini', 'entropy']}

                    elif Best_clf == 'Decision Tree':

                        if params == 'regressor':

                            params = {'max_depth': [5, 10, 50, 100, None],
                                      'criterion': ['mse', 'friedman_mse', 'mae'],
                                      'splitter': ['best', 'random']}

                        else:

                            params = {'max_depth': [5, 10, 50, 100, None],
                                      'criterion': ['gini', 'entropy'],
                                      'splitter': ['best', 'random']}


                    elif Best_clf == 'XGBoost':
                        # print('Best_clf = xgb')

                        params = {'eta': [0.001, .01, .1, .3, 1],
                                  'max_depth': [5, 10, 15, 20, 25],
                                  'gamma': [0, .1, .01, .001]}

                    elif Best_clf == 'Bagging':
                        # print('best_clf = bag)')

                        params = {'n_estimators': [100, 300, 600, 1000, 1200, 1500]}

                    elif Best_clf == 'KNN':

                        params = {'n_neighbors': [2, 5, 10, 30, 40, 70, 100],
                                  'p': [1, 2, 3]}

                    elif Best_clf == 'SVM':

                        params = {'C': {1, .5, .1, 5, .01, .001},
                                  'tol': [.01, .001, .1, .0001, 1],
                                  'kernel': ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed']}

            if self.hard_grid:
                print('\n Searching for the best hyperparametres of {} using hard_grid on {} data among : \n'.format(
                    Best_clf, n_grid))

            else:
                print('\n Searching for the best hyperparametres of {} on {} data among : \n'.format(Best_clf, n_grid))
            print('{} \n'.format(params))
            # print('Starting GridSearchCV using {} Classifier with {} folds \n'.format(Best_clf, cv_grid))

            
        
            clf = clfs[max(clfs.keys(), key=(lambda k: clfs[k]['mean']))]['clf']

            
            self.gr = GridSearchCV(clf, param_grid=params, cv=self.cv_grid, scoring=scoring, 
                                    verbose=1, refit=True, iid=True, n_jobs = -1)

            self.gr.fit(X_tr[0:n_grid], np.ravel(Y_tr[0:n_grid]))
            


            print('\n Finally, the best estimator is : {} {}'.format(Best_clf, self.type_esti))

            print('\n Using these hyperparametres : {}'.format(self.gr.best_params_))

            print('\n With this {} score : {}'.format(loss, self.gr.best_score_))
            
            self.Decision_Function = self.gr.best_estimator_
            
            #print(self.gr.classes_)
            
            
        else:
            print('\n Best {} : {}'.format(self.type_esti, Best_clf))
            
            
            
    def ReFit(self, Train, Target, ID = 'ID', target_ID = 'ID', value = 0):
        
        train = Train.copy()
        target = Target.copy()
        
        train = self.Transform(train, value = value, ID = ID)
        #target = self.Transform(target, value = value, ID = ID)
        
        self.estim = self.Decision_Function.fit(train, target)
        
        return(self.estim)

    
    
    def Transform(self, Data, value=0, ID = 'ID'):
        
        Test = Data.copy()
        
        if ID != None:

            Test.drop([ID], axis=1, inplace=True)

        if type(value) == int:
            Test.fillna(value, inplace=True)

        elif value == 'bfill':
            Test.fillna('bfill', inplace=True)

        elif value == 'ffill':
            Test.fillna('ffill', inplace=True)

        for i in Test.columns:  ###########
            if Test[i].dtype == float:
                Test[i] = Test[i].astype('int')

            elif Test[i].dtype == object:
                encoder = LabelEncoder()
                encoder.fit(list(Test[i]))
                Test[i] = encoder.transform(list(Test[i]))
        return(Test)
    
    
    
    def pred_grid(self, Test, ID = 'ID', value = 0, prob = False):

        if ID == None:
            test = self.Transform(Test, ID = None, value = 0).copy()
        else :
            test = self.Transform(Test, ID = ID, value = 0).copy()
            
        if prob == False:
            pred = pd.DataFrame()
            predict = self.gr.predict(test)

            if ID == None:
                pred['Target'] = predict
            else :
                pred[ID] = Test[ID]
                pred['Target'] = predict

        else:
            if ID == None:
                pred = pd.DataFrame(self.gr.predict_proba(test), columns = self.gr.classes_)

            else :
                pred = pd.DataFrame(self.gr.predict_proba(test), columns = self.gr.classes_)
                pred.insert(loc = 0, column = ID, value = Test[ID])

        return(pred)
    
    
    
    
    def pred(self, Test, ID=None, value = 0, target_ID = None, n = 1000, prob = False):
        
        if ID == None:
            test = self.Transform(Test, ID = None, value = 0).copy()
        else :
            test = self.Transform(Test, ID = ID, value = 0).copy()
            
    
        if self.estim == None:
            self.estim = self.ReFit(self.Data[0:n], self.Target[0:n], ID = None, target_ID = None, value = 0)
            if prob == False:
                pred = pd.DataFrame()
                predict = self.estim.predict(test)
                
                if ID == None:
                    pred['Target'] = predict
                else :
                    pred[ID] = Test[ID]
                    pred['Target'] = predict
                
            else:
                if ID == None:
                    pred = pd.DataFrame(self.estim.predict_proba(test), columns = self.estim.classes_)

                else :
                    pred = pd.DataFrame(self.estim.predict_proba(test), columns = self.estim.classes_)
                    pred.insert(loc = 0, column = ID, value = Test[ID])
                    
            
        else:
            pred = pd.DataFrame()
            if prob == False:
      
                if prob == False:
                    pred = pd.DataFrame()
                    predict = self.estim.predict(test)
                
                    if ID == None:
                        pred['Target'] = predict
                    else :
                        pred[ID] = Test[ID]
                        pred['Target'] = predict
                
            else:
                if ID == None:
                    pred = pd.DataFrame(self.estim.predict_proba(test), columns = self.estim.classes_)

                else :
                    pred = pd.DataFrame(self.estim.predict_proba(test), columns = self.estim.classes_)
                    pred.insert(loc = 0, column = ID, value = Test[ID])
            
        return(pred)

            
            
    def custom_grid(self, Train, Target, ID = 'ID', target_ID = True,
                    n = 1000, metric = 'AUC', params = None, cv = 3, DF = None, value = 0):


        target = Target.copy()
        loss = metric
        
        for i in target.columns:
            if target[i].dtype == object:
                le = LabelEncoder()
                le.fit(list(target[i]))
                target[i] = le.transform(list(target[i]))

        if ID != None:
            train = self.Transform(Train, ID = ID, value=value)
            if target_ID:
                target.drop([ID], axis=1, inplace=True)
        if DF == None:
            DF = self.Decision_Function

        #if metric == 'AUC':
         #   metric = self.AUC
          #  loss = 'AUC'

        gr = GridSearchCV(DF, param_grid=params, cv=cv, scoring=metric, n_jobs=-1,
                          verbose=1, refit=True, iid=True);

        gr.fit(train[0:n], np.ravel(target[0:n]))

        print('\n Best hyperparametres : {}'.format(gr.best_params_))

        print('\n Giving this {} score : {}'.format(loss, gr.best_score_))







     # def grid(self, clf, params, cv=3, n=100000):
     #
     #     X_tr, X_te, Y_tr, Y_te = train_test_split(self.Data, self.Target, random_state=0, test_size=1 / 3)
     #
     #     gr = GridSearchCV(clf, param_grid=params, cv=cv, scoring=self.AUC, n_jobs=-1,
     #                       verbose=1, refit=True, iid=True);
     #
     #     gr.fit(X_tr[0:n], np.ravel(Y_tr[0:n]))
     #
     #     # print(' Best score :', gr.best_score_,   '\n Using this parametres :', gr.best_params_, '\n With :', clf)
     #     print(' Best score on Train:', gr.best_score_, '\n Using this parametres :', gr.best_params_,
     #           '\n With : \n {} '.format(clf))
     #     return gr







  from numpy.core.umath_tests import inner1d


### Multiclass

In [2]:
Train = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\Train.csv", sep = ',')
Target = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\Target.csv", sep = ';')

In [10]:
clf = BestEstimator(type_esti = 'classifier', 
                 cv = 3, 
                 grid = True, 
                 hard_grid = False,
                 cv_grid = 3)

clf.fit(Train, Target,
            ID = 'ID',
            target_ID = True,
            n = 1000,
            n_grid = 1000,
            value_nan = 0,
            view_nan = True,
           params = False,
        scoring = 'accuracy') 

Missing Values :

                      Total       %
WARRANTIES_PRICE      96603  96.603
SHIPPING_PRICE        67610  67.610
BUYER_BIRTHDAY_DATE    5836   5.836
SHIPPING_MODE           315   0.315
PRICECLUB_STATUS         57   0.057
SELLER_SCORE_AVERAGE      6   0.006
SELLER_SCORE_COUNT        6   0.006 

NaN data filled by 0 

Searching for the best regressor on 1000 data using accuracy loss... 


 Bagging: 0.4602 (+/- 0.0583)

 Gradient Boosting: 0.4691 (+/- 0.0145)


  if diff:
  if diff:
  if diff:



 XGBoost: 0.4900 (+/- 0.0156)

 Random Forest: 0.5051 (+/- 0.0158)

 Decision Tree: 0.3450 (+/- 0.0082)

 Extra Tree: 0.4790 (+/- 0.0169)

 KNN: 0.4470 (+/- 0.0126)

 SVM: 0.4870 (+/- 0.0027)

 Searching for the best hyperparametres of Random Forest on 1000 data among : 

{'n_estimators': [10, 100, 300], 'max_depth': [5, 10, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   20.2s finished



 Finally, the best estimator is : Random Forest classifier

 Using these hyperparametres : {'criterion': 'gini', 'max_depth': None, 'n_estimators': 300}

 With this accuracy score : 0.511


### Binary

In [11]:
Train = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\Train1.csv", sep = ';')
target = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\Target1.csv", sep = ';')

Target = pd.DataFrame()
Target['ID'] = target['ID']
Target['Target'] = [np.nan]*target.shape[0]


for i in range(target.shape[0]):
    if target['Target'][i] == 1:
        Target['Target'][i] ='+'
    else :
        Target['Target'][i] ='-'
        
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
clf = BestEstimator(type_esti = 'classifier', 
                 cv = 3, 
                 grid = True, 
                 hard_grid = False,
                 cv_grid = 3)

clf.fit(Train, Target,
            ID = 'ID',
            target_ID = True,
            n = 1000,
            n_grid = 1000,
            value_nan = 0,
            view_nan = False,
           params = False,
        scoring = 'roc_auc') 

NaN data filled by 0 

Searching for the best regressor on 1000 data using roc_auc loss... 


 Bagging: 0.6963 (+/- 0.0254)

 Gradient Boosting: 0.7104 (+/- 0.0125)

 XGBoost: 0.7121 (+/- 0.0225)

 Random Forest: 0.7269 (+/- 0.0375)

 Decision Tree: 0.6295 (+/- 0.0285)

 Extra Tree: 0.6647 (+/- 0.0102)

 KNN: 0.6367 (+/- 0.0594)

 SVM: 0.5040 (+/- 0.0054)

 Searching for the best hyperparametres of Random Forest on 1000 data among : 

{'n_estimators': [10, 100, 300], 'max_depth': [5, 10, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   28.2s finished



 Finally, the best estimator is : Random Forest classifier

 Using these hyperparametres : {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 300}

 With this roc_auc score : 0.7438012216910522


### Regression

In [7]:
Train = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\input_training.csv", sep = ';')
Target = pd.read_csv(r"C:\Users\jecombe\OneDrive - Capgemini\Notebooks\Target_Engie.csv", sep = ';')

In [None]:
clf = BestEstimator(type_esti = 'regressor', 
                 cv = 3, 
                 grid = True, 
                 hard_grid = True,
                 cv_grid = 3)

clf.fit(Train, Target,
            ID = 'ID',
            target_ID = True,
            n = 500,
            n_grid = 500,
            value_nan = 0,
            view_nan = True,
           params = False,
        scoring = 'r2') 

Missing Values :

                                Total          %
Grid_voltage                   101322  16.411451
Grid_voltage_std               101322  16.411451
Grid_voltage_max               101322  16.411451
Grid_voltage_min               101322  16.411451
Gearbox_inlet_temperature        8064   1.306152
Generator_converter_speed        8064   1.306152
Generator_converter_speed_min    8064   1.306152
Generator_converter_speed_max    8064   1.306152
Generator_converter_speed_std    8064   1.306152
Gearbox_inlet_temperature_min    8064   1.306152
Gearbox_inlet_temperature_max    8064   1.306152
Gearbox_inlet_temperature_std    8064   1.306152
Absolute_wind_direction_c          72   0.011662
Nacelle_angle_c                    72   0.011662 

NaN data filled by 0 



In [None]:
clf.pred_grid(Train[0:50], ID = 'ID', prob = False)

In [None]:
clf.pred(Train[0:50], ID = 'ID', prob = False, n = 1000)

In [9]:
%run -i bestestimator_v2.py

In [4]:
 params = {'n_estimators': [10, 100, 300],
           'max_depth': [5, 10, None]}

In [10]:
clf = BestEstimator(type_esti = 'regressor', 
                 cv = 3, 
                 grid = True, 
                 hard_grid = False,
                 cv_grid = 3)

clf.custom_grid(Train, Target, ID = 'ID', target_ID = True,
                    n = 1000, metric = 'r2', params = params, cv = 3, DF = RandomForestRegressor())

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   19.1s finished



 Best hyperparametres : {'max_depth': 10, 'n_estimators': 100}

 Giving this r2 score : -1.0361581538638591
