## Version 1

In [1]:
import pandas as pd
import numpy as np
import operator

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelBinarizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Return the ROC AUC of label predictions after one-vs-rest binarisation.

    A ``LabelBinarizer`` is fitted on ``y_test`` only; both ``y_test`` and
    ``y_pred`` are then transformed with it and handed to ``roc_auc_score``.

    Args:
        y_test: true labels.
        y_pred: predicted labels, encoded with the binarizer fitted on ``y_test``.
        average: averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    binarizer = LabelBinarizer().fit(y_test)
    truth = binarizer.transform(y_test)
    guess = binarizer.transform(y_pred)
    return roc_auc_score(truth, guess, average=average)



class classif(object):
    """Compare several classifiers on a Train/Test/Target triple and tune the best one.

    Typical use: build the object, call ``best_clf`` (it preprocesses the stored
    frames in place, cross-validates every candidate and can grid-search the
    winner), then call ``pred`` with the fitted search object.

    Attributes:
        Train: copy of the training features.
        Test: the caller's test features, stored by reference (see ``__init__``).
        Target: copy of the labels.
        dim_train, dim_test: shapes of the frames as given to the constructor.
        AUC: scorer built from ``multiclass_roc_auc_score``.
        scores_: candidate name -> cross-validation scores, set by ``best_clf``.
        best_name_: name of the best-scoring candidate, set by ``best_clf``.
    """

    # Default GridSearchCV grids, keyed by the candidate names of ``_candidates``.
    # Every value is a list: a set (as 'C' used to be) is rejected by scikit-learn.
    _DEFAULT_GRIDS = {
        'Gradient Boosting': {'n_estimators': [100, 300, 600],
                              'max_depth': [5, 10, None],
                              'learning_rate': [.001, .01, .1]},
        'Random Forest': {'n_estimators': [10, 100, 300],
                          'max_depth': [5, 10, None],
                          'criterion': ['gini', 'entropy']},
        'Decision Tree': {'max_depth': [5, 10, 50, None],
                          'criterion': ['gini', 'entropy']},
        'XGBoost': {'eta': [.01, .1, .3],
                    'max_depth': [5, 10, None],
                    'gamma': [0, .1, .01]},
        'Bagging': {'n_estimators': [100, 300, 600]},
        'KNN': {'n_neighbors': [2, 5, 10, 30, 40],
                'p': [1, 2]},
        'SVM': {'C': [1, .5, .1, 5],
                'tol': [.01, .001, .1, .0001]},
    }

    def __init__(self, Train, Test, Target):
        """Store the data and build the AUC scorer.

        Args:
            Train: training features (copied).
            Test: test features (NOT copied).
            Target: labels (copied).
        """
        self.Train = Train.copy()
        # NOTE(review): Test is kept by reference, so best_clf preprocesses the
        # caller's frame in place; pred(Test, gr, same=True) appears to rely on
        # that -- confirm before switching to a copy.
        self.Test = Test
        self.Target = Target.copy()
        self.dim_test = Test.shape
        self.dim_train = Train.shape

        self.AUC = make_scorer(multiclass_roc_auc_score)
        self.scores_ = {}
        self.best_name_ = None
        self._prepared = False

    @staticmethod
    def _candidates():
        """Return the candidate estimators, in the order they are evaluated."""
        return {
            'Bagging': BaggingClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'XGBoost': XGBClassifier(),
            'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
            'Decision Tree': DecisionTreeClassifier(),
            'KNN': KNeighborsClassifier(n_jobs=-1),
            'SVM': SVC(gamma='auto'),
        }

    @staticmethod
    def _fill_missing(frame, value):
        """Fill NaN in ``frame`` in place.

        'bfill' / 'ffill' propagate the next / previous valid value; a number is
        used as a constant. Any other value leaves the frame untouched.
        """
        if isinstance(value, str):
            # fillna('bfill') would write the literal string into the cells.
            if value == 'bfill':
                frame.bfill(inplace=True)
            elif value == 'ffill':
                frame.ffill(inplace=True)
        elif isinstance(value, (int, float, np.number)):
            frame.fillna(value, inplace=True)

    @staticmethod
    def _missing_report(frame):
        """Return count and percentage of NaN for the columns that contain some."""
        total = frame.isnull().sum().sort_values(ascending=False)
        percent = total / len(frame) * 100
        report = pd.concat([total, percent], axis=1, keys=['Total', '%'])
        return report[percent > 0]

    @staticmethod
    def _encode_frame(frame):
        """Cast float columns to int and label-encode object columns, in place."""
        for col in frame.columns:
            if frame[col].dtype == float:
                # NOTE(review): this truncates the fractional part -- confirm intended.
                frame[col] = frame[col].astype('int')
            elif frame[col].dtype == object:
                encoder = LabelEncoder()
                encoder.fit(list(frame[col]))
                frame[col] = encoder.transform(list(frame[col]))

    def _search(self, clf, params, cv, X, y):
        """Fit and return a GridSearchCV of ``clf`` over ``params`` scored with AUC."""
        # ``iid`` is no longer passed: it was removed in scikit-learn 0.24.
        gr = GridSearchCV(clf, param_grid=params, cv=cv, scoring=self.AUC, n_jobs=-1,
                          verbose=1, refit=True)
        gr.fit(X, y)
        return gr

    def best_clf(self, params=False, ID='ID', cv=3, cv_grid=3, n=10000, n_grid=10000, value=0,
                 view_nan=False, grid=False):
        """Preprocess the data, cross-validate every candidate and optionally tune the best.

        The stored frames are modified in place: the ID column is dropped, NaN
        are filled, float columns are cast to int and object columns are
        label-encoded (encoders are fitted on Train and Test values together).
        The ID column is dropped only on the first call, so the method can be
        called again (e.g. first with ``grid=False``, then with ``grid=True``).

        Args:
            params: custom grid for GridSearchCV; ``False``/``None`` selects the
                built-in grid of the winning candidate.
            ID: name of the identifier column present in Train, Test and Target.
            cv: number of folds for the candidate comparison.
            cv_grid: number of folds for the grid search.
            n: number of training rows used for the comparison.
            n_grid: number of training rows used for the grid search.
            value: number used to fill NaN, or 'bfill' / 'ffill'. A directional
                fill can leave NaN at the edge of a column, in which case the
                integer cast below raises.
            view_nan: print a missing-value report for Train and Test.
            grid: run a grid search on the best candidate.

        Returns:
            The fitted GridSearchCV when ``grid`` is true, otherwise ``None``
            (scores are then available in ``scores_`` / ``best_name_``).
        """
        if not getattr(self, '_prepared', False):
            for frame in (self.Train, self.Test, self.Target):
                frame.drop([ID], axis=1, inplace=True)
            self._prepared = True

        if view_nan:
            print("Missing Values :\n")
            for label, frame in (('Train', self.Train), ('Test', self.Test)):
                print(label)
                print("{} \n".format(self._missing_report(frame)))

        self._fill_missing(self.Train, value)
        self._fill_missing(self.Test, value)

        # Both frames must be NaN-free; the former `a == False & b == False`
        # only ever looked at Train because `&` binds tighter than `==`.
        train_has_nan = self.Train.isnull().any().any()
        test_has_nan = self.Test.isnull().any().any()
        if not train_has_nan and not test_has_nan:
            print('\n Train & Test NaN data filled by {} \n'.format(value))
        else:
            print('Fail to fill Train et Test')

        for col in self.Train.columns:
            if self.Train[col].dtype == float:
                # NOTE(review): truncates the fractional part -- confirm intended.
                self.Train[col] = self.Train[col].astype('int')

            if self.Train[col].dtype == object:
                # One encoder per column, fitted on both frames so that Train
                # and Test share the same codes.
                encoder = LabelEncoder()
                encoder.fit(list(self.Train[col]) + list(self.Test[col]))
                self.Train[col] = encoder.transform(list(self.Train[col]))
                self.Test[col] = encoder.transform(list(self.Test[col]))

        for col in self.Test.columns:
            if self.Test[col].dtype == float:
                self.Test[col] = self.Test[col].astype('int')

        for col in self.Target.columns:
            if self.Target[col].dtype == object:
                le = LabelEncoder()
                le.fit(list(self.Target[col]))
                self.Target[col] = le.transform(list(self.Target[col]))

        X_tr, X_te, Y_tr, Y_te = train_test_split(self.Train, self.Target,
                                                  random_state=0, test_size=1/3)

        print('\n Searching best classifier... \n')

        candidates = self._candidates()
        features = np.asarray(X_tr[0:n])
        labels = np.ravel(Y_tr[0:n])
        self.scores_ = {}
        for name, estimator in candidates.items():
            scores = cross_val_score(estimator, features, labels, cv=cv, scoring=self.AUC)
            self.scores_[name] = scores
            print("{} \n".format("%s: %0.4f (+/- %0.4f)" % (name, scores.mean(), scores.std() * 2)))

        # First candidate wins on ties (dict order is the evaluation order).
        Best_clf = max(candidates, key=lambda key: self.scores_[key].mean())
        self.best_name_ = Best_clf

        if not grid:
            return None

        if params is False or params is None:
            # Copy so the class-level defaults can never be mutated through the search.
            params = {key: list(values) for key, values in self._DEFAULT_GRIDS[Best_clf].items()}

        print('\n Searching best hyperparametres of {} Classifier among : \n'.format(Best_clf))
        print('{} \n'.format(params))

        gr = self._search(candidates[Best_clf], params, cv_grid,
                          X_tr[0:n_grid], np.ravel(Y_tr[0:n_grid]))

        print(' Best score :', gr.best_score_, '\n Using this parametres :', gr.best_params_)
        return gr

    def grid(self, clf, params, cv=3, n=100000):
        """Grid-search ``clf`` over ``params`` on the stored training split.

        Uses ``self.Train`` / ``self.Target`` as they currently are; they are
        only preprocessed once ``best_clf`` has been called.

        Args:
            clf: estimator to tune.
            params: parameter grid for GridSearchCV.
            cv: number of folds.
            n: number of training rows used.

        Returns:
            The fitted GridSearchCV.
        """
        X_tr, X_te, Y_tr, Y_te = train_test_split(self.Train, self.Target,
                                                  random_state=0, test_size=1/3)

        gr = self._search(clf, params, cv, X_tr[0:n], np.ravel(Y_tr[0:n]))

        print(' Best score :', gr.best_score_, '\n Using this parametres :', gr.best_params_,
              '\n With : \n {} '.format(clf))
        return gr

    def feature_eng(self, Test, value=0, ID='ID'):
        """Preprocess a new frame in place: drop ``ID``, fill NaN, encode columns.

        NOTE(review): object columns are label-encoded with an encoder fitted on
        this frame alone, so the codes need not match those used for training.

        Args:
            Test: frame to preprocess (modified in place).
            value: number used to fill NaN, or 'bfill' / 'ffill'.
            ID: name of the identifier column to drop.

        Returns:
            The same ``Test`` object, for convenience.
        """
        Test.drop([ID], axis=1, inplace=True)
        self._fill_missing(Test, value)
        self._encode_frame(Test)
        return Test

    def pred(self, Test, gr, prob=False, same=True, ID='ID', value=0):
        """Predict with a fitted estimator or search object.

        Args:
            Test: features to predict on.
            gr: fitted object exposing ``predict`` (and ``predict_proba`` when
                ``prob`` is true).
            prob: return ``gr.predict_proba(Test)`` instead of labels.
            same: ``True`` when ``Test`` is already preprocessed (e.g. the frame
                given to the constructor after ``best_clf``); ``False`` to run
                ``feature_eng`` on it first.
            ID: identifier column, used only when ``same`` is false.
            value: NaN fill value, used only when ``same`` is false.

        Returns:
            A DataFrame with a single 'Target' column, or the output of
            ``predict_proba`` when ``prob`` is true.
        """
        if not same:
            self.feature_eng(Test, value=value, ID=ID)

        if prob:
            return gr.predict_proba(Test)

        Pred = pd.DataFrame()
        Pred['Target'] = gr.predict(Test)
        return Pred
       
      
    
    
		
		


  from numpy.core.umath_tests import inner1d


### Test Rakuten Data

In [None]:
# Demo cell: run the Version 1 `classif` helper on the Rakuten CSV files.
#%run -i 'classif.py'


# Test.csv is read twice: `classif` keeps a reference to `Test` and `best_clf`
# edits it in place, while `Test1` is a separate frame that is left as loaded.
Train = pd.read_csv('Train.csv', sep = ',')
Test = pd.read_csv('Test.csv', sep = ',')
Test1 = pd.read_csv('Test.csv', sep = ',')
Target = pd.read_csv('Target.csv', sep =';')

CLF = classif(Train, Test, Target)


# Custom grid; not used by the call below, which passes params = False.
params = {'n_estimators' : [50,100, 300], 'max_depth' : [3,5, None]} #,0.05,0.1]}

# Compare the candidates on 1000 rows, then grid-search the winner on 1000 rows.
gr = CLF.best_clf(params = False , n = 1000, n_grid = 1000, view_nan = True, grid = True)
#gr = CLF.best_clf(params = False , n = 1000, n_grid = 100, view_nan = True, grid = True, pred_Test = True, prob = True)


#CLF.pred(Test,gr)




#gr = CLF.grid(clf, cv = 2, n = 1000, params = params)

#CLF.pred(Test, gr, same = True, ID = 'ID', value = 0)

Missing Values :

Train
                      Total       %
WARRANTIES_PRICE      96603  96.603
SHIPPING_PRICE        67610  67.610
BUYER_BIRTHDAY_DATE    5836   5.836
SHIPPING_MODE           315   0.315
PRICECLUB_STATUS         57   0.057
SELLER_SCORE_AVERAGE      6   0.006
SELLER_SCORE_COUNT        6   0.006 

Test
                      Total          %
WARRANTIES_PRICE      96688  96.692835
SHIPPING_PRICE        67430  67.433372
BUYER_BIRTHDAY_DATE    5785   5.785289
SHIPPING_MODE           357   0.357018
PRICECLUB_STATUS         73   0.073004
SELLER_SCORE_AVERAGE     13   0.013001
SELLER_SCORE_COUNT       13   0.013001 


 Train & Test NaN data filled by 0 


 Searching best classifier... 

Bagging: 0.5353 (+/- 0.0081) 

Gradient Boosting: 0.5424 (+/- 0.0086) 



  if diff:
  if diff:
  if diff:


XGBoost: 0.5363 (+/- 0.0175) 

Random Forest: 0.5393 (+/- 0.0187) 

Decision Tree: 0.5491 (+/- 0.0236) 

KNN: 0.5182 (+/- 0.0040) 

SVM: 0.5029 (+/- 0.0040) 


 Searching best hyperparametres of Decision Tree Classifier among : 

{'max_depth': [5, 10, 50, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [24]:
#CLF.pred(Test, gr, same = True, ID = 'ID', value = 0)

### Test Sogec Data

In [None]:
# Demo cell: run the Version 1 `classif` helper on the Sogec CSV files
# (all three files are ';'-separated).
Train = pd.read_csv('Train_sg.csv', sep = ';')
Test = pd.read_csv('Test_sg.csv', sep = ';')
Target = pd.read_csv('Target_sg.csv', sep =';')

CLF = classif(Train, Test, Target)


# Custom grid; not used by the call below, which passes params = False.
params = {'n_estimators' : [50,100, 300], 'max_depth' : [3,5, None]} #,0.05,0.1]}

# Compare the candidates on 1000 rows, then grid-search the winner on 50 rows.
gr = CLF.best_clf(params = False , n = 1000, n_grid = 50, view_nan = True, grid = True)

#CLF.pred(Test,gr)


## Version 2

In [7]:
import pandas as pd
import numpy as np
import operator

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelBinarizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute ROC AUC for label predictions via one-vs-rest binarisation.

    The ``LabelBinarizer`` is fitted on ``y_test`` alone and then applied to
    both label vectors before they are passed to ``roc_auc_score``.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        average: averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    encoder = LabelBinarizer().fit(y_test)
    true_matrix = encoder.transform(y_test)
    pred_matrix = encoder.transform(y_pred)
    return roc_auc_score(true_matrix, pred_matrix, average=average)



class BestEstimator(object):
    """Compare several classifiers on a Data/Target pair and tune the best one.

    Typical use: build the object, call ``best_clf`` (it preprocesses the stored
    frames in place, cross-validates every candidate and can grid-search the
    winner), then call ``pred`` with the fitted search object.

    Attributes:
        Data: copy of the features.
        Target: copy of the labels.
        dim_: shape of ``Data`` as given to the constructor.
        AUC: scorer built from ``multiclass_roc_auc_score``.
        scores_: candidate name -> cross-validation scores, set by ``best_clf``.
        best_name_: name of the best-scoring candidate, set by ``best_clf``.
    """

    # Default GridSearchCV grids, keyed by the candidate names of ``_candidates``.
    # Every value is a list: a set (as 'C' used to be) is rejected by scikit-learn.
    _DEFAULT_GRIDS = {
        'Gradient Boosting': {'n_estimators': [100, 300, 600],
                              'max_depth': [5, 10, None],
                              'learning_rate': [.001, .01, .1]},
        'Random Forest': {'n_estimators': [10, 100, 300],
                          'max_depth': [5, 10, None],
                          'criterion': ['gini', 'entropy']},
        'Decision Tree': {'max_depth': [5, 10, 50, None],
                          'criterion': ['gini', 'entropy']},
        'XGBoost': {'eta': [.01, .1, .3],
                    'max_depth': [5, 10, None],
                    'gamma': [0, .1, .01]},
        'Bagging': {'n_estimators': [100, 300, 600]},
        'KNN': {'n_neighbors': [2, 5, 10, 30, 40],
                'p': [1, 2]},
        'SVM': {'C': [1, .5, .1, 5],
                'tol': [.01, .001, .1, .0001]},
    }

    def __init__(self, Data, Target):
        """Store copies of the data and build the AUC scorer.

        Args:
            Data: features.
            Target: labels.
        """
        self.Data = Data.copy()
        self.Target = Target.copy()
        self.dim_ = Data.shape

        self.AUC = make_scorer(multiclass_roc_auc_score)
        self.scores_ = {}
        self.best_name_ = None
        self._prepared = False

    @staticmethod
    def _candidates():
        """Return the candidate estimators, in the order they are evaluated."""
        return {
            'Bagging': BaggingClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'XGBoost': XGBClassifier(),
            'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
            'Decision Tree': DecisionTreeClassifier(),
            'KNN': KNeighborsClassifier(n_jobs=-1),
            'SVM': SVC(gamma='auto'),
        }

    @staticmethod
    def _fill_missing(frame, value):
        """Fill NaN in ``frame`` in place.

        'bfill' / 'ffill' propagate the next / previous valid value; a number is
        used as a constant. Any other value leaves the frame untouched.
        """
        if isinstance(value, str):
            # fillna('bfill') would write the literal string into the cells.
            if value == 'bfill':
                frame.bfill(inplace=True)
            elif value == 'ffill':
                frame.ffill(inplace=True)
        elif isinstance(value, (int, float, np.number)):
            frame.fillna(value, inplace=True)

    @staticmethod
    def _encode_frame(frame):
        """Cast float columns to int and label-encode object columns, in place."""
        for col in frame.columns:
            if frame[col].dtype == float:
                # NOTE(review): this truncates the fractional part -- confirm intended.
                frame[col] = frame[col].astype('int')
            elif frame[col].dtype == object:
                encoder = LabelEncoder()
                encoder.fit(list(frame[col]))
                frame[col] = encoder.transform(list(frame[col]))

    def _search(self, clf, params, cv, X, y):
        """Fit and return a GridSearchCV of ``clf`` over ``params`` scored with AUC."""
        # ``iid`` is no longer passed: it was removed in scikit-learn 0.24.
        gr = GridSearchCV(clf, param_grid=params, cv=cv, scoring=self.AUC, n_jobs=-1,
                          verbose=1, refit=True)
        gr.fit(X, y)
        return gr

    def best_clf(self,
                 type_esti='classifier',
                 params=False,
                 ID='ID',
                 target_ID=True,
                 cv=3,
                 grid=False,
                 cv_grid=3,
                 n=10000,
                 n_grid=10000,
                 value=0,
                 view_nan=False,
                 scorer='mae'):
        """Preprocess the data, cross-validate every candidate and optionally tune the best.

        The stored frames are modified in place: the ID column is dropped, NaN
        are filled, float columns are cast to int and object columns are
        label-encoded. The ID column is dropped only on the first call, so the
        method can be called again (e.g. first with ``grid=False``, then with
        ``grid=True``).

        Args:
            type_esti: 'classifier' or 'regressor'; only 'classifier' is
                implemented, anything else returns ``None`` after preprocessing.
            params: custom grid for GridSearchCV; ``False``/``None`` selects the
                built-in grid of the winning candidate.
            ID: name of the identifier column of ``Data``.
            target_ID: whether ``Target`` also carries the ``ID`` column.
            cv: number of folds for the candidate comparison.
            grid: run a grid search on the best candidate.
            cv_grid: number of folds for the grid search.
            n: number of training rows used for the comparison.
            n_grid: number of training rows used for the grid search.
            value: number used to fill NaN, or 'bfill' / 'ffill'. A directional
                fill can leave NaN at the edge of a column, in which case the
                integer cast raises.
            view_nan: print a missing-value report.
            scorer: reserved for the regressor mode; currently unused.

        Returns:
            The fitted GridSearchCV when ``type_esti`` is 'classifier' and
            ``grid`` is true, otherwise ``None`` (scores are then available in
            ``scores_`` / ``best_name_``).
        """
        if not getattr(self, '_prepared', False):
            self.Data.drop([ID], axis=1, inplace=True)
            if target_ID:
                self.Target.drop([ID], axis=1, inplace=True)
            self._prepared = True

        if view_nan:
            print("Missing Values :\n")

            total = self.Data.isnull().sum().sort_values(ascending=False)
            percent = total / len(self.Data) * 100
            missing_data = pd.concat([total, percent], axis=1, keys=['Total', '%'])
            print("{} \n".format(missing_data[percent > 0]))

        self._fill_missing(self.Data, value)

        if not self.Data.isnull().any().any():
            print('\n NaN data filled by {} \n'.format(value))
        else:
            print('Fail to fill NaN data')

        self._encode_frame(self.Data)

        # Only object targets are encoded; numeric targets are left as they are.
        for col in self.Target.columns:
            if self.Target[col].dtype == object:
                le = LabelEncoder()
                le.fit(list(self.Target[col]))
                self.Target[col] = le.transform(list(self.Target[col]))

        X_tr, X_te, Y_tr, Y_te = train_test_split(self.Data, self.Target,
                                                  random_state=0, test_size=1/3)

        if type_esti != 'classifier':
            # TODO: the 'regressor' mode is still to be developed.
            return None

        print('\n Searching for the best classifier on {} data... \n'.format(n))

        candidates = self._candidates()
        features = np.asarray(X_tr[0:n])
        labels = np.ravel(Y_tr[0:n])
        self.scores_ = {}
        for name, estimator in candidates.items():
            scores = cross_val_score(estimator, features, labels, cv=cv, scoring=self.AUC)
            self.scores_[name] = scores
            print("{} \n".format("%s: %0.4f (+/- %0.4f)" % (name, scores.mean(), scores.std() * 2)))

        # First candidate wins on ties (dict order is the evaluation order).
        Best_clf = max(candidates, key=lambda key: self.scores_[key].mean())
        self.best_name_ = Best_clf

        if not grid:
            return None

        if params is False or params is None:
            # Copy so the class-level defaults can never be mutated through the search.
            params = {key: list(values) for key, values in self._DEFAULT_GRIDS[Best_clf].items()}

        print('\n Searching best hyperparametres of {} Classifier on {} data among : \n'.format(Best_clf, n_grid))
        print('{} \n'.format(params))

        gr = self._search(candidates[Best_clf], params, cv_grid,
                          X_tr[0:n_grid], np.ravel(Y_tr[0:n_grid]))

        print('\n Finally, best estimator is : {} Classifier'.format(Best_clf), '\n Using these parametres :', gr.best_params_)
        return gr

    def grid(self, clf, params, cv=3, n=100000):
        """Grid-search ``clf`` over ``params`` on the stored training split.

        Uses ``self.Data`` / ``self.Target`` as they currently are; they are
        only preprocessed once ``best_clf`` has been called.

        Args:
            clf: estimator to tune.
            params: parameter grid for GridSearchCV.
            cv: number of folds.
            n: number of training rows used.

        Returns:
            The fitted GridSearchCV.
        """
        X_tr, X_te, Y_tr, Y_te = train_test_split(self.Data, self.Target,
                                                  random_state=0, test_size=1/3)

        gr = self._search(clf, params, cv, X_tr[0:n], np.ravel(Y_tr[0:n]))

        print(' Best score on Train:', gr.best_score_, '\n Using this parametres :', gr.best_params_,
              '\n With : \n {} '.format(clf))
        return gr

    def feature_eng(self, Test, value=0, ID='ID'):
        """Preprocess a new frame in place: drop ``ID``, fill NaN, encode columns.

        NOTE(review): object columns are label-encoded with an encoder fitted on
        this frame alone, so the codes need not match those used for training.

        Args:
            Test: frame to preprocess (modified in place).
            value: number used to fill NaN, or 'bfill' / 'ffill'.
            ID: name of the identifier column to drop.

        Returns:
            The same ``Test`` object, for convenience.
        """
        Test.drop([ID], axis=1, inplace=True)
        self._fill_missing(Test, value)
        self._encode_frame(Test)
        return Test

    def pred(self, Test, gr, prob=False, same=True, ID='ID', value=0):
        """Predict with a fitted estimator or search object.

        Args:
            Test: features to predict on.
            gr: fitted object exposing ``predict`` (and ``predict_proba`` when
                ``prob`` is true).
            prob: return ``gr.predict_proba(Test)`` instead of labels.
            same: ``True`` when ``Test`` is already preprocessed; ``False`` to
                run ``feature_eng`` on it first.
            ID: identifier column, used only when ``same`` is false.
            value: NaN fill value, used only when ``same`` is false.

        Returns:
            A DataFrame with a single 'Target' column, or the output of
            ``predict_proba`` when ``prob`` is true.
        """
        if not same:
            self.feature_eng(Test, value=value, ID=ID)

        if prob:
            return gr.predict_proba(Test)

        Pred = pd.DataFrame()
        Pred['Target'] = gr.predict(Test)
        return Pred
       
      
    
    
		
		


In [1]:
# Demo cell: time a full BestEstimator run on the Rakuten training data.
# The %run magic executes bestestimator.py in this notebook's namespace (-i).
%run -i bestestimator.py

import time
start_time = time.time()



Train = pd.read_csv('Train.csv', sep = ',')
Target = pd.read_csv('Target.csv', sep =';')

CLF = BestEstimator(Train, Target)


# Custom grid; not used by the call below, which passes params = False.
params = {'n_estimators' : [50,100, 300], 'max_depth' : [3,5, None]} #,0.05,0.1]}


# Compare the candidates on 20000 rows, then grid-search the winner on 20000 rows.
gr = CLF.best_clf(type_esti = 'classifier',ID = 'ID', 
                  params = False , n = 20000, n_grid = 20000, 
                  view_nan = True, grid = True, target_ID = True)

# Wall-clock time since start_time, i.e. loading plus model search.
print("--- %s seconds ---" % (time.time() - start_time))

  from numpy.core.umath_tests import inner1d


Missing Values :

                      Total       %
WARRANTIES_PRICE      96603  96.603
SHIPPING_PRICE        67610  67.610
BUYER_BIRTHDAY_DATE    5836   5.836
SHIPPING_MODE           315   0.315
PRICECLUB_STATUS         57   0.057
SELLER_SCORE_AVERAGE      6   0.006
SELLER_SCORE_COUNT        6   0.006 


 NaN data filled by 0 


 Searching for the best classifier on 20000 data... 

Bagging: 0.5501 (+/- 0.0120) 

Gradient Boosting: 0.5428 (+/- 0.0091) 

Random Forest: 0.5542 (+/- 0.0096) 

Decision Tree: 0.5512 (+/- 0.0029) 

KNN: 0.5215 (+/- 0.0033) 

SVM: 0.5119 (+/- 0.0002) 


 Searching for best hyperparametres of Random Forest Classifier on 20000 data among : 

{'n_estimators': [10, 100, 300], 'max_depth': [5, 10, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.7min finished



 Finally, best estimator is : Random Forest Classifier 
 Using these parametres : {'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
--- 346.3958389759064 seconds ---


In [4]:
# Demo cell: BestEstimator on data.csv, predicting the 'embauche' column.
%run -i bestestimator.py

# The first CSV column becomes the index; 'embauche' is moved out of Data
# into a one-column target frame named 'Target'.
Data = pd.read_csv('data.csv', sep = ',', index_col = 0)
Target_data = pd.DataFrame()
Target_data['Target'] = Data['embauche']
Data.drop(['embauche'], axis = 1, inplace = True)

CLF = BestEstimator(Data, Target_data)

# target_ID = False: Target_data has no ID column, so only Data's 'index' column is dropped.
gr = CLF.best_clf(type_esti = 'classifier',ID = 'index', 
                  params = False , n = 200000, n_grid = 200000, 
                  view_nan = True, grid = True, target_ID = False)

Missing Values :

            Total      %
note          114  0.570
diplome       110  0.550
dispo         106  0.530
cheveux       103  0.515
sexe          100  0.500
exp            96  0.480
salaire        95  0.475
specialite     93  0.465
age            91  0.455
date           91  0.455 


 NaN data filled by 0 


 Searching for the best classifier on 200000 data... 

Bagging: 0.6529 (+/- 0.0051) 

Gradient Boosting: 0.5661 (+/- 0.0237) 



  if diff:
  if diff:
  if diff:


XGBoost: 0.5484 (+/- 0.0078) 

Random Forest: 0.6444 (+/- 0.0077) 

Decision Tree: 0.6739 (+/- 0.0180) 

KNN: 0.5023 (+/- 0.0026) 

SVM: 0.5000 (+/- 0.0000) 


 Searching for best hyperparametres of Decision Tree Classifier on 200000 data among : 

{'max_depth': [5, 10, 50, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 8 candidates, totalling 24 fits

 Finally, best estimator is : Decision Tree Classifier 
 Using these parametres : {'criterion': 'gini', 'max_depth': None}


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    7.1s finished


In [3]:
# Demo cell: BestEstimator on the ';'-separated Train1.csv / Target1.csv files.
%run -i bestestimator.py

Train_v2 = pd.read_csv('Train1.csv', sep = ';')
Target_v2 =  pd.read_csv('Target1.csv', sep = ';')

CLF = BestEstimator(Train_v2, Target_v2)

# Compare the candidates on 20000 rows, then grid-search the winner on 20000 rows;
# the 'ID' column is dropped from both the features and the target.
gr = CLF.best_clf(type_esti = 'classifier',ID = 'ID', 
                  params = False , n = 20000, n_grid = 20000, 
                  view_nan = True, grid = True, target_ID = True)

  from numpy.core.umath_tests import inner1d


Missing Values :

                            Total         %
1_diffClosing stocks(kmt)     261  2.569151
2_diffClosing stocks(kmt)     257  2.529777
3_diffClosing stocks(kmt)     253  2.490403
4_diffClosing stocks(kmt)     249  2.451029
5_diffClosing stocks(kmt)     245  2.411655
6_diffClosing stocks(kmt)     241  2.372281
7_diffClosing stocks(kmt)     238  2.342750
8_diffClosing stocks(kmt)     235  2.313220
9_diffClosing stocks(kmt)     232  2.283689
10_diffClosing stocks(kmt)    228  2.244315
11_diffClosing stocks(kmt)    224  2.204941
12_diffClosing stocks(kmt)    220  2.165567
1_diffImports(kmt)            151  1.486367
2_diffImports(kmt)            148  1.456836
3_diffImports(kmt)            145  1.427306
4_diffImports(kmt)            142  1.397775
5_diffImports(kmt)            139  1.368245
6_diffImports(kmt)            136  1.338714
7_diffImports(kmt)            133  1.309184
8_diffImports(kmt)            130  1.279654
9_diffImports(kmt)            127  1.250123
10_diffImports

  if diff:
  if diff:
  if diff:


XGBoost: 0.6846 (+/- 0.0151) 

Random Forest: 0.6868 (+/- 0.0195) 

Decision Tree: 0.6199 (+/- 0.0214) 

KNN: 0.5738 (+/- 0.0096) 

SVM: 0.4998 (+/- 0.0006) 


 Searching for best hyperparametres of Random Forest Classifier on 20000 data among : 

{'n_estimators': [10, 100, 300], 'max_depth': [5, 10, None], 'criterion': ['gini', 'entropy']} 

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.3min finished



 Finally, best estimator is : Random Forest Classifier 
 Using these parametres : {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 300}
