In [None]:
# Implement the Modeling class in Home-Credit-Prediction/homecredit/model.py

In [1]:

import os
import sys
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency # need this for chi-squared function
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
path_dir = (os.path.dirname(os.getcwd()))
sys.path.append(path_dir)
    
from homecredit.data import HomeCredit
from homecredit.preparation import Preparation
from homecredit.cleaner import Cleaning
from homecredit.exploration import Exploration

In [3]:
# Cleaning().remove_missvalues():
#   1. first removes some entries,
#   2. then replaces the NaNs of each numerical column by the mean of that column,
#   3. and replaces the NaNs of categorical columns by '' (empty string).
df = Cleaning().remove_missvalues()
df.shape

(251754, 122)

In [4]:
# Column names split by kind, read from the `prep` helper attached to a Cleaning instance.
catcols = Cleaning().prep.get_catcols() # Preparation().get_catcols() # categorical columns
numcols = Cleaning().prep.get_numcols() # Preparation().get_numcols() # numerical columns

In [5]:
#data = df.drop_duplicates(subset = df.columns)
#data.shape
#There are no duplicates values

# Encoding function

In [7]:
# Raw test file, kept so its shape can be compared with the cleaned version below.
tt = pd.read_csv("../raw_data/application_test.csv")

# Cleaned test set (same cleaning step as the training data, run with data_set='test').
df_test = Cleaning(data_set = 'test').remove_missvalues()

df_test.shape

(39431, 121)

In [8]:
tt.shape

(48744, 121)

In [27]:
if tt is  None:
    print('ok')

In [32]:
# N.B.: we use OneHotEncoder here, but pandas.get_dummies could also be used
# and is simpler to call than OneHotEncoder.

def _replace_with_indicators(frame, encoded, categories, col_name):
    """Return `frame` with `col_name` swapped for its one-hot indicator columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that still contains the raw categorical column `col_name`.
    encoded : 2-D array
        One-hot matrix for `col_name`, one column per entry of `categories`.
    categories : list
        Category values learnt by the encoder; they become the column names.
    col_name : str
        Name of the categorical column being replaced.

    Returns
    -------
    pandas.DataFrame
        New frame (index reset to 0..n-1) without `col_name` and without the
        'NoValue' indicator, if there is one.
    """
    indicators = pd.DataFrame(encoded.astype(int), columns=categories)

    # Both frames need the same 0..n-1 index so concat aligns rows positionally.
    combined = pd.concat([frame.reset_index(drop=True), indicators], axis = 1)

    # 'NoValue' only marks a missing category; its indicator is not kept.
    to_drop = [col_name]
    if 'NoValue' in list(combined.columns):
        to_drop = ['NoValue', col_name]

    return combined.drop(columns=to_drop)


def encoding_categ_column(df, cols, new_data = None):
    """One-hot encode the categorical columns `cols` of `df` (and of `new_data`).

    For every column, an encoder is fitted on `df` only; `new_data`, when
    given, is transformed with that same fitted encoder. Empty strings (the
    placeholder used for missing categorical values) are renamed "NoValue"
    before encoding and the matching indicator column is dropped afterwards.
    Indicator columns are named after the raw category values, without any
    feature-name prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing every column listed in `cols`.
    cols : iterable of str
        Names of the categorical columns to encode.
    new_data : pandas.DataFrame, optional
        Data to predict on, encoded with the encoders fitted on `df`.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The encoded `df`, or `(encoded_df, encoded_new_data)` when `new_data`
        is provided. The input frames are left untouched.
    """
    has_new_data = new_data is not None

    # Work on copies so the caller's frames are never modified in place.
    df = df.copy()
    if has_new_data:
        new_data = new_data.copy()

    for col_name in cols:

        # Rename the missing-value placeholder in each frame that contains it.
        if '' in list(df[col_name].unique()):
            df[col_name] = df[col_name].replace("", "NoValue")
        if has_new_data and '' in list(new_data[col_name].unique()):
            new_data[col_name] = new_data[col_name].replace("", "NoValue")

        # NOTE(review): `sparse` was renamed `sparse_output` in newer
        # scikit-learn releases -- confirm against the installed version.
        ohe = OneHotEncoder(sparse = False)
        col_encoded = ohe.fit_transform(df[[col_name]])
        keys = list(ohe.categories_[0])

        if has_new_data:
            # Transform only: the data to predict must never influence the fit.
            new_encoded = ohe.transform(new_data[[col_name]])
            new_data = _replace_with_indicators(new_data, new_encoded, keys, col_name)

        df = _replace_with_indicators(df, col_encoded, keys, col_name)

    return df if new_data is None else (df, new_data)

In [34]:
encoded_df, encoded_newdata = encoding_categ_column(df = df,
                                   cols = catcols, new_data = df_test )

In [35]:
encoded_df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,1,0,0,0,0,0,0,1,0


In [36]:
encoded_newdata.head(2)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,-5170.0,...,0,0,0,0,0,0,1,0,1,0
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,...,0,0,0,0,0,0,0,0,0,0


# KNeighborsClassifier() model

In [None]:
# create X, y
y = encoded_df.TARGET
X = encoded_df.drop('TARGET', axis = 1)

# Feature names
#features = list(X.columns)

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# KNN model
neigh = KNeighborsClassifier(n_neighbors=5)

# Train the model on the Training data
neigh.fit(X_train, y_train)

# Score the model on the Testing data
score = neigh.score(X_test, y_test) 
print( "score : ", score )

In [None]:

# KNN classifier with 5 neighbours
neigh = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training split
neigh.fit(X_train, y_train)

# 5-fold cross-validation on the training split; the mean fold score is reported
score = cross_val_score(neigh, X_train, y_train, cv=5).mean() 
print("score : ", score)

# Predict on the held-out test split
y_pred = neigh.predict(X_test)

# Accuracy on the held-out test split
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


# Classifier Models 

In [None]:
# A single metric name (a str, not a list) is used to keep the simulations fast;
# the alternative would be ['roc_auc', 'accuracy'].
scoring = 'accuracy'#['roc_auc', 'accuracy'] we only use one scoring to make faster simulations
models = []   # (short name, estimator) pairs to compare
results = []  # mean cross-validation scores, appended in evaluation order

# Classifiers to compare (the commented-out ones are disabled)
models.append(('LR', LogisticRegression()))
#models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('DTC', DecisionTreeClassifier()))
#models.append(('RF', RandomForestClassifier()))
#models.append(('SVC', SVC()))

In [None]:
# `scoring` may be a single metric name or a collection of names. Iterating a
# bare string would yield its characters ('a', 'c', ...), none of which is a
# valid scorer, so a lone name is wrapped in a list first.
scoring_names = [scoring] if isinstance(scoring, str) else list(scoring)

for name, model in models:
    # The fit does not depend on the metric, so it is done once per model
    # (cross_val_score itself works on clones of the estimator).
    model.fit(X_train, y_train)
    for s in scoring_names:
        # Mean score over the 5 cross-validation folds of the training split
        res = cross_val_score(model, X_train, y_train, cv=5, scoring=s).mean()
        results.append(res)
        print("Model: ", name, " scoring:", s, " score", res)

# Prediction

 **Making predictions with the LR model, which gives the best score**

In [None]:
# Logistic regression with default settings, fitted on the unscaled training split
model = LogisticRegression()
scoring = 'accuracy'  # metric name; not used within this cell

model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split
y_pred = model.predict(X_test)

# Accuracy on the held-out test split
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))   

# Modeling with scaled data

In [None]:
# Scaling data

# create X, y
y = encoded_df.TARGET
X = encoded_df.drop('TARGET', axis = 1)

# Split BEFORE scaling: the scaler must only learn min/max from the training
# rows, otherwise information from the test rows leaks into the features.
# The unscaled splits are refreshed too, so X_train/X_test always match
# y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Scaling features
scaler = MinMaxScaler() # Instantiate MinMaxScaler
scaler.fit(X_train)     # fit on the training split only

X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Whole feature matrix scaled with the train-fitted scaler
# (kept because earlier versions of this cell exposed `X_rescaled`).
X_rescaled = scaler.transform(X)


In [None]:
scoring = 'accuracy'# ['roc_auc', 'accuracy']
models = []   # (short name, estimator) pairs to compare
results = []  # mean cross-validation scores, appended in evaluation order

# Classifiers : we can integrate in models all classifiers we want
# here, we only use two classifiers to test if the code works well

models.append(('LR', LogisticRegression(max_iter=1000)))
#models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('DTC', DecisionTreeClassifier()))

# `scoring` may be a single metric name or a collection of names. Iterating a
# bare string would yield its characters, so a lone name is wrapped in a list.
scoring_names = [scoring] if isinstance(scoring, str) else list(scoring)

for name, model in models:
    # The fit does not depend on the metric, so it is done once per model.
    model.fit(X_train_sc, y_train)
    for s in scoring_names:
        # Mean score over the 5 cross-validation folds of the scaled training split
        res = cross_val_score(model, X_train_sc, y_train, cv=5, scoring=s).mean()
        results.append(res)
        print("Model: ", name, " scoring:", s, " score", res)      

**LogisticRegression model gives the best score**

**Let's make predictions on test data**

In [None]:
model = LogisticRegression(max_iter=1000)
scoring = 'accuracy'

model.fit(X_train_sc, y_train)

# Use the metric defined in this cell rather than a loop variable left over
# from another cell, so the cell also works on a fresh kernel.
res = cross_val_score(model, X_train_sc, y_train, cv=5, scoring=scoring).mean()
print("score: ", res)  

# Predict on the held-out (scaled) test split
y_pred = model.predict(X_test_sc)

# Accuracy on the held-out test split
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))   

# Encoder() class

In [None]:
class Encoder:
    
    """First draft of the one-hot encoder for the cleaned data.

    NOTE(review): a second `class Encoder` is defined further down in this
    notebook; once that cell runs it replaces this definition.
    """
    def __init__(self, data_set = 'train'):
        """Clean the training data and the data to predict, and keep both on the instance."""

        # Make the parent of the working directory importable
        path_dir = (os.path.dirname(os.getcwd()))
        sys.path.append(path_dir)

        # Preparation helper, used by execute() to list the categorical columns
        self.prep = Preparation()
        
        # Cleaning helper for the main data set
        self.cl = Cleaning()
        
        # The requested data set is assigned after Cleaning() has been constructed
        self.cl.prep.data_set = data_set
                
        # Cleaned data that execute() encodes
        self.data = self.cl.remove_missvalues() 
        
        # Cleaned version of the new data to predict
        self.tt = Cleaning('test')
        self.new_data = self.tt.remove_missvalues() 
                 
        
    def execute(self, new_data=False): # new_data: flag meant to request encoding of the data to predict as well
        """One-hot encode every categorical column of `self.data` and return the result.

        Each categorical column is replaced by one integer indicator column
        per category, named after the category value. Empty strings are first
        renamed "NoValue", and the "NoValue" indicator is dropped.
        """
        
        # No copy is taken: the first in-place assignment below writes into self.data
        df = self.data#.copy()
        
        catcols = self.prep.get_catcols()
        
        for col_name in catcols:
                
            L = list(df[col_name].unique())
            if '' in L:
                df[col_name] = df[col_name].replace("", "NoValue") # Rename the missing-value placeholder '' to "NoValue"

            ohe = OneHotEncoder(sparse = False) # Instantiate the encoder
            ohe.fit(df[[col_name]]) # Fit the encoder on this column

            col_encoded = ohe.transform(df[[col_name]]) # Encode
            
            # NOTE(review): `new_data` is the boolean flag here, not a DataFrame, so the
            # indexing below would fail when the flag is True; `new_encoded` is also
            # never used afterwards. Probably meant to use self.new_data -- confirm.
            if new_data:
                new_encoded = ohe.transform(new_data[[col_name]]) 
                

            # Map each category name to its 0/1 indicator vector
            dicts_col = {}
            keys = list(ohe.categories_[0])
            values = col_encoded.T.astype(int)

            for i,j in enumerate(keys):
                dicts_col[j] = values[i,:]

            result = pd.DataFrame.from_dict(dicts_col)

            # Concatenate the current frame (index reset so rows align) with the indicators
            data_res = pd.concat([df.reset_index(drop=True), result], axis = 1)

            # Drop the raw column, plus the "NoValue" indicator when there is one
            if 'NoValue' in list(data_res.columns):
                data_res = data_res.drop(columns= ['NoValue',col_name] )
                df = data_res.copy(deep=True)
            else:
                data_res = data_res.drop(columns= col_name)
                df = data_res.copy(deep=True)
        
        return df

In [68]:
class Encoder:
    """One-hot encode the categorical columns of the cleaned data.

    On construction the training data and the data to predict are both
    cleaned and stored on the instance (`data` and `new_data`).
    `execute()` returns encoded copies and leaves those attributes unchanged,
    so it can be called repeatedly.
    """

    def __init__(self, data_set = 'train'):
        """Clean both data sets and keep them on the instance.

        Parameters
        ----------
        data_set : str
            Value assigned to `self.cl.prep.data_set` after the Cleaning
            helper has been constructed.
        """
        # Make the parent of the working directory importable (added only once)
        path_dir = (os.path.dirname(os.getcwd()))
        if path_dir not in sys.path:
            sys.path.append(path_dir)

        # Preparation helper, used by execute() to list the categorical columns
        self.prep = Preparation()

        # Cleaning helper for the main data set
        self.cl = Cleaning()
        self.cl.prep.data_set = data_set

        # Cleaned data the encoders are fitted on
        self.data = self.cl.remove_missvalues()

        # Cleaned version of the new data to predict
        self.tt = Cleaning('test')
        self.new_data = self.tt.remove_missvalues()

    @staticmethod
    def _replace_with_indicators(frame, encoded, categories, col_name):
        """Return `frame` with `col_name` swapped for its one-hot indicator columns."""
        indicators = pd.DataFrame(encoded.astype(int), columns=categories)

        # Both frames need the same 0..n-1 index so concat aligns rows positionally.
        combined = pd.concat([frame.reset_index(drop=True), indicators], axis = 1)

        # 'NoValue' only marks a missing category; its indicator is not kept.
        to_drop = [col_name]
        if 'NoValue' in list(combined.columns):
            to_drop = ['NoValue', col_name]

        return combined.drop(columns=to_drop)

    def execute(self, data_topredict=False):
        """Encode the categorical columns and return the encoded frame(s).

        For every categorical column an encoder is fitted on the training
        data only. When `data_topredict` is true, the data to predict is
        transformed with that same fitted encoder (no refit). Empty strings
        are renamed "NoValue" before encoding and the matching indicator
        column is dropped. Indicator columns are named after the raw category
        values, without a feature-name prefix.

        Parameters
        ----------
        data_topredict : bool
            Also encode `self.new_data` when true.

        Returns
        -------
        pandas.DataFrame or tuple of pandas.DataFrame
            The encoded training frame, or `(encoded_train, encoded_new_data)`
            when `data_topredict` is true.
        """
        # Copies keep self.data / self.new_data exactly as they were cleaned.
        df = self.data.copy()
        df_new = self.new_data.copy() if data_topredict else None

        catcols = self.prep.get_catcols()
        for col_name in catcols:

            # Rename the missing-value placeholder '' to "NoValue"
            if '' in list(df[col_name].unique()):
                df[col_name] = df[col_name].replace("", "NoValue")

            # NOTE(review): `sparse` was renamed `sparse_output` in newer
            # scikit-learn releases -- confirm against the installed version.
            ohe = OneHotEncoder(sparse = False)
            col_encoded = ohe.fit_transform(df[[col_name]])
            keys = list(ohe.categories_[0])

            df = self._replace_with_indicators(df, col_encoded, keys, col_name)

            if data_topredict:
                if '' in list(df_new[col_name].unique()):
                    df_new[col_name] = df_new[col_name].replace("", "NoValue")

                # Transform only: the data to predict must never influence the fit.
                new_encoded = ohe.transform(df_new[[col_name]])
                df_new = self._replace_with_indicators(df_new, new_encoded, keys, col_name)

        return (df, df_new) if data_topredict else df



In [69]:
en = Encoder()

In [70]:
en.data.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
encoded_df = en.execute()
encoded_df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,1,0,0,0,0,0,0,1,0


In [73]:
(encoded_df_train, encoded_df_pred) = en.execute(data_topredict=True)
encoded_df_train.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,1,0,0,0,0,0,0,1,0


In [75]:
encoded_df_pred.head(2)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,-5170.0,...,0,0,0,0,0,0,1,0,1,0
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,...,0,0,0,0,0,0,0,0,0,0


# Modeling() class

In [88]:
import time

In [81]:
class Modeling:
    """Train, cross-validate and apply classifiers on the encoded data.

    The encoded training frame is built once at construction time
    (`self.data`). Every public method draws a fresh random train/test split
    through `preprocess()`.
    """

    def __init__(self, data_set = 'train'):
        """Build the helpers and the encoded training frame.

        Parameters
        ----------
        data_set : str
            Value assigned to `self.cl.prep.data_set`.
            NOTE(review): the Encoder below is created with its default
            arguments, so this value does not reach it -- confirm intended.
        """
        # Make the parent of the working directory importable (added only once)
        path_dir = (os.path.dirname(os.getcwd()))
        if path_dir not in sys.path:
            sys.path.append(path_dir)

        # Preparation helper
        self.prep = Preparation()

        # Cleaning helper
        self.cl = Cleaning()
        self.cl.prep.data_set = data_set

        # Encoder and the encoded training frame used by every method
        self.en = Encoder()
        self.data = self.en.execute()

    def preprocess(self, VarTarg = 'TARGET', scaler = MinMaxScaler(), data_topredict=False):
        """Split the encoded data 70/30 and scale the features.

        The scaler is fitted on the training split only and then applied to
        the test split (and to the data to predict, when requested).

        Parameters
        ----------
        VarTarg : str
            Name of the target column in `self.data`.
        scaler : object with fit/transform
            Feature scaler; it is refitted on every call.
        data_topredict : bool
            Also encode and scale the data to predict.

        Returns
        -------
        tuple
            `(X_train_sc, X_test_sc, y_train, y_test)`, or
            `((X_train_sc, X_test_sc, y_train, y_test), new_data_sc)` when
            `data_topredict` is true.
        """
        # create X, y
        y = self.data[VarTarg]
        X = self.data.drop(VarTarg, axis = 1)

        # Split into Train/Test (no random_state: a new split on every call)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        # Fit the scaler on the training rows only, then apply it everywhere
        scaler.fit(X_train)
        X_train_sc = scaler.transform(X_train)
        X_test_sc = scaler.transform(X_test)

        res = (X_train_sc, X_test_sc, y_train, y_test)
        if not data_topredict:
            return res

        # Encoded data to predict, scaled with the scaler fitted above
        encoded_df_pred = self.en.execute(data_topredict=True)[1]
        encoded_df_pred_sc = scaler.transform(encoded_df_pred)
        return (res, encoded_df_pred_sc)

    def execute(self, models=[], scoring = ['accuracy']):
        """Fit and cross-validate every model for every scoring metric.

        Parameters
        ----------
        models : list of (str, estimator)
            Pairs of a short name and a scikit-learn style estimator.
        scoring : str or list of str
            Metric name(s) passed to `cross_val_score`. A single string is
            accepted and treated as a one-element list.

        Returns
        -------
        list of dict
            One record per (model, metric) holding the name, the fitted
            model, the metric, the mean 5-fold training score and the run
            time in minutes.
        """
        X_train_sc, X_test_sc, y_train, y_test = self.preprocess()

        # Iterating a bare string would yield its characters, so wrap it.
        scoring_names = [scoring] if isinstance(scoring, str) else list(scoring)

        results = []

        # No file is opened here: the previous open("models.pckl", "wb") wrote
        # nothing and only truncated an already saved file. Persisting the
        # results is left to the caller.
        for name, model in models:
            for s in scoring_names:
                # start timer
                start = time.time()

                model.fit(X_train_sc, y_train)
                train_res = cross_val_score(model, X_train_sc, y_train, cv=5, scoring=s).mean()

                # stop timing (minutes)
                time_run = (time.time() - start)/60

                results.append({"Name ": name, "Model " : model, 
                                " scoring": s, " train score" : train_res,
                                 "time_run (mins)": time_run} )
                print("Model: ", name, " scoring:", s, " train score", train_res, "time_run (mins)", time_run) 

        return results

    def predict_test_score(self, best_model=LogisticRegression(max_iter=1000), best_scoring = 'accuracy'):
        """Fit `best_model` on a fresh training split and score it on the test split.

        Parameters
        ----------
        best_model : estimator
            Model to fit and evaluate.
        best_scoring : str
            'roc_auc' returns the ROC-AUC computed from the predicted labels
            (the accuracy is printed as well); any other value returns the
            accuracy.

        Returns
        -------
        float
            The requested test score.
        """
        X_train_sc, X_test_sc, y_train, y_test = self.preprocess()

        best_model.fit(X_train_sc, y_train)

        # Predict on the held-out test split
        y_pred = best_model.predict(X_test_sc)

        accuracy = metrics.accuracy_score(y_test, y_pred)

        if best_scoring == 'roc_auc':
            print("Accuracy:", accuracy)
            # Return here so the ROC-AUC is not overwritten by the accuracy.
            return metrics.roc_auc_score(y_test, y_pred)

        return accuracy

    def predict_newdata(self, best_model):
        """Fit `best_model` on a training split and predict the new data.

        Parameters
        ----------
        best_model : estimator
            Model to fit and use for the predictions.

        Returns
        -------
        dict
            `{"Predictions": y_pred}` for the encoded, scaled data to predict.
        """
        # A single preprocess() call: the training split and the new data must
        # be scaled by the same fitted scaler (two calls would draw two
        # different random splits and refit the scaler in between).
        splits, encoded_df_pred_sc = self.preprocess(data_topredict=True)
        X_train_sc, X_test_sc, y_train, y_test = splits

        best_model.fit(X_train_sc, y_train)

        # Predict on the new data
        y_pred = best_model.predict(encoded_df_pred_sc)

        return {"Predictions" : y_pred}

In [82]:
ml = Modeling()

In [83]:
ml.data.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,terraced house,Block,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,No,Yes
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,1,0,0,0,0,0,0,1,0


In [84]:
ml.data.shape

(251754, 242)

In [85]:
ml.data.isnull().sum().sum()

0

In [86]:
models = []
models.append(('LR', LogisticRegression(max_iter=1000)))
#models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('DTC', DecisionTreeClassifier()))

In [89]:
dic = ml.execute(models)

Model:  LR  scoring: accuracy  train score 0.9129645265624557 time_run (mins) 2.5659686009089153
Model:  DTC  scoring: accuracy  train score 0.8432646470853538 time_run (mins) 2.124692932764689


In [90]:
dic

[{'Name ': 'LR',
  'Model ': LogisticRegression(max_iter=1000),
  ' scoring': 'accuracy',
  ' train score': 0.9129645265624557,
  'time_run (mins)': 2.5659686009089153},
 {'Name ': 'DTC',
  'Model ': DecisionTreeClassifier(),
  ' scoring': 'accuracy',
  ' train score': 0.8432646470853538,
  'time_run (mins)': 2.124692932764689}]

In [91]:
ml.predict_test_score(LogisticRegression(max_iter=1000))

0.9139777827796682