In [None]:
# Implement the Modeling class in Home-Credit-Prediction/homecredit/model.py

In [19]:

import os
import sys
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency # need this for chi-squared function
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
path_dir = (os.path.dirname(os.getcwd()))
sys.path.append(path_dir)
    
from homecredit.data import HomeCredit
from homecredit.preparation import Preparation
from homecredit.cleaner import Cleaning
from homecredit.exploration import Exploration

In [3]:
df = Cleaning().remove_entries()
df.shape

(91206, 122)

In [4]:
#df.dtypes

In [5]:
catcols = Preparation().get_catcols() # categorical columns
numcols = Preparation().get_numcols() # numerical columns

In [6]:
#df['a'] = df['a'].astype(float, errors = 'raise')

In [7]:
#data = df.drop_duplicates(subset = df.columns)
#data.shape

In [8]:
df[catcols].fillna('', inplace=True)

# Replace the NaNs in numerical column by the mean of values
# in numerical column respectively
df[numcols] = df[numcols].fillna(value=df[numcols].mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [9]:
df.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 122, dtype: object

In [10]:
df['AMT_ANNUITY'].dtype

dtype('float64')

In [None]:
#catcols[:7]

In [None]:
#catcols[7:]

In [11]:
def encoding_categ_column(df, cols):
    
    for col_name in cols:
    
        L = list(df[col_name].unique())
        if '' in L:
            df[col_name].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"

        ohe = OneHotEncoder(sparse = False) # Instanciate encoder
        ohe.fit(df[[col_name]]) # Fit encoder  ---> OneHotEncoder(sparse=False)

        col_encoded = ohe.transform(df[[col_name]]) # Encode

        dicts_col = {}
        keys = list(ohe.categories_[0])
        values = col_encoded.T.astype(int)

        for i,j in enumerate(keys):
            dicts_col[j] = values[i,:]

        result = pd.DataFrame.from_dict(dicts_col)

        df = df.reset_index(drop=True)

        #Concat df and result dataframes
        data_res = pd.concat([df, result], axis = 1)

        if 'NoValue' in list(data_res.columns):
            data_res = data_res.drop(columns= ['NoValue',col_name] )
            df = data_res
        else:
            data_res = data_res.drop(columns= col_name)
            df = data_res
        
    return df

In [12]:
encoded_df = encoding_categ_column(df = df,
                                   cols = catcols)
encoded_df.shape

(91206, 247)

In [14]:
# create X, y
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


y = encoded_df.TARGET
X = encoded_df.drop('TARGET', axis = 1)

# Feature names
#features = list(X.columns)

# Se crean los datos de entrenamiento y de prueba para el entrenamiento y evaluación del modelo de Machine Learning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# KNN model
neigh = KNeighborsClassifier(n_neighbors=5)

# Model fitting
neigh.fit(X_train, y_train)

#  baseline score
score = neigh.score(X_train, y_train) # Score model
print( "score : ", score )

# Se predicen los datos de prueba
y_pred = neigh.predict(X_test)

# Se calcula la exactitud del modelo
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))



score :  0.9262734164526032
Accuracy: 0.9216431547401506


In [15]:
# Se crean los datos de entrenamiento y de prueba para el entrenamiento y evaluación del modelo de Machine Learning
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Se crea el modelo KNN
neigh = KNeighborsClassifier(n_neighbors=5)

cross_val_score(neigh, X_train, y_train, cv=5).mean() 



0.9204310530614264

In [16]:
scoring = ['roc_auc', 'accuracy']

In [18]:
for s in scoring:
    neigh = KNeighborsClassifier(n_neighbors=5)
    print(s, "---->", cross_val_score(neigh, X_train, y_train, cv=5, scoring=s).mean())


roc_auc ----> 0.5277718867254786
accuracy ----> 0.9204310530614264


In [None]:
#instanciation
model_SVC = SVC( kernel = 'linear', gamma = 'scale', shrinking = False,)
#training
model_SVC.fit( X_train, y_train)
#calcule de précision
print("score: ",  model_SVC.score( X_test, y_test))

# Se predicen los datos de prueba
y_pred = model_SVC.predict(X_test)

# Se calcula la exactitud del modelo
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
#instanciation
model_DT = DecisionTreeClassifier() 

#training
model_DT.fit( X_train, y_train)
#calcule de précision
print("score: ",  model_DT.score( X_test, y_test))

# Se predicen los datos de prueba
y_pred = model_SVC.predict(X_test)

# Se calcula la exactitud del modelo
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:

class Modeling:
    
    def __init__(self):
        # Assign an attribute ".data" to all new instances of Preparation
        #self.data = HomeCredit().get_data()['train'].copy() # good practice to be sure not to modify your `data` variable
        
        # Cleaning
        self.data = Cleaning().remove_entries()
        self.catcols = Preparation().get_catcols()
        
    
    def cramers_val(self, col1, col2, margins=False): # df : dataframe
        chisqt = pd.crosstab(self.data[col1], self.data[col2], margins=margins)
        value = chisqt.to_numpy() 

        #Chi-squared test statistic, sample size, and minimum of rows and columns
        X2 = chi2_contingency(value, correction=False)[0]
        n = np.sum(value)
        minDim = min(value.shape)-1

        #calculate Cramer's V 
        V = np.sqrt((X2/n) / minDim) 

        return V
    
    # Plot of Heatmap of Cramer's V
    def plot_heatmapCramerV(self):
        L = len(self.catcols)
        cramers_outputs = np.zeros((L,L))

        for i,x in enumerate(self.catcols):
            for j,y in enumerate(self.catcols):
                result = round(self.cramers_val(x,y,margins=False),4)
                cramers_outputs[i,j] = result
                
        fig = plt.figure(figsize = (8, 8))  # instanciate figure for heat map
        ax = sns.heatmap(cramers_outputs, annot = True,  cmap = "BuPu", fmt=".0%", cbar = False)
        ax.set_xticklabels(self.catcols)
        ax.set_yticklabels(self.catcols)
        ax.tick_params(axis = 'x', labelrotation = 90)
        ax.tick_params(axis = 'y', labelrotation = 0)
        ax.set_title("Heatmap of Cramer's V on categorical variables");


   

In [None]:
exp = Exploration()

In [None]:
exp.data.head(2)

In [None]:
exp.data.shape

In [None]:
exp.categ_relation(col1="CODE_GENDER", col2="NAME_CONTRACT_TYPE")

In [None]:
exp.cramers_val(col1="CODE_GENDER", col2="NAME_CONTRACT_TYPE")

In [None]:
exp.plot_heatmapCramerV()