In [None]:
# Implement the Modeling class in Home-Credit-Prediction/homecredit/model.py

In [1]:

import os
import sys
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency # need this for chi-squared function
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
path_dir = (os.path.dirname(os.getcwd()))
sys.path.append(path_dir)
    
from homecredit.data import HomeCredit
from homecredit.preparation import Preparation
from homecredit.cleaner import Cleaning
from homecredit.exploration import Exploration

In [4]:
df = Cleaning().remove_entries()
df.shape

(91206, 122)

In [5]:
#df.dtypes

In [6]:
catcols = Preparation().get_catcols() # categorical columns
numcols = Preparation().get_numcols() # numerical columns

In [None]:
#df['a'] = df['a'].astype(float, errors = 'raise')

In [None]:
#data = df.drop_duplicates(subset = df.columns)
#data.shape

In [9]:
df[catcols].fillna('', inplace=True)

# Replace the NaNs in numerical column by the mean of values
# in numerical column respectively
df[numcols] = df[numcols].fillna(value=df[numcols].mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [10]:
df.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 122, dtype: object

In [11]:
df['AMT_ANNUITY'].dtype

dtype('float64')

In [12]:
max(df['OWN_CAR_AGE'])  

44.0

In [13]:
#catcols[:7]

In [14]:
#catcols[7:]

In [15]:
def encoding_categ_column(df, cols):
    
    for col_name in cols:
    
        L = list(df[col_name].unique())
        if '' in L:
            df[col_name].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"

        ohe = OneHotEncoder(sparse = False) # Instanciate encoder
        ohe.fit(df[[col_name]]) # Fit encoder  ---> OneHotEncoder(sparse=False)

        col_encoded = ohe.transform(df[[col_name]]) # Encode

        dicts_col = {}
        keys = list(ohe.categories_[0])
        values = col_encoded.T.astype(int)

        for i,j in enumerate(keys):
            dicts_col[j] = values[i,:]

        result = pd.DataFrame.from_dict(dicts_col)

        df = df.reset_index(drop=True)

        #Concat df and result dataframes
        data_res = pd.concat([df, result], axis = 1)

        if 'NoValue' in list(data_res.columns):
            data_res = data_res.drop(columns= ['NoValue',col_name] )
            df = data_res
        else:
            data_res = data_res.drop(columns= col_name)
            df = data_res
        
    return df

In [16]:
encoded_df = encoding_categ_column(df = df,
                                   cols = catcols)
encoded_df.shape

(91206, 247)

In [17]:
df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
6,100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,41301.0,...,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0


In [18]:
encoded_df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,Mixed,Monolithic,Others,Panel,"Stone, brick",Wooden,NaN,No,Yes,NaN.1
0,100004,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,0,0,0,0,0,0,1,0,0,1
1,100009,0,1,171000.0,1560726.0,41301.0,1395000.0,0.035792,-13778,-3130,...,0,0,0,0,0,0,1,0,0,1


In [20]:
#encoded_df.dtypes.unique()

In [29]:
# create X, y
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


y = encoded_df.TARGET
X = encoded_df.drop('TARGET', axis = 1)
X.shape

# Feature names
features = list(X.columns)

# Se crean los datos de entrenamiento y de prueba para el entrenamiento y evaluación del modelo de Machine Learning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# Se crea el modelo KNN
neigh = KNeighborsClassifier(n_neighbors=5)

# Se entrena el modelo
neigh.fit(X_train, y_train)

# Se predicen los datos de prueba
y_pred = neigh.predict(X_test)

# Se calcula la exactitud del modelo
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))



Accuracy: 0.9188137265650697


In [None]:

class Modeling:
    
    def __init__(self):
        # Assign an attribute ".data" to all new instances of Preparation
        #self.data = HomeCredit().get_data()['train'].copy() # good practice to be sure not to modify your `data` variable
        
        # Cleaning
        self.data = Cleaning().remove_entries()
        self.catcols = Preparation().get_catcols()
        
    
    def cramers_val(self, col1, col2, margins=False): # df : dataframe
        chisqt = pd.crosstab(self.data[col1], self.data[col2], margins=margins)
        value = chisqt.to_numpy() 

        #Chi-squared test statistic, sample size, and minimum of rows and columns
        X2 = chi2_contingency(value, correction=False)[0]
        n = np.sum(value)
        minDim = min(value.shape)-1

        #calculate Cramer's V 
        V = np.sqrt((X2/n) / minDim) 

        return V
    
    # Plot of Heatmap of Cramer's V
    def plot_heatmapCramerV(self):
        L = len(self.catcols)
        cramers_outputs = np.zeros((L,L))

        for i,x in enumerate(self.catcols):
            for j,y in enumerate(self.catcols):
                result = round(self.cramers_val(x,y,margins=False),4)
                cramers_outputs[i,j] = result
                
        fig = plt.figure(figsize = (8, 8))  # instanciate figure for heat map
        ax = sns.heatmap(cramers_outputs, annot = True,  cmap = "BuPu", fmt=".0%", cbar = False)
        ax.set_xticklabels(self.catcols)
        ax.set_yticklabels(self.catcols)
        ax.tick_params(axis = 'x', labelrotation = 90)
        ax.tick_params(axis = 'y', labelrotation = 0)
        ax.set_title("Heatmap of Cramer's V on categorical variables");


   

In [None]:
exp = Exploration()

In [None]:
exp.data.head(2)

In [None]:
exp.data.shape

In [None]:
exp.categ_relation(col1="CODE_GENDER", col2="NAME_CONTRACT_TYPE")

In [None]:
exp.cramers_val(col1="CODE_GENDER", col2="NAME_CONTRACT_TYPE")

In [None]:
exp.plot_heatmapCramerV()