In [None]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import scipy.stats as stats
from sklearn import preprocessing
from scipy.stats import chi2_contingency
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import  sklearn.svm as svm
import pickle

# 1. Load Dataset 

In [None]:
os.listdir('dataset')

In [None]:
org_dataset = pd.read_csv('dataset/current_app.csv', na_values= ["N/a", "na", "XNA", np.nan])
org_dataset.head(10)

In [None]:
org_dataset.shape

Notice: the data set has about 307k rows and 122 features (regressors)

# 2. Cleaning Dataset

## 2.1 visulalize and analyze Nan Values in dataset

In [None]:
to_clean_dataset = org_dataset.copy()
to_clean_dataset.isnull().any().any()

In [None]:
msno.matrix(to_clean_dataset,figsize=(20,10),color=(0, 0 ,1))

Notice: the dataset has many rows with a lot of Nan values 
recommended 1: to drop columns with nan values of percentage > 50% 

In [None]:
to_clean_dataset.dropna(thresh= to_clean_dataset.shape[0]* 0.5, axis = 1, inplace=True )
to_clean_dataset.shape

Notice: the number of columns decreased from 122 to 81 <br>
when dropped columns with nan values of percentage > 50% <br> 
recommended 2: fill columns with Nan values < 13% with mean or mode depending on column type 

In [None]:
dirty_cols =to_clean_dataset.columns[to_clean_dataset.isnull().sum() >0.13 * to_clean_dataset.shape[0]]
dirty_cols

In [None]:
to_clean_dataset[dirty_cols].head()

Notice: We will replace with mode values for categorical columns and mean for continous columns<br> 
but for this we will first check if there are any outliers so that the mean values would be correct

In [None]:
to_clean_dataset.head(10)

In [None]:
dirty_cols2 = dirty_cols.delete([0,1,10])
_ = to_clean_dataset[dirty_cols2].hist(figsize=(20,20) )

Notice: All columns seems to have valid values (ignoring Nan)<br> 
Now we can fill the columns with mean and mode

In [None]:
to_clean_dataset['OCCUPATION_TYPE'].fillna(value= to_clean_dataset['OCCUPATION_TYPE'].mode()[0] ,inplace = True )
to_clean_dataset['ORGANIZATION_TYPE'].fillna(value= to_clean_dataset['ORGANIZATION_TYPE'].mode()[0] ,inplace = True )
to_clean_dataset['EMERGENCYSTATE_MODE'].fillna(value= to_clean_dataset['EMERGENCYSTATE_MODE'].mode()[0] ,inplace = True )
to_clean_dataset['CODE_GENDER'].fillna(value= to_clean_dataset['CODE_GENDER'].mode()[0] ,inplace = True )

to_clean_dataset['EXT_SOURCE_3'].fillna(value= to_clean_dataset['EXT_SOURCE_3'].mean() ,inplace = True )
to_clean_dataset['YEARS_BEGINEXPLUATATION_AVG'].fillna(value= to_clean_dataset['YEARS_BEGINEXPLUATATION_AVG'].mean() ,inplace = True )
to_clean_dataset['FLOORSMAX_AVG'].fillna(value= to_clean_dataset['FLOORSMAX_AVG'].mean() ,inplace = True )
to_clean_dataset['YEARS_BEGINEXPLUATATION_MODE'].fillna(value= to_clean_dataset['YEARS_BEGINEXPLUATATION_MODE'].mean() ,inplace = True )
to_clean_dataset['FLOORSMAX_MODE'].fillna(value= to_clean_dataset['FLOORSMAX_MODE'].mean() ,inplace = True )
to_clean_dataset['YEARS_BEGINEXPLUATATION_MEDI'].fillna(value= to_clean_dataset['YEARS_BEGINEXPLUATATION_MEDI'].mean() ,inplace = True )
to_clean_dataset['TOTALAREA_MODE'].fillna(value= to_clean_dataset['TOTALAREA_MODE'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_HOUR'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_DAY'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_WEEK'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_MON'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_MON'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_QRT'].mean() ,inplace = True )
to_clean_dataset['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(value= to_clean_dataset['AMT_REQ_CREDIT_BUREAU_YEAR'].mean() ,inplace = True )

Notice: For columns with Nan values > 13% and <50% we will drop rows containing Nan values

In [None]:
to_clean_dataset.head()

In [None]:
to_clean_dataset.dropna(inplace=True)
to_clean_dataset.isnull().any().any()

Notice: Now our dataset has no Nan values

In [None]:
# remove SK_ID_CURR Columns from dataset as it won't be useful for our problem 
to_clean_dataset = to_clean_dataset.drop(labels=['SK_ID_CURR'], axis = 1)

In [None]:
to_clean_dataset.info()

## 2.2 Check invalid values in dataset

In [None]:
to_clean_dataset.describe()

In [None]:
_ = to_clean_dataset[to_clean_dataset.columns[:len(to_clean_dataset.columns)//2]].hist(figsize=(20,20))

In [None]:
to_clean_dataset_ = to_clean_dataset[to_clean_dataset.columns[len(to_clean_dataset.columns)//2:]].hist(figsize=(20,20))

Notice: Some variables have negative values where they should be positive such as; <br> 
DAYS_BIRTH, DAYES_REGISTRATION, DAYES_ID_PUBLISH, DAYES_EMPLOYED, DAYES_LAST_PHONE_CHANGE <br> 
assuming the negative sign is added falsely we will convert values to positive

In [None]:
to_clean_dataset['DAYS_BIRTH'] = abs(to_clean_dataset['DAYS_BIRTH'] )
to_clean_dataset['DAYS_REGISTRATION'] = abs(to_clean_dataset['DAYS_REGISTRATION'] )
to_clean_dataset['DAYS_ID_PUBLISH'] = abs(to_clean_dataset['DAYS_ID_PUBLISH'] )
to_clean_dataset['DAYS_EMPLOYED'] = abs(to_clean_dataset['DAYS_EMPLOYED'] )
to_clean_dataset['DAYS_LAST_PHONE_CHANGE'] = abs(to_clean_dataset['DAYS_LAST_PHONE_CHANGE'] )

In [None]:
to_clean_dataset.info()

In [None]:
binary_features = [col for col in to_clean_dataset.columns if to_clean_dataset[col].unique().size <= 2]
binary_features

In [None]:
clean_dataset = to_clean_dataset.copy()

Notice: Considering the very large amount of features<br>  
We will apply bivariate analysis first to remove redundant features<br> 
then We will analyize the usefull features

# 3. Bivariate Analysis
## 3.1 Continuous Vs Continuous

In [None]:
clean_dataset.dtypes.unique()

In [None]:
cont_cols = clean_dataset.columns[(clean_dataset.dtypes == 'int64')|(clean_dataset.dtypes == 'float64')]
cont_cols = [col for col in cont_cols if col not in binary_features]

In [None]:
def get_correlated(cor, thr):
    corr_idx = []
    for i in range(cor.shape[0]): 
        for j in range(i, cor.shape[1]): 
            if i != j and abs(cor[i,j]) >= thr:
                corr_idx.append((i,j))
    
    return corr_idx


In [None]:
cor = clean_dataset[cont_cols].corr()
plt.figure(figsize=(50,50))
_ = sns.heatmap(data = cor ,cmap=plt.cm.CMRmap_r, annot= True)

Notice: From the heat map above we can see that some features are correlated.<br> 
So, we will remove features to get independent features. our correlation threshold is 0.85

In [None]:
corr_idx = get_correlated(cor.to_numpy(), 0.85)
indicies =np.unique([idx for idx,_ in corr_idx])
for idx in indicies:
    clean_dataset.drop(cont_cols[idx], axis = 1, inplace=True)
clean_dataset.shape

Notice: the features decreased from 80 to 71 features 

## 3.2 Continous Vs Output (Categorical)

In [None]:
cont_cols = clean_dataset.columns[(clean_dataset.dtypes == 'int64')|(clean_dataset.dtypes == 'float64')]
cont_cols = [col for col in cont_cols if col not in binary_features]

cat_cols = clean_dataset.columns[(clean_dataset.dtypes != 'int64')&(clean_dataset.dtypes != 'float64')].values
cat_cols = np.append(cat_cols, binary_features)

In [None]:
def cont_cat_boxplot(cat_col, cont_col):
    plt.figure()
    sns.boxplot(data = clean_dataset, x= cat_col, y = cont_col, order=clean_dataset[cat_col].value_counts().index)
    plt.xticks(rotation = 90)
    plt.show()
    plt.close()
def cont_vs_out(out, cont_col):
    plt.figure()
    sns.boxplot(data = clean_dataset, x= out, y = cont_col, order=out.value_counts().index)
    plt.xticks(rotation = 90)
    plt.show()
    plt.close()

Notice: We will remove features that are not useful for prediction<br> 
From the box plot, if Y axis has same distribution for all categories of categorical<br> 
feature on X axis, then there is no relation between Y axis and X axis, hence, we will remove this feature
because it is a weak regressor. 


In [None]:
Y = clean_dataset['TARGET']
for j in range(len(cont_cols)):
    print(cont_cols[j])
    cont_vs_out(Y, cont_cols[j])

Notice: After drawing box plot of continuous features vs output TARGET<br> 
We notice that the distribution for many features is almost the same for both output values<br> 
Which indicates that the output doesn't depend on those features. From the above plot we decided to remove the following features as they won't be useful: ['REGION_POPULATION_RELATIVE', 'DAYS_BIRTH','DAYS_REGISTRATION',
                    'DAYS_ID_PUBLISH','REGION_RATING_CLIENT_W_CITY','HOUR_APPR_PROCESS_START'
                    'LIVE_REGION_NOT_WORK_REGION','REG_CITY_NOT_LIVE_CITY','LIVE_CITY_NOT_WORK_CITY',
                    'YEARS_BEGINEXPLUATATION_MEDI','FLOORSMAX_MEDI','TOTALAREA_MODE','DAYS_LAST_PHONE_CHANGE',
                    'AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
                    'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_YEAR']

In [None]:
# added 'DAYS_BIRTH', 'DAYS_ID_PUBLISH' , 'HOUR_APPR_PROCESS_START', 'FLOORSMAX_MEDI'
useless_features = ['AMT_ANNUITY','AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE','DAYS_REGISTRATION',
                    'REGION_RATING_CLIENT_W_CITY',
                    'YEARS_BEGINEXPLUATATION_MEDI','TOTALAREA_MODE','DAYS_LAST_PHONE_CHANGE',
                    'AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
                    'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_YEAR']
len(useless_features)

Notice: Columns; OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE, AMT_REQ_CREDIT_BUREAU_QRT, AMT_INCOME_TOTAL, DAYS_EMPLOYED have outlier we will remove the ouliers and see distribution again

In [None]:
clean_dataset.drop((clean_dataset[clean_dataset.OBS_60_CNT_SOCIAL_CIRCLE > 40].index), axis = 0, inplace= True)
clean_dataset.drop((clean_dataset[clean_dataset.DEF_60_CNT_SOCIAL_CIRCLE > 20].index), axis = 0, inplace= True)
clean_dataset.drop((clean_dataset[clean_dataset.AMT_REQ_CREDIT_BUREAU_QRT > 200].index), axis = 0, inplace= True)
clean_dataset.drop((clean_dataset[clean_dataset.AMT_INCOME_TOTAL > 0.4e6].index), axis = 0, inplace= True)
clean_dataset.drop((clean_dataset[clean_dataset.DAYS_EMPLOYED > 300000].index), axis = 0, inplace= True)

In [None]:
cont_vs_out(Y, clean_dataset.OBS_60_CNT_SOCIAL_CIRCLE)
cont_vs_out(Y, clean_dataset.DEF_60_CNT_SOCIAL_CIRCLE)
cont_vs_out(Y, clean_dataset.AMT_REQ_CREDIT_BUREAU_QRT)
cont_vs_out(Y, clean_dataset.AMT_INCOME_TOTAL)
cont_vs_out(Y, clean_dataset.DAYS_EMPLOYED)

Notice: They also are independent of output so we will remove them too except DAYS_EMPLOYES, AMT_INCOME_TOTAL there is sort of dependence

In [None]:
useless_features.extend(['OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'AMT_REQ_CREDIT_BUREAU_QRT'])

In [None]:
clean_dataset.drop(useless_features, axis = 1, inplace= True)

In [None]:
clean_dataset.shape

Notice Now we have 51 regressors

## 3.3 Binary Categorical Vs Output (binary categorical) 
### For this problem we will use pearson r method to calculate correlation between 
### binary categorical and output

In [None]:
#convert binary categorical to numerical values
clean_dataset['FLAG_OWN_REALTY'] = pd.get_dummies(clean_dataset['FLAG_OWN_REALTY'], drop_first=True, prefix = 'FLAG_OWN_REALTY')
clean_dataset['FLAG_OWN_CAR'] = pd.get_dummies(clean_dataset['FLAG_OWN_CAR'], drop_first=True, prefix = 'FLAG_OWN_CAR')
clean_dataset['CODE_GENDER'] = pd.get_dummies(clean_dataset['CODE_GENDER'], drop_first=True, prefix = 'CODE_GENDER')
clean_dataset['NAME_CONTRACT_TYPE'] = pd.get_dummies(clean_dataset['NAME_CONTRACT_TYPE'], drop_first=True, prefix = 'NAME_CONTRACT_TYPE')
clean_dataset['EMERGENCYSTATE_MODE'] = pd.get_dummies(clean_dataset['EMERGENCYSTATE_MODE'], drop_first=True, prefix = 'EMERGENCYSTATE_MODE')

In [None]:
for i in range(len(binary_features)):
    r = stats.pearsonr(clean_dataset[binary_features[i]], clean_dataset['TARGET'])
    print(f'corr of {binary_features[i]} vs TARGET = {r}')

Notice: All binary features except CODE_GENDER, NAME_CONTRACT_TYPE, FLAG_OWN_CAR,FLAG_PHONE, 
REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_3 doesn't have any correlation with output, so we will drop all of them

In [None]:
binary_to_keep = ['CODE_GENDER', 'NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR','FLAG_PHONE', 
'REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY','FLAG_DOCUMENT_3','TARGET']
features = [f for f in binary_features if f not in binary_to_keep]
clean_dataset.drop(features, axis = 1, inplace=True)
clean_dataset.shape

In [None]:
clean_dataset.head()

In [None]:
clean_dataset.info()

## 3.4 Multiple Categorical Vs Continuous


In [None]:
cat_cols = clean_dataset.columns[(clean_dataset.dtypes == 'object')]
cont_cols = clean_dataset.columns[(clean_dataset.dtypes == 'int64')|(clean_dataset.dtypes == 'float64')]
cont_cols = [col for col in cont_cols if col not in binary_features]

In [None]:
for j in range(len(cont_cols)):
    for i in range(len(cat_cols)):
        print(f'{cont_cols[j]} vs {cat_cols[i]}')
        cont_cat_boxplot(cat_cols[i], cont_cols[j])

Notice: For this analysis we will remove the redundant feature that is correlated to another existing one <br> 
because we need all features to be indepenedent <br> 
We can notice that AMT_INCOME_TOTAL vs NAME_INCOME_TYPE, AMT_INCOME_TOTAL vs NAME_EDUCATION_TYPE ,
 CNT_FAM_MEMBERS vs NAME_FAMILY_STATUS are dependent

In [None]:
# added NAME_EDUCATION_TYPE
clean_dataset.drop(['NAME_INCOME_TYPE',], axis = 1, inplace= True)
clean_dataset.shape

## 3.5 Multiple Categorical Vs Categorical
For this analysis we will use chi2 test to calculate the correlation between the variables

In [None]:
cat_cols = clean_dataset.columns[(clean_dataset.dtypes == 'object')]
cat_cols =np.append(cat_cols.values,binary_to_keep)
cat_cols

In [None]:
to_encoded = clean_dataset.copy()
label = preprocessing.LabelEncoder()
encoded_data = pd.DataFrame(to_encoded)
for col in cat_cols:
    encoded_data[col] = label.fit_transform(to_encoded[col])


In [None]:
encoded_data.head()

In [None]:
def calc_cramerV(col1, col2):
    cross_table = pd.crosstab(col1, col2)
    chi2 = chi2_contingency(cross_table)[0]
    n = cross_table.sum().sum()
    phi2 = chi2/n
    r,k = cross_table.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

rows = []
for i in range(len(cat_cols)):
    cols = []
    for j in range(len(cat_cols)):
        V = calc_cramerV(encoded_data[cat_cols[i]],encoded_data[cat_cols[j]])
        cols.append(V)
    rows.append(cols)

In [None]:
corr_matrix = pd.DataFrame(data = rows, index= cat_cols,columns = cat_cols )
corr_matrix

From the above matrix the correlation between we will remove a feature if correlated with another for over 85% <br> 
and we will keep a features if correlated with target output for over 5%

In [None]:
# correlated with target 
f_sel_data = clean_dataset.copy()
f_sel_data.drop(['NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 
                    'FLAG_OWN_CAR', 'FLAG_PHONE', 'REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY'], axis = 1, inplace= True)
# total_features = ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'CODE_GENDER','DAYS_EMPLOYED','AMT_INCOME_TOTAL',
#                  'CNT_FAM_MEMBERS','EXT_SOURCE_2','EXT_SOURCE_3','TARGET']
# # ,

In [None]:
f_sel_data.head()
# f_sel_data = f_sel_data[total_features]
# f_sel_data.head()

# Univariate Analaysis (Understand the Variables) 

### Distribution of applications

In [None]:
labels = [ 'Non-Defaulters', 'Defaulters']
plt.title("Distribution of Apps")
_ = plt.pie(f_sel_data.TARGET.value_counts(),labels=labels, explode= (0.1,0.1), autopct= "%1.1f%%", startangle=90)

Notice: that our dataset has 93% Non-Defaulters and 7% Defaulters so our dataset is <b>Impalanced<b>

### Female Vs Male applicants 

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2)
fig.set_figwidth(10)
fig.set_figheight(4)
_ = sns.countplot(x = 'TARGET', data= f_sel_data[f_sel_data.CODE_GENDER == 0], ax= ax1)
ax1.set_title('Distribution of Females')
ax1.set_ylim(0, len(f_sel_data))
ax1.set_xticklabels(['Non-Defaulter', 'Defaulter'])

_ = sns.countplot(x = 'TARGET', data= f_sel_data[f_sel_data.CODE_GENDER == 1], ax= ax2)
ax2.set_title('Distribution of Males')
ax2.set_ylim(0, len(f_sel_data))
ax2.set_xticklabels(['Non-Defaulter', 'Defaulter'])

Notice: Here the number of Non-defalters for men is less than the number of defaulters for women
To get better insight we better calculate percentage

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2)
fig.set_figwidth(10)
fig.set_figheight(4)
labels = [ 'Non-Defaulters', 'Defaulters']
ax1.pie(f_sel_data[f_sel_data.CODE_GENDER == 1].TARGET.value_counts(),labels=labels, explode= (0.1,0.1), autopct= "%1.1f%%", startangle=90)
ax1.set_title('Percentage of Males')

ax2.pie(f_sel_data[f_sel_data.CODE_GENDER == 0].TARGET.value_counts(),labels=labels, explode= (0.1,0.1), autopct= "%1.1f%%", startangle=90)
ax2.set_title('Percentage of Females')


Notice: Now we can see that the percentage of Non-Defaulted females is larger than men 

###  Family members for females and males 

In [None]:
plt.figure(figsize=(10,4))
plt.xticks(rotation = 45)
plt.title("Occupation type for Males")
sns.countplot(x = 'OCCUPATION_TYPE', data= f_sel_data[f_sel_data.CODE_GENDER == 1])
plt.figure(figsize=(10,4))
plt.xticks(rotation = 45)
plt.title("Occupation type for Females")
sns.countplot(x = 'OCCUPATION_TYPE', data= f_sel_data[f_sel_data.CODE_GENDER == 0])



Notice: we can see that most people who want a credit are laborers<br> 

In [None]:
X = f_sel_data.copy()

In [None]:
X.to_csv("Selected_dataset.csv", index=False)

# Learning Model

In [1]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import scipy.stats as stats
from sklearn import preprocessing
from scipy.stats import chi2_contingency
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import  sklearn.svm as svm
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

### Encoding Categorial Features

In [2]:
X = pd.read_csv("Selected_dataset.csv")
X.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,OCCUPATION_TYPE,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,FLOORSMAX_MEDI,FLAG_DOCUMENT_3
0,1,0,1,202500.0,Secondary / secondary special,Single / not married,9461,637,2120,Laborers,1.0,10,Business Entity Type 3,0.262949,0.139376,0.0833,1
1,0,0,0,270000.0,Higher education,Married,16765,1188,291,Core staff,2.0,11,School,0.622246,0.510853,0.2917,1
2,0,0,0,67500.0,Secondary / secondary special,Married,13439,2717,3227,Laborers,2.0,10,Business Entity Type 2,0.715042,0.176653,0.1667,1
3,0,0,1,225000.0,Secondary / secondary special,Married,14086,3028,4911,Drivers,3.0,13,Self-employed,0.566907,0.770087,0.3333,1
4,0,0,0,189000.0,Secondary / secondary special,Married,14583,203,2056,Laborers,2.0,9,Transport: type 2,0.642656,0.510853,0.6667,1


In [3]:
def encode_data(X):
    Y = X.TARGET
    X.drop(['TARGET'] , axis = 1, inplace = True)
    num_cols = ['DAYS_BIRTH','FLOORSMAX_MEDI','HOUR_APPR_PROCESS_START','DAYS_ID_PUBLISH','CNT_FAM_MEMBERS','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_EMPLOYED','AMT_INCOME_TOTAL']
    mu = X[num_cols].mean(axis = 0)
    X[num_cols] = X[num_cols] - mu
    X[num_cols] /= X[num_cols].max(axis = 0)
    encoder = OneHotEncoder(drop='if_binary', sparse=True)
    cat_cols = X.columns[X.dtypes == 'object']
    encoded_cols = encoder.fit_transform(X[cat_cols])
    encoded_X = X.drop(cat_cols, axis = 1)
    encoded_X = np.append( encoded_cols.toarray(), encoded_X.to_numpy(), axis= 1)
    return encoded_X, Y.to_numpy()



In [4]:
# Normalize continous data
encoded_X, Y = encode_data(X)

total = np.arange(0,len(X), 1)
test_indx = np.random.randint(0,len(X),size = int(0.1* len(X)))
train_indx = np.setdiff1d(total, test_indx)

X_test = encoded_X[test_indx].copy()
y_test = Y[test_indx].copy()
Y = Y[train_indx].copy()
encoded_X = encoded_X[train_indx].copy()

In [5]:
encoded_X.shape

(109596, 97)

In [7]:
X_test.shape

(12109, 97)

### Helpers 

In [8]:
def svm_clf(X_train, X_valid, y_train, y_valid,kernel , degree, nu = None,c = 1.0, plot = True):
    #fit model
    clf = None 
    if nu == None: 
        clf = svm.SVC(kernel = kernel, gamma='auto',C = c, degree = degree)
    else: 
        clf = svm.Nu_SVC(gamma='auto')
        
    clf.fit(X_train, y_train)
    
    #predict train set
    train_pred = clf.predict(X_train)
    valid_pred = clf.predict(X_valid)
    if plot: 
        print(f'Training Accuracy: {(train_pred == y_train).mean()*100:.2f} , Test Accuracy: {(valid_pred == y_valid).mean()*100:.2f}')
        plot_confusion(y_train, train_pred, y_valid, valid_pred)
    return clf, (train_pred == y_train).mean(),(valid_pred == y_valid).mean()*100

def plot_confusion(y_train, train_pred, y_valid, valid_pred):
    fig, (ax1, ax2) = plt.subplots(1,2)
    fig.set_figheight(5)
    fig.set_figwidth(8)
    
    ax1.set_title("Training Confuction Matrix")
    ax2.set_title("Testing Confuction Matrix")
    plt.subplots_adjust(left=0.1, bottom=0.1,right=0.9, top=0.9, wspace=0.5, hspace=0.4)
    
    cm = confusion_matrix(y_train, train_pred)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(ax = ax1)
     
    # predict test set 
   
    cm = confusion_matrix(y_valid, valid_pred)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(ax = ax2)
    
def over_sample(X, Y, test_size = 0.2):
    ros = RandomOverSampler()
    return ros.fit_resample(X, Y)
    
    
def under_sample(X,Y, test_size = 0.2): 
    rus = RandomUnderSampler()
    return rus.fit_resample(X,Y)
    

def dt_clf(X_train, X_valid, y_train, y_valid,max_depth = 34, plot = True):
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    train_pred = dt.predict(X_train)
    valid_pred = dt.predict(X_valid)
    if plot: 
        print(f"Max-Depth is: {dt.tree_.max_depth}")
        print(f"DT: Train Acc: {(train_pred == y_train).mean()*100:.2f} , Test Acc: {(valid_pred == y_valid).mean()*100:.2f}")
        plot_confusion(y_train, train_pred, y_test, test_pred)
    return dt, (train_pred == y_train).mean()*100,(valid_pred == y_valid).mean()*100

def knn_clf(X_train, X_valid, y_train, y_valid,k = 5 ,plot = True):
    clf = KNeighborsClassifier(n_neighbors = k)
    clf.fit(X_train, y_train)
    train_pred = clf.predict(X_train)
    valid_pred = clf.predict(X_valid)
    if plot:
        print(f'{int(np.sqrt(len(X_train)))}-NN Clf -> Train Accuracy: {(train_pred == y_train).mean()*100:.2f},  Test Accuracy: {(valid_pred == y_valid).mean()*100:.2f}')
        plot_confusion(y_train, train_pred, y_valid, valid_pred)
    return clf, (train_pred == y_train).mean()*100,(valid_pred == y_valid).mean()*100

def RF_clf(X_train, X_valid, y_train, y_valid, n_esimators,plot = True):
    rf = RandomForestClassifier(n_estimators= n_esimators)
    rf.fit(X_train, y_train)
    train_pred = rf.predict(X_train)
    valid_pred = rf.predict(X_valid)
    if plot: 
        print(f"DT: Train Acc: {(train_pred == y_train).mean()*100:.2f} , Test Acc: {(valid_pred == y_valid).mean()*100:.2f}")
        plot_confusion(y_train, train_pred, y_valid,  valid_pred)
    return rf, (train_pred == y_train).mean()*100,(valid_pred == y_valid).mean()*100


def tune_svm(X_train, X_valid, y_train, y_valid):
    kernel = ['poly', 'rbf', 'linear','precomputed', 'sigmoid']
    degree = np.arange(1, 10, 1)
    C = np.arange(1, 100, 5)
    best_model = (None, None, None)
    max_acc = 0
    for kr in kernel:
        for deg in degree:
            for c in C:
                _, tacc, vacc = svm_clf(X_train, X_valid, y_train, y_valid, kernel = kr,
                                        degree = deg,c = c, plot = False)
                if vacc > max_acc:
                    max_acc = vacc
                    best_model = (kr, deg, c)
                if tacc == 100:
                    break
    return best_model

def tune_knn( X_train, X_valid, y_train, y_valid):
    train_acc = [];  valid_acc = []
    best_k = None
    max_vacc = 0
    for k in np.arange(5, int(np.sqrt(len(X_train))), 2):
        _, tracc, vacc  = knn_clf(X_train, X_valid, y_train, y_valid,k , plot = False)
        train_acc.extend([tracc])
        valid_acc.extend([vacc])
        if vacc > max_vacc:
            max_vacc = vacc
            best_k = k
        if tracc == 100:
            break
            
    plt.figure()
    plt.title("Accuracy Graph vs K value")
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.plot(np.arange(5, int(np.sqrt(len(X_train))), 2), train_acc)
    plt.plot(np.arange(5, int(np.sqrt(len(X_train))), 2), valid_acc)
    return best_k

def tune_DT(X_train, X_valid, y_train, y_valid):
    train_acc = [];  valid_acc = []
    best_depth = None
    max_vacc = 0
    for i in np.arange(3,34,1):
        _ , tracc, vacc = dt_clf(X_train, X_valid, y_train, y_valid, max_depth = i, plot= False)
        train_acc.extend([tracc])
        valid_acc.extend([vacc])
        if vacc > max_vacc:
            max_vacc = vacc
            best_depth = i
        if tracc == 100:
            break
    plt.figure()
    plt.title("Accuracy Graph vs Max-Depth")
    plt.xlabel("Max-Depth")
    plt.ylabel("Accuracy")
    plt.plot(np.arange(3,34,1), train_acc)
    plt.plot(np.arange(3,34,1), valid_acc)
    return best_depth

## 1- Impalanced Data

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(encoded_X, Y, test_size=0.1)

### SVM

In [10]:
# best_model = tune_svm(X_train, X_valid, y_train, y_valid)

In [11]:
# svm_model, _, _ = svm_clf(X_train, X_valid, y_train, y_valid,
#                         kernel = best_model[0], degree =  best_model[1],c =  best_model[2], plot = False)

In [12]:
# pred = svm_model.predict(X_test)
# plot_confusion(pred,y_test,pred,y_test)
# print(f'Test Accuracy:{(pred == y_test).mean()}')
# print(classification_report(y_test,pred))

### KNN

In [14]:
best_k = tune_knn(X_train, X_valid, y_train, y_valid)

In [None]:
knn_model, _, _  = knn_clf(X_train, X_valid, y_train, y_valid, best_k, plot = False)

In [None]:
pred = knn_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### DT

In [16]:
best_depth = tune_DT(X_train, X_valid, y_train, y_valid)

In [None]:
dt_model ,_,_ = dt_clf(X_train, X_valid, y_train, y_valid, max_depth = best_depth, plot= False)

In [None]:
pred = dt_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### Random Forest with K-Fold

In [None]:
# kf = KFold(n_splits=10, shuffle=True, random_state=0)
# mean_train_acc = []
# mean_valid_acc = []
# best_fold = (None, None)
# best_valid_acc = 0
# train_acc = 0
# for train, valid in kf.split(encoded_X,Y):
#         _, tacc, vacc = RF_clf(encoded_X[train], encoded_X[valid], Y[train], Y[valid], n_esimators = 2, plot= False)
#         mean_train_acc.extend([tacc])
#         mean_valid_acc.extend([vacc])
#         if vacc > best_valid_acc:
#             best_valid_acc = vacc
#             train_acc = tacc
#             best_fold = (train, valid)
        
# plt.plot(range(len(mean_train_acc)),mean_train_acc)
# plt.plot(range(len(mean_valid_acc)),mean_valid_acc)

### Final Random Forest 

In [None]:
# X_train, X_valid, y_train, y_valid = encoded_X[best_fold[0]],encoded_X[best_fold[1]], Y[best_fold[0]], Y[best_fold[1]]
# rf_model, vacc, vacc = RF_clf(X_train, X_valid, y_train, y_valid, n_esimators = 2, plot= True)

In [None]:
# # X_test,y_test = encode_data(test_data)
# pred = rf_model.predict(X_test)
# plot_confusion(pred,y_test,pred,y_test)
# print(f'Test Accuracy:{(pred == y_test).mean()}')

## 2- UnderSampling

In [None]:
X_us, Y_us = under_sample(encoded_X, Y, test_size=0.1)
X_train, X_valid, y_train, y_valid = train_test_split(X_us, Y_us, test_size=0.1)

### SVM 

In [None]:
# best_model = tune_svm(X_train, X_valid, y_train, y_valid)

In [None]:
# svm_model, _, _ = svm_clf(X_train, X_valid, y_train, y_valid,
#                         kernel = best_model[0], degree =  best_model[1],c =  best_model[2], plot = False)

In [None]:
# pred = svm_model.predict(X_test)
# plot_confusion(pred,y_test,pred,y_test)
# print(f'Test Accuracy:{(pred == y_test).mean()}')
# print(classification_report(y_test,pred))

### KNN 

In [None]:
best_k = tune_knn(X_train, X_valid, y_train, y_valid)

In [None]:
knn_model, _, _  = knn_clf(X_train, X_valid, y_train, y_valid, best_k, plot = False)

In [None]:
pred = knn_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### Decision Tree

In [None]:
best_depth = tune_DT(X_train, X_valid, y_train, y_valid)

In [None]:
dt_model ,_,_ = dt_clf(X_train, X_valid, y_train, y_valid, max_depth = best_depth, plot= False)

In [None]:
pred = dt_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### Random Forest

In [None]:
# under sample dataset 
# rus = RandomUnderSampler()
# X_res, y_res = rus.fit_resample(encoded_X,Y)
# # K-Fold split
# kf = KFold(n_splits=10, shuffle=True, random_state=0)
# mean_train_acc = []
# mean_valid_acc = []
# best_fold = (None, None)
# best_valid_acc = 0
# train_acc = 0
# for n in np.arange(2,100,5):
#     mean_valid_acc = []
#     mean_train_acc = []
#     for train, valid in kf.split(X_res,y_res):
#             _, tacc, vacc = RF_clf(X_res[train], X_res[valid], y_res[train], y_res[valid], n_esimators = n, plot= False)
#             mean_train_acc.extend([tacc])
#             mean_valid_acc.extend([vacc])
#             if vacc > best_valid_acc:
#                 best_valid_acc = vacc
#                 train_acc = tacc
#                 best_fold = (train, valid)

#     plt.plot(range(len(mean_train_acc)),mean_train_acc)
#     plt.plot(range(len(mean_valid_acc)),mean_valid_acc)

In [None]:
# X_train, X_valid, y_train, y_valid = X_res[best_fold[0]],X_res[best_fold[1]], y_res[best_fold[0]], y_res[best_fold[1]]


### OverSampling

In [None]:
X_os, Y_os = over_sample(encoded_X, Y, test_size=0.1)
X_train, X_valid, y_train, y_valid = train_test_split(X_os, Y_os, test_size=0.1)

### DT

In [None]:
best_depth = tune_DT(X_train, X_valid, y_train, y_valid)

In [None]:
dt_model ,_,_ = dt_clf(X_train, X_valid, y_train, y_valid, max_depth = best_depth, plot= False)

In [None]:
pred = dt_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### KNN

In [None]:
best_k = tune_knn(X_train, X_valid, y_train, y_valid)

In [None]:
knn_model, _, _  = knn_clf(X_train, X_valid, y_train, y_valid, best_k, plot = False)

In [None]:
pred = knn_model.predict(X_test)
plot_confusion(pred,y_test,pred,y_test)
print(f'Test Accuracy:{(pred == y_test).mean()}')
print(classification_report(y_test,pred))

### SVM

In [None]:
# best_model = tune_svm(X_train, X_valid, y_train, y_valid)

In [None]:
# svm_model, _, _ = svm_clf(X_train, X_valid, y_train, y_valid,
#                         kernel = best_model[0], degree =  best_model[1],c =  best_model[2], plot = False)

In [None]:
# pred = svm_model.predict(X_test)
# plot_confusion(pred,y_test,pred,y_test)
# print(f'Test Accuracy:{(pred == y_test).mean()}')
# print(classification_report(y_test,pred))

In [None]:
# X_train, X_valid, y_train, y_valid = over_sample(encoded_X, Y, test_size=0.1)
# list_train_acc = []
# list_valid_acc = []
# best_valid_acc = 0
# best_n = None
# for n in np.arange(2,100,5):
#     _, tacc, vacc = RF_clf(X_train, X_valid, y_train, y_valid, n_esimators = n, plot= False)
#     list_train_acc.extend([tacc])
#     list_valid_acc.extend([vacc])
#     if tacc == 100:
#         break
#     if vacc > best_valid_acc:
#         best_valid_acc = vacc
#         best_n = n
        
# plt.figure()
# plt.title("Accuracy Vs N_Esimators")
# plt.plot(range(len(list_train_acc)),list_train_acc)
# plt.plot(range(len(list_valid_acc)),list_valid_acc)

In [None]:
# rf_model, _, _ = RF_clf(X_train, X_valid, y_train, y_valid, n_esimators = best_n, plot= False)
# # print(f"Traing Accuracy: {tracc}, Valid Accuracy: {vacc}")

In [None]:
# # X_test,y_test = encode_data(test_data)
# pred = rf_model.predict(X_test)
# plot_confusion(pred,y_test,pred,y_test)
# print(f'Test Accuracy:{(pred == y_test).mean()}')
# print(classification_report(y_test,pred))

The Best Model Gotten is Random Forest With OverSampling 