In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [5]:
data

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# show categorical columns

for i in data:
    if data[i].nunique() < 7:
        
        print(i,data[i].nunique(),data[i].unique())

TARGET 2 [1 0]
NAME_CONTRACT_TYPE 2 ['Cash loans' 'Revolving loans']
CODE_GENDER 3 ['M' 'F' 'XNA']
FLAG_OWN_CAR 2 ['N' 'Y']
FLAG_OWN_REALTY 2 ['Y' 'N']
NAME_EDUCATION_TYPE 5 ['Secondary / secondary special' 'Higher education' 'Incomplete higher'
 'Lower secondary' 'Academic degree']
NAME_FAMILY_STATUS 6 ['Single / not married' 'Married' 'Civil marriage' 'Widow' 'Separated'
 'Unknown']
NAME_HOUSING_TYPE 6 ['House / apartment' 'Rented apartment' 'With parents'
 'Municipal apartment' 'Office apartment' 'Co-op apartment']
FLAG_MOBIL 2 [1 0]
FLAG_EMP_PHONE 2 [1 0]
FLAG_WORK_PHONE 2 [0 1]
FLAG_CONT_MOBILE 2 [1 0]
FLAG_PHONE 2 [1 0]
FLAG_EMAIL 2 [0 1]
REGION_RATING_CLIENT 3 [2 1 3]
REGION_RATING_CLIENT_W_CITY 3 [2 1 3]
REG_REGION_NOT_LIVE_REGION 2 [0 1]
REG_REGION_NOT_WORK_REGION 2 [0 1]
LIVE_REGION_NOT_WORK_REGION 2 [0 1]
REG_CITY_NOT_LIVE_CITY 2 [0 1]
REG_CITY_NOT_WORK_CITY 2 [0 1]
LIVE_CITY_NOT_WORK_CITY 2 [0 1]
FONDKAPREMONT_MODE 4 ['reg oper account' nan 'org spec account' 'reg oper spec

In [7]:
# select some columns

data1=data[['TARGET','FLAG_OWN_CAR','NAME_EDUCATION_TYPE','NAME_CONTRACT_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','AMT_REQ_CREDIT_BUREAU_HOUR']]

data1.isnull().sum()

data1=data1.head(100000)

In [8]:
data1.head(3)

Unnamed: 0,TARGET,FLAG_OWN_CAR,NAME_EDUCATION_TYPE,NAME_CONTRACT_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,AMT_REQ_CREDIT_BUREAU_HOUR
0,1,N,Secondary / secondary special,Cash loans,Single / not married,House / apartment,0.0
1,0,N,Higher education,Cash loans,Married,House / apartment,0.0
2,0,Y,Secondary / secondary special,Revolving loans,Single / not married,House / apartment,0.0


In [9]:
print(data1.groupby('NAME_CONTRACT_TYPE')['TARGET'].mean())

print(data1.groupby('NAME_FAMILY_STATUS')['TARGET'].mean())

print(data1.groupby('NAME_HOUSING_TYPE')['TARGET'].mean())

NAME_CONTRACT_TYPE
Cash loans         0.083542
Revolving loans    0.055995
Name: TARGET, dtype: float64
NAME_FAMILY_STATUS
Civil marriage          0.101955
Married                 0.074838
Separated               0.085710
Single / not married    0.100062
Unknown                 0.000000
Widow                   0.057290
Name: TARGET, dtype: float64
NAME_HOUSING_TYPE
Co-op apartment        0.074586
House / apartment      0.078599
Municipal apartment    0.078667
Office apartment       0.063754
Rented apartment       0.113772
With parents           0.118630
Name: TARGET, dtype: float64


In [10]:
data1['FLAG_OWN_CAR'] = data1['FLAG_OWN_CAR'].map( {'Y': 1, 'N': 0} )

data1['NAME_EDUCATION_TYPE']=data1['NAME_EDUCATION_TYPE'].map({'Lower secondary':1, 'Secondary / secondary special':2,
                                    'Incomplete higher':3,'Higher education':4,'Academic degree':5 })
                                    
data1['NAME_CONTRACT_TYPE'] = data1['NAME_CONTRACT_TYPE'].map({'Cash loans':0.083751,'Revolving loans':0.049545})

data1['NAME_FAMILY_STATUS'] = data1['NAME_FAMILY_STATUS'].map({'Civil marriage':0.099197, 'Married':0.074622, 'Separated':0.086569,
'Single / not married':0.099781, 'Unknown':0.000000, 'Widow':0.056989})

data1['NAME_HOUSING_TYPE'] = data1['NAME_HOUSING_TYPE'].map({'Co-op apartment':0.078534,'House / apartment':0.078275, 
'Municipal apartment':0.078591, 'Office apartment':0.067916, 'Rented apartment':0.113134, 'With parents':0.115465 })



In [11]:
# imputing missing values with KNN imputer

knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

df_knn_imputed = pd.DataFrame(knn_imputer.fit_transform(data1), columns=data1.columns)

In [None]:
# her defe running ile vaxt getmesin modelde ist edilecek hazır datanı  save etdim.

df_knn_imputed.to_excel('09.01.2023_data.xlsx')

In [None]:
df_knn_imputed=pd.read_excel(r'C:\Users\qw\Desktop\Kod akademi\Tasks\Datasets\09.01.2023_data.xlsx')

In [None]:
df_knn_imputed.drop('Unnamed: 0', axis=1,inplace=True)


In [None]:
df_knn_imputed

Unnamed: 0,TARGET,FLAG_OWN_CAR,NAME_EDUCATION_TYPE,NAME_CONTRACT_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,AMT_REQ_CREDIT_BUREAU_HOUR
0,1,0,2,0.083751,0.099781,0.078275,0.0
1,0,0,4,0.083751,0.074622,0.078275,0.0
2,0,1,2,0.049545,0.099781,0.078275,0.0
3,0,0,2,0.083751,0.099197,0.078275,0.0
4,0,0,2,0.083751,0.099781,0.078275,0.0
...,...,...,...,...,...,...,...
99995,0,0,2,0.083751,0.074622,0.078275,0.0
99996,0,0,4,0.083751,0.074622,0.078275,0.0
99997,0,1,2,0.083751,0.074622,0.078275,0.0
99998,0,1,2,0.083751,0.099197,0.115465,0.0


In [None]:
df_knn_imputed_0=df_knn_imputed[df_knn_imputed['TARGET']==0].sample(frac=0.055)

df_knn_imputed_1=df_knn_imputed[df_knn_imputed['TARGET']==1]

In [None]:
df_knn_imputed=pd.concat([df_knn_imputed_0, df_knn_imputed_1])

In [None]:
X=df_knn_imputed[['FLAG_OWN_CAR','NAME_EDUCATION_TYPE','NAME_CONTRACT_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','AMT_REQ_CREDIT_BUREAU_HOUR']]

y=df_knn_imputed['TARGET']


In [None]:
# Split dataset into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

array([[  39, 1474],
       [  16, 2416]], dtype=int64)

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(39, 1474, 16, 2416)

In [None]:
print('accuracy_score', accuracy_score(y_test, y_pred))
print('precision_score', precision_score(y_test, y_pred))
print('recall_score', recall_score(y_test, y_pred))
print('f1_score', f1_score(y_test, y_pred))

accuracy_score 0.6223067173637515
precision_score 0.6210796915167095
recall_score 0.993421052631579
f1_score 0.7643150901613415


In [None]:
trainpred = clf.predict(X_train)

In [None]:
recall_score(y_train,trainpred)

0.9936406995230525