In [None]:
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest #feature selection
from sklearn.feature_selection import chi2
from sklearn.utils import resample #re-sampling
from sklearn.model_selection import train_test_split #pembagian data
from sklearn.preprocessing import MinMaxScaler  #normalisasi fitur
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.svm import SVC #svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier #random forest
from sklearn.neighbors import KNeighborsClassifier #k-nearest neighbor
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.metrics import roc_auc_score #roc score
%matplotlib
%matplotlib inline
%matplotlib inline

In [None]:
# Lift pandas' display truncation so every column and every row is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the training applications and report the frame dimensions.
df_train = pd.read_csv("application_train.csv", sep=',')
train_rows, train_cols = df_train.shape
print('This dataset has %d rows dan %d columns.\n' % (train_rows, train_cols))
df_train.head()

In [None]:
# Print the dtype / non-null summary for every column (verbose keeps the full list).
print('Data types: \n')
df_train.info(
    verbose=True,
)

In [None]:
# Independent copy used for exploration, so df_train itself is left untouched here.
train = df_train.copy(deep=True)

In [None]:
# Human-readable text for each TARGET code.
decode_map = {
    0: "Tanpa Kesulitan Pembayaran",
    1: "Kesulitan Pembayaran",
}


def decode_sentiment(label):
    """Return the descriptive text for a TARGET code.

    ``label`` is converted with ``int()`` before the lookup, so numeric
    strings and floats are accepted. Raises ``KeyError`` when the converted
    code is not a key of ``decode_map``.
    """
    code = int(label)
    return decode_map[code]

# Decode the numeric TARGET codes into descriptive labels for exploration.
# The codes are read from df_train (still numeric) rather than from `train`
# itself, so re-running this cell does not fail on already-decoded strings.
train['TARGET'] = df_train['TARGET'].apply(decode_sentiment)

In [None]:
# Count rows per TARGET label, largest group first.
target_only = train[['TARGET']]
counts_by_target = target_only.groupby("TARGET").agg(COUNT=("TARGET", "count"))
target_grp = counts_by_target.sort_values(by=["COUNT"], ascending=False).reset_index()

# Shade the table so the larger count stands out.
target_grp.style.background_gradient(cmap='Blues')


In [None]:
# Relative frequency of each TARGET label (fractions that sum to 1).
grp = train['TARGET'].value_counts(normalize=True)
grp_table = grp.reset_index()
grp_table.style.background_gradient(cmap='Blues')

In [None]:
# Bar chart of the TARGET label proportions computed in `grp`.
fig = plt.figure(figsize=(12, 7))
ax = grp.plot(kind='bar', color=['midnightblue', 'gray'], alpha=0.9, rot=0)
ax.set_title('The Distribution of Clients Repayment Abilities\n', fontsize=14)
plt.show()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
duplicate_rows = df_train.duplicated().sum()
print('The number of duplication is:', duplicate_rows)


In [None]:
# Per-column null counts and their share of all rows, worst columns first.
has_missing = df_train.isnull().values.any()
print('Missing values status:', has_missing)

nvc = pd.DataFrame(df_train.isnull().sum(), columns=['Total Null Values'])
nvc['Percentage'] = nvc['Total Null Values'] / len(df_train) * 100
nvc.sort_values(by=['Percentage'], ascending=False).reset_index()

In [None]:
# Drop the block of columns at positions 44..90, then two named columns.
# NOTE(review): the first drop is positional and mutates df_train in place, so
# re-running this cell removes a *different* set of columns each time.
positional_columns = list(df_train.columns[44:91])
df_train.drop(columns=positional_columns, inplace=True)
df_train.drop(columns=['OWN_CAR_AGE', 'EXT_SOURCE_1'], inplace=True)

In [None]:
# Re-check the null counts on the columns that remain after the drops.
has_missing = df_train.isnull().values.any()
print('Missing values status:', has_missing)

remaining_null_counts = df_train.isnull().sum().sort_values()
nvc = pd.DataFrame(remaining_null_counts, columns=['Total Null Values'])
nvc['Percentage'] = nvc['Total Null Values'] / len(df_train) * 100
nvc.sort_values(by=['Percentage'], ascending=False).reset_index()

In [None]:
# Column-name lists by dtype; only category_columns drives the imputation below.
category_columns = df_train.select_dtypes(include=['object']).columns.tolist()
integer_columns = df_train.select_dtypes(include=['int64','float64']).columns.tolist()

# Fill nulls column by column: most frequent value for object columns,
# median for everything else. Columns without nulls are left alone.
for column in df_train.columns:
    values = df_train[column]
    if not values.isnull().any():
        continue
    if column in category_columns:
        fill_value = values.mode()[0]
    else:
        fill_value = values.median()
    df_train[column] = values.fillna(fill_value)

In [None]:
# Categorical columns that are replaced by integer codes.
label_encoded_columns = [
    "CODE_GENDER", "NAME_CONTRACT_TYPE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
    "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE", "OCCUPATION_TYPE", "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
]

# fit_transform is applied per column, so each column is fitted independently.
column_encoder = LabelEncoder()
df_train[label_encoded_columns] = df_train[label_encoded_columns].apply(column_encoder.fit_transform)

In [None]:
# Preview the first five rows after label encoding.
df_train.head(n=5)

In [None]:
# Remove the SK_ID_CURR column from the training frame (in place).
df_train.drop(columns=['SK_ID_CURR'], inplace=True)


In [None]:
# Preview the first five rows after SK_ID_CURR was removed.
df_train.head(n=5)

In [None]:
# Replace the values in columns 16..19 and column 45 with their absolute values.
# NOTE(review): the columns are addressed by position; presumably these are the
# DAYS_* columns -- confirm against the current column order.
day_block = slice(16, 20)
df_train.iloc[:, day_block] = df_train.iloc[:, day_block].abs()

single_position = 45
df_train.iloc[:, single_position] = df_train.iloc[:, single_position].abs()

In [None]:
# Separate the label column from the remaining feature columns.
y = df_train['TARGET']
x = df_train.drop(columns=['TARGET'])

In [None]:
# Split the rows by class.
df_majority = df_train[df_train['TARGET'] == 0]
df_minority = df_train[df_train['TARGET'] == 1]

# Sample the minority class with replacement until it matches the majority
# class. The target size is taken from the data instead of a hard-coded row
# count, so the classes stay balanced if the input file changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_upsampled = pd.concat([df_minority_upsampled, df_majority])

In [None]:
# Feature columns used for modelling.
selected_features = [
    'DAYS_EMPLOYED', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
    'DAYS_BIRTH', 'AMT_INCOME_TOTAL', 'DAYS_REGISTRATION',
    'DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH', 'AMT_ANNUITY',
    'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE', 'REG_CITY_NOT_WORK_CITY',
    'CODE_GENDER', 'EXT_SOURCE_2', 'REG_CITY_NOT_LIVE_CITY', 'NAME_EDUCATION_TYPE',
    'DEF_30_CNT_SOCIAL_CIRCLE', 'EXT_SOURCE_3', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVE_CITY_NOT_WORK_CITY',
]

# Features and label taken from the upsampled frame.
x_balanced = df_upsampled[selected_features]
y_balanced = df_upsampled['TARGET']

In [None]:
# Split BEFORE balancing. Splitting the already-upsampled frame lets copies of
# the same minority row land in both the training and the test set, which
# inflates every test metric. Here the original rows are split first
# (stratified on TARGET) and only the training part is upsampled; the test
# part keeps the original class distribution.
feature_columns = list(x_balanced.columns)

train_part, test_part = train_test_split(
    df_train, test_size=0.2, random_state=42, stratify=df_train['TARGET']
)

train_majority = train_part[train_part['TARGET'] == 0]
train_minority = train_part[train_part['TARGET'] == 1]
train_minority_upsampled = resample(train_minority,
                                    replace=True,
                                    n_samples=len(train_majority),
                                    random_state=42)

# Shuffle so the training rows are not ordered by class.
train_balanced = (pd.concat([train_minority_upsampled, train_majority])
                    .sample(frac=1, random_state=42))

X_train, y_train = train_balanced[feature_columns], train_balanced['TARGET']
X_test, y_test = test_part[feature_columns], test_part['TARGET']
print(X_train.shape, X_test.shape)

In [None]:
# Scale every feature to [0, 1] using the training data's min/max only.
# The test set must be transformed with the already-fitted scaler: calling
# fit_transform on it would re-fit the scaler on test data, so train and test
# would be mapped with different ranges.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a logistic regression with default settings on the training data.
model_log = LogisticRegression()
model_log.fit(X_train, y_train)
print(model_log)



In [None]:
# Per-class precision / recall / F1 of the logistic regression on the training data.
y_train_pred_log = model_log.predict(X_train)
print('Logistic Regression :', classification_report(y_train, y_train_pred_log), sep='\n')

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test data.
y_test_pred_log = model_log.predict(X_test)
print('Logistic Regression :', classification_report(y_test, y_test_pred_log), sep='\n')

In [None]:
# Accuracy in percent (two decimals) on the training and the test data.
akurasi_log_train = round(100 * model_log.score(X_train, y_train), 2)
akurasi_log_test = round(100 * model_log.score(X_test, y_test), 2)
for template, value in (("Training Accuracy: % {}", akurasi_log_train),
                        ("Test Accuracy: % {}", akurasi_log_test)):
    print(template.format(value))

In [None]:
# ROC AUC in percent. The score is computed from the predicted probability of
# the positive class (column 1), not from the thresholded 0/1 predictions:
# with hard labels the ROC curve collapses to a single operating point.
y_test_score_log = model_log.predict_proba(X_test)[:, 1]
roc_auc_log = round(roc_auc_score(y_test, y_test_score_log)*100,2)
print('ROC AUC:', roc_auc_log)

In [None]:
# Fit a support vector classifier; max_iter=1000 caps the solver's iterations.
model_svm = SVC(max_iter=1000)
model_svm.fit(X_train, y_train)

print(model_svm)

In [None]:
# Per-class precision / recall / F1 of the SVM on the test data.
y_test_pred_svm = model_svm.predict(X_test)
print('SVM :', classification_report(y_test, y_test_pred_svm), sep='\n')

In [None]:
# Accuracy in percent (two decimals) on the training and the test data.
akurasi_svm_train = round(100 * model_svm.score(X_train, y_train), 2)
akurasi_svm_test = round(100 * model_svm.score(X_test, y_test), 2)
for template, value in (("Training Accuracy: % {}", akurasi_svm_train),
                        ("Test Accuracy: % {}", akurasi_svm_test)):
    print(template.format(value))

In [None]:
# ROC AUC in percent. The SVC was not fitted with probability=True, so the
# continuous decision_function output is used as the ranking score instead of
# the thresholded 0/1 predictions.
y_test_score_svm = model_svm.decision_function(X_test)
roc_auc_svm = round(roc_auc_score(y_test, y_test_score_svm)*100,2)
print('Skor ROC AUC:', roc_auc_svm)

In [None]:
# Fit a k-nearest-neighbours classifier with default settings.
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
print(model_knn)

In [None]:
# Per-class precision / recall / F1 of the KNN model on the training data.
y_train_pred_knn = model_knn.predict(X_train)
print('K-Nearest Neighbors :', classification_report(y_train, y_train_pred_knn), sep='\n')

In [None]:
# Per-class precision / recall / F1 of the KNN model on the test data.
y_test_pred_knn = model_knn.predict(X_test)
print('K-Nearest Neighbors :', classification_report(y_test, y_test_pred_knn), sep='\n')

In [None]:
# Accuracy in percent (two decimals) on the training and the test data.
akurasi_knn_train = round(100 * model_knn.score(X_train, y_train), 2)
akurasi_knn_test = round(100 * model_knn.score(X_test, y_test), 2)
for template, value in (("Training Accuracy: % {}", akurasi_knn_train),
                        ("Test Accuracy: % {}", akurasi_knn_test)):
    print(template.format(value))

In [None]:
# ROC AUC in percent, computed from the predicted probability of the positive
# class (column 1) rather than from the thresholded 0/1 predictions.
y_test_score_knn = model_knn.predict_proba(X_test)[:, 1]
roc_auc_knn = round(roc_auc_score(y_test, y_test_score_knn)*100,2)
print('Skor ROC AUC:', roc_auc_knn)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree. random_state is fixed (same seed as the resampling and
# the train/test split) so the fitted tree is reproducible between runs.
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train,y_train)
print(model_dt)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the training data.
y_train_pred_dt = model_dt.predict(X_train)
print('Decision Tree :', classification_report(y_train, y_train_pred_dt), sep='\n')

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test data.
y_test_pred_dt = model_dt.predict(X_test)
print('Decision Tree :', classification_report(y_test, y_test_pred_dt), sep='\n')

In [None]:
# Accuracy in percent (two decimals) on the training and the test data.
akurasi_dt_train = round(100 * model_dt.score(X_train, y_train), 2)
akurasi_dt_test = round(100 * model_dt.score(X_test, y_test), 2)
for template, value in (("Training Accuracy: % {}", akurasi_dt_train),
                        ("Test Accuracy: % {}", akurasi_dt_test)):
    print(template.format(value))

In [None]:
# ROC AUC in percent for the decision tree, computed from the predicted
# probability of the positive class (column 1) rather than from the
# thresholded 0/1 predictions.
y_test_score_dt = model_dt.predict_proba(X_test)[:, 1]
roc_auc_dt = round(roc_auc_score(y_test, y_test_score_dt)*100,2)
# Print the decision-tree score (the KNN score was printed here by mistake).
print('Skor ROC AUC:', roc_auc_dt)

In [None]:
# Comparison table built from the metrics computed in the cells above, so it
# always reflects the current run. The hand-typed numbers could go stale, and
# the Decision Tree ROC value had been copied from the KNN row.
hasil = pd.DataFrame(
    [
        ["Logistic Regression", akurasi_log_train, akurasi_log_test, roc_auc_log],
        [" SVM", akurasi_svm_train, akurasi_svm_test, roc_auc_svm],
        ["Decision Tree", akurasi_dt_train, akurasi_dt_test, roc_auc_dt],
        ["K-Nearest Neighbor", akurasi_knn_train, akurasi_knn_test, roc_auc_knn],
    ],
    columns=["Metode Algoritma", "Skor Training Accuracy", "Skor Testing Accuracy", "Skor ROC"],
)

# Best ROC score first.
hasil.sort_values(by=['Skor ROC'], ascending=False).style.background_gradient(cmap='Blues')

In [None]:
# Load the applications to be scored and report the frame dimensions.
df_test = pd.read_csv("application_test.csv", sep=',')
test_rows, test_cols = df_test.shape
print('Dataset ini memiliki %d baris dan %d kolom.\n' % (test_rows, test_cols))
df_test.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
duplicate_test_rows = df_test.duplicated().sum()
print('Jumlah data duplikat:', duplicate_test_rows)

In [None]:
# Per-column null counts and their share of all rows, worst columns first.
test_has_missing = df_test.isnull().values.any()
print('Status value yang hilang:', test_has_missing)

jvp = pd.DataFrame(df_test.isnull().sum(), columns=['Total Null Values'])
jvp['Percentage'] = jvp['Total Null Values'] / len(df_test) * 100
jvp.sort_values(by=['Percentage'], ascending=False).reset_index()

In [None]:
# Drop the block of columns at positions 43..89, then two named columns.
# NOTE(review): the first drop is positional and mutates df_test in place, so
# re-running this cell removes a *different* set of columns each time.
positional_test_columns = list(df_test.columns[43:90])
df_test.drop(columns=positional_test_columns, inplace=True)
df_test.drop(columns=['OWN_CAR_AGE', 'EXT_SOURCE_1'], inplace=True)

In [None]:
# Re-check the null counts on the columns that remain after the drops.
test_has_missing = df_test.isnull().values.any()
print('Status value yang hilang:', test_has_missing)

tvc = pd.DataFrame(df_test.isnull().sum(), columns=['Total Null Values'])
tvc['Percentage'] = tvc['Total Null Values'] / len(df_test) * 100
tvc.sort_values(by=['Percentage'], ascending=False).reset_index()

In [None]:
# Column-name lists by dtype; only category_columns drives the imputation below.
category_columns = df_test.select_dtypes(include=['object']).columns.tolist()
integer_columns = df_test.select_dtypes(include=['int64','float64']).columns.tolist()

# Fill nulls column by column: most frequent value for object columns,
# median for everything else. The fill values come from df_test itself.
for column in df_test.columns:
    values = df_test[column]
    if not values.isnull().any():
        continue
    if column in category_columns:
        fill_value = values.mode()[0]
    else:
        fill_value = values.median()
    df_test[column] = values.fillna(fill_value)

In [None]:
# Categorical columns that are replaced by integer codes.
test_label_encoded_columns = [
    "CODE_GENDER", "NAME_CONTRACT_TYPE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
    "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE", "OCCUPATION_TYPE", "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
]

# Encode with the TRAINING categories. Re-fitting a LabelEncoder on the test
# data assigns codes from the test set's own sorted categories, so the same
# category can receive a different integer than the one the models were
# trained on whenever the two files do not contain identical category sets.
# `train` is the untouched copy of the raw training frame and still holds the
# original string categories. A category that never occurs in the training
# data makes transform() raise instead of being silently mis-coded.
for column in test_label_encoded_columns:
    train_encoder = LabelEncoder().fit(train[column].dropna())
    df_test[column] = train_encoder.transform(df_test[column])

In [None]:
# Replace the values in columns 16..19 and column 45 with their absolute values.
# NOTE(review): the columns are addressed by position; presumably these are the
# DAYS_* columns -- confirm against the current column order of df_test.
test_day_block = slice(16, 20)
df_test.iloc[:, test_day_block] = df_test.iloc[:, test_day_block].abs()

test_single_position = 45
df_test.iloc[:, test_single_position] = df_test.iloc[:, test_single_position].abs()

In [None]:
# Feature columns passed to the model for prediction.
prediction_features = [
    'DAYS_EMPLOYED', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
    'DAYS_BIRTH', 'AMT_INCOME_TOTAL', 'DAYS_REGISTRATION',
    'DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH', 'AMT_ANNUITY',
    'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE', 'REG_CITY_NOT_WORK_CITY',
    'CODE_GENDER', 'EXT_SOURCE_2', 'REG_CITY_NOT_LIVE_CITY', 'NAME_EDUCATION_TYPE',
    'DEF_30_CNT_SOCIAL_CIRCLE', 'EXT_SOURCE_3', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVE_CITY_NOT_WORK_CITY',
]

pred_test = df_test[prediction_features]
pred_test.head()

In [None]:
# lets predict!
# model_knn was fitted on MinMax-scaled features, so the prediction features
# must go through the same fitted scaler first. Raw values would make the
# distance computation meaningless.
pred_test_scaled = scaler.transform(pred_test)
predict = pd.Series(model_knn.predict(pred_test_scaled), name = "TARGET").astype(int)

# Pair each prediction with its SK_ID_CURR and write the result to CSV.
results = pd.concat([df_test['SK_ID_CURR'], predict],axis = 1)
results.to_csv("hasilprediksi.csv", index = False)
results.head()