# Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

xgb.set_config(verbosity=0)
assert xgb.get_config()["verbosity"] == 0

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.sample(5)

Unnamed: 0,uid,NAME_CONTRACT_TYPE,TARGET,n_loans,total_loan_amount,mean_loan_amount,min_loan_amount,max_loan_amount,amount_overdue,overdue_ratio,...,late_days,late_months,loan_frequency,days_since_last_loan,n_enquiries,total_enquiry_amount,mean_enquiry_amount,min_enquiry_amount,max_enquiry_amount,enquiry_frequency
223671,BLT50430806,Cash loans,0,6,653854.005,108975.6675,36018.0,315000.0,0.0,0.0,...,984,8,0.002828,0,6,822000,137000.0,89000,181000,0.007585
86569,MBD17322167,Cash loans,0,4,505435.5,126358.875,34128.0,270000.0,0.0,0.0,...,9,1,0.002712,0,7,865000,123571.428571,23000,197000,0.010736
206138,BIU49312454,Cash loans,0,9,959269.05,106585.45,33750.0,225000.0,0.0,0.0,...,25,1,0.003552,0,1,155000,155000.0,155000,155000,0.001
177528,KOM94167488,Cash loans,0,2,485955.0,242977.5,35955.0,450000.0,0.0,0.0,...,0,0,0.008065,0,17,1749000,102882.352941,10000,197000,0.019016
220380,UMH69234659,Cash loans,1,2,532530.0,266265.0,247410.0,285120.0,0.0,0.0,...,0,0,0.002646,0,7,859000,122714.285714,27000,178000,0.011706


In [3]:
train_df['NAME_CONTRACT_TYPE'] = train_df['NAME_CONTRACT_TYPE'].map({'Cash loans': 0, 'Revolving loans': 1})

In [18]:
train_df.isnull().sum()

uid                       0
NAME_CONTRACT_TYPE        0
TARGET                    0
n_loans                   0
total_loan_amount         0
mean_loan_amount          0
min_loan_amount           0
max_loan_amount           0
amount_overdue            0
overdue_ratio             0
total_tenure              0
mean_tenure               0
min_tenure                0
max_tenure                0
days_per_rupee            0
running_loans             0
settled_loans             0
settlement_ratio          0
is_latest_loan_settled    0
late_days                 0
late_months               0
loan_frequency            0
days_since_last_loan      0
n_enquiries               0
total_enquiry_amount      0
mean_enquiry_amount       0
min_enquiry_amount        0
max_enquiry_amount        0
enquiry_frequency         0
dtype: int64

In [4]:
X = train_df.loc[:, (train_df.columns != 'TARGET') & (train_df.columns != 'uid')]
y = train_df['TARGET']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

## Preprocessing

In [6]:
scaler = StandardScaler().fit(X_train)

In [7]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
pca = PCA(n_components=10)
X_train_transformed = pca.fit_transform(X_train)
X_test_transformed = pca.transform(X_test)

In [31]:
pca.explained_variance_ratio_.sum()

0.8217486341400105

## XGBoost

In [43]:
xgboost = xgb.XGBClassifier(scale_pos_weight=12)
xgboost.fit(X_train_transformed, y_train)
y_pred = xgboost.predict(X_test_transformed)
y_prob = xgboost.predict_proba(X_test_transformed)
print(roc_auc_score(y_test, y_prob[:, 1]))
confusion_matrix(y_test, y_pred)

0.6128646107883738


array([[34829, 16837],
       [ 2253,  2061]], dtype=int64)

## SVC

In [21]:
clf = SVC(gamma='scale', class_weight='balanced')
clf.fit(X_train_transformed[:10000], y_train[:10000])
y_pred = clf.predict(X_test_transformed)
print(roc_auc_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.5741301368310536


array([[28942, 22724],
       [ 1777,  2537]], dtype=int64)

In [37]:
clf = SVC(gamma='scale', class_weight='balanced', kernel='linear')
clf.fit(X_train_transformed[:10000], y_train[:10000])
y_pred = clf.predict(X_test_transformed)
print(roc_auc_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.5691732825266299


array([[25795, 25871],
       [ 1557,  2757]], dtype=int64)

## Naive Bayes

In [33]:
y_balanced = np.concatenate((y_train[y_train == 0][:10000], y_train[y_train == 1][:10000]))
X_balanced = np.concatenate((X_train_transformed[y_train == 0][:10000], X_train_transformed[y_train == 1][:10000]))

In [34]:
y_test2d = np.vstack((y_test, 1 - y_test)).T

In [35]:
clf = GaussianNB()
clf.fit(X_balanced, y_balanced)
y_pred = clf.predict(X_test_transformed)
y_prob = clf.predict_proba(X_test_transformed)
print(roc_auc_score(y_test, y_prob[:, 1]))
confusion_matrix(y_test, y_pred)

0.5929240712891068


array([[40714, 10952],
       [ 2863,  1451]], dtype=int64)

## Logistic Regression

In [36]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)
y_prob = clf.predict_proba(X_test_transformed)
print(roc_auc_score(y_test, y_prob[:, 1]))
confusion_matrix(y_test, y_pred)

0.6057879929394216


array([[27409, 24257],
       [ 1635,  2679]], dtype=int64)

## Test Predictions

In [44]:
test_df['NAME_CONTRACT_TYPE'] = test_df['NAME_CONTRACT_TYPE'].map({'Cash loans': 0, 'Revolving loans': 1})
X_final = test_df.loc[:, (test_df.columns != 'TARGET') & (test_df.columns != 'uid')]
X_final = scaler.transform(X_final)
X_final = pca.transform(X_final)
y_pred = xgboost.predict_proba(X_final)[:, 1]

In [45]:
op_dict = {'uid': test_df['uid'], 'pred': y_pred}
op_df = pd.DataFrame(op_dict)

In [46]:
op_df.to_csv('final_submission.csv', index=False)