In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline

In [2]:
os.chdir("/Users/rishiagarwal/Downloads")

In [3]:
credit_data = pd.read_csv("default.csv",index_col = 0)

In [4]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
LIMIT_BAL                     30000 non-null int64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null int64
BILL_AMT2                     30000 non-null int64
BILL_AMT3                     30000 non-null int64
BILL_AMT4                     30000 non-null int64
BILL_AMT5                     30000 non-null int64
BILL_AMT6                     30000 non-null int64
PAY_AMT1                

In [5]:
cols_bill_amt = [col for col in credit_data.columns if ('BILL') in col]
cols_pay_amt = [col for col in credit_data.columns if ('PAY_AMT') in col]
cols_numerical = [col for col in credit_data.columns if ('AMT') in col]
cols_pay = [col for col in credit_data.columns if ('PAY_') in col]
cols_pay =cols_pay[0:6]

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, accuracy_score,f1_score,log_loss,precision_score,recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
import warnings
from sklearn.ensemble import BaggingClassifier as BC
warnings.filterwarnings("ignore", category=DeprecationWarning)
scaler = StandardScaler()

#the columns from 1 to 10 are all categorical features. So we need to fit One Hot Encoder only on them.
categorical_features = [i for i in range(1,11)]
enc = OneHotEncoder(categorical_features=categorical_features)
y_target = credit_data["default payment next month"]



In [7]:
#function to do all the data scaling and one hot encoding
def data_prep(data):
    #scaling the numerical values columns
    data[cols_numerical] = scaler.fit_transform(data[cols_numerical])
    #since the one hot encoder doesn't accept negative values, shifting each value by 2
    data[cols_pay] = data[cols_pay].apply(lambda x: x+2)
    #selection of all the rows for training
    x = data.iloc[:,0:23]
    #one hot encoding. Note that enc has been set on categorical features
    x = enc.fit_transform(x)
    return x

In [8]:
X_train = data_prep(credit_data)

In [9]:
#stratified sampling ensures that the test data follows the same pattern as the train data
X_train, X_test, y_train, y_test = train_test_split(X_train, y_target,test_size=0.2, 
                                                    stratify=credit_data["default payment next month"], 
                                                    random_state=42)

In [10]:
def gridsearch(X_train,y_train,X_test,y_test,model, params, scoring,cv=None):
    gs_cv = GridSearchCV(model,params,scoring=scoring,n_jobs=-1,cv=cv)
    gs_cv.fit(X_train, y_train)
    y_pred_train = gs_cv.predict(X_train)
    y_pred_test = gs_cv.predict(X_test)
    print("----------------------------TRAINING DATA----------------------------")
    print("Accuracy on training set is: "+str(accuracy_score(y_train,y_pred_train)))
    print("F1 score on training set is: "+str(f1_score(y_train,y_pred_train)))
    print("Precision on training set is: "+str(precision_score(y_train,y_pred_train)))
    print("Recall on training set is: "+str(recall_score(y_train,y_pred_train)))
    print("----------------------------TEST DATA----------------------------")     
    print("Accuracy on test set is: "+str(accuracy_score(y_test,y_pred_test)))
    print("F1 score on test set is: "+str(f1_score(y_test,y_pred_test)))
    print("Precision on test set is: "+str(precision_score(y_test,y_pred_test)))
    print("Recall on test set is: "+str(recall_score(y_test,y_pred_test)))
    print("-----------------------------------------------------------------")
    print("BEST PARAMETERS BASED ON "+scoring+" AS A SCORING PARAMETER")  
    print("Best parameters: "+str(gs_cv.best_params_))

In [11]:
svm_parameters = {'C': [0.001, 0.01, 0.1, 1, 10],'gamma':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [12]:
svm = SVC(kernel="rbf",tol=0.1,cache_size=1000)

In [13]:
gridsearch(X_train,y_train,X_test,y_test,svm,svm_parameters,"accuracy",3)

----------------------------TRAINING DATA----------------------------
Accuracy on training set is: 0.855125
F1 score on training set is: 0.571111385223
Precision on training set is: 0.827376697641
Recall on training set is: 0.436051987192
----------------------------TEST DATA----------------------------
Accuracy on test set is: 0.8075
F1 score on test set is: 0.391143911439
Precision on test set is: 0.650877192982
Recall on test set is: 0.279577995479
-----------------------------------------------------------------
BEST PARAMETERS BASED ON accuracy AS A SCORING PARAMETER
Best parameters: {'C': 1, 'gamma': 0.1}


In [15]:
svm = SVC(kernel="rbf",tol=0.01,cache_size=1000,class_weight={1:0.9,0:0.1})
svm_parameters = {'C': [0.001, 0.01, 0.1, 1, 10],'gamma':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gridsearch(X_train,y_train,X_test,y_test,svm,svm_parameters,"f1",3)

----------------------------TRAINING DATA----------------------------
Accuracy on training set is: 0.808708333333
F1 score on training set is: 0.697423054109
Precision on training set is: 0.536394971614
Recall on training set is: 0.996609530985
----------------------------TEST DATA----------------------------
Accuracy on test set is: 0.650166666667
F1 score on test set is: 0.443383717847
Precision on test set is: 0.342062193126
Recall on test set is: 0.629992464205
-----------------------------------------------------------------
BEST PARAMETERS BASED ON f1 AS A SCORING PARAMETER
Best parameters: {'C': 10, 'gamma': 0.1}


In [17]:
#here I try to use bagging.

n_estimators = 400
clf = BC(base_estimator=SVC(kernel="rbf",C=10,gamma=10,cache_size=1000),
         max_samples=1/n_estimators, n_estimators=n_estimators,n_jobs=-1)
clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=SVC(C=10, cache_size=1000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.0025, n_estimators=400, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [18]:
y_pred_train = clf.predict(X_train)

In [19]:
y_pred_test = clf.predict(X_test)

In [20]:
print("----------------------------TRAINING DATA----------------------------")
print("Accuracy on training set is: "+str(accuracy_score(y_train,y_pred_train)))
print("F1 score on training set is: "+str(f1_score(y_train,y_pred_train)))
print("Precision on training set is: "+str(precision_score(y_train,y_pred_train)))
print("Recall on training set is: "+str(recall_score(y_train,y_pred_train)))
print("----------------------------TEST DATA----------------------------")     
print("Accuracy on test set is: "+str(accuracy_score(y_test,y_pred_test)))
print("F1 score on test set is: "+str(f1_score(y_test,y_pred_test)))
print("Precision on test set is: "+str(precision_score(y_test,y_pred_test)))
print("Recall on test set is: "+str(recall_score(y_test,y_pred_test)))

----------------------------TRAINING DATA----------------------------
Accuracy on training set is: 0.778791666667
F1 score on training set is: 0.0
Precision on training set is: 0.0
Recall on training set is: 0.0
----------------------------TEST DATA----------------------------
Accuracy on test set is: 0.778833333333
F1 score on test set is: 0.0
Precision on test set is: 0.0
Recall on test set is: 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [21]:
param_grid={"n_estimators":[10,50,100,500,1000,2000],'max_samples': [0.1,0.5,0.6, 0.8, 1.0]}

In [None]:
clf = BC(base_estimator=SVC(kernel="rbf",C=10,gamma=10,cache_size=1000))
gridsearch(X_train,y_train,X_test,y_test,clf,param_grid,"f1",3)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
#as it can be seen that using bagging doesn't give good results.

In [None]:
#downsampling to reduce the computation time. 