# Import the Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Load the Data

In [2]:
default = pd.read_csv('default.csv', header = 1)

In [3]:
default.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


# Data Cleaning

In [4]:
default.rename({'default payment next month':'DEFAULT'}, axis = 1, inplace = True)
default.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
default.drop(['ID'], axis = 1, inplace = True)
default.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
default.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   LIMIT_BAL  10000 non-null  int64
 1   SEX        10000 non-null  int64
 2   EDUCATION  10000 non-null  int64
 3   MARRIAGE   10000 non-null  int64
 4   AGE        10000 non-null  int64
 5   PAY_0      10000 non-null  int64
 6   PAY_2      10000 non-null  int64
 7   PAY_3      10000 non-null  int64
 8   PAY_4      10000 non-null  int64
 9   PAY_5      10000 non-null  int64
 10  PAY_6      10000 non-null  int64
 11  BILL_AMT1  10000 non-null  int64
 12  BILL_AMT2  10000 non-null  int64
 13  BILL_AMT3  10000 non-null  int64
 14  BILL_AMT4  10000 non-null  int64
 15  BILL_AMT5  10000 non-null  int64
 16  BILL_AMT6  10000 non-null  int64
 17  PAY_AMT1   10000 non-null  int64
 18  PAY_AMT2   10000 non-null  int64
 19  PAY_AMT3   10000 non-null  int64
 20  PAY_AMT4   10000 non-null  int64
 21  PAY_AMT5   10

#### Missing Data

In [7]:
default['SEX'].unique()

array([2, 1], dtype=int64)

In [8]:
default['EDUCATION'].unique()

array([2, 1, 3, 5, 4, 6, 0], dtype=int64)

In [9]:
default['MARRIAGE'].unique()

array([1, 2, 3, 0], dtype=int64)

In [10]:
len(default.loc[(default['EDUCATION'] == 0) | (default['MARRIAGE'] == 0)])

20

In [11]:
default = default.loc[(default['EDUCATION'] != 0) & (default['MARRIAGE'] != 0)]

In [13]:
features = default.drop(['DEFAULT'], axis = 1)
target = default['DEFAULT']

#### One-Hot Encoding

In [14]:
features = pd.get_dummies(features, columns = ['SEX', 'EDUCATION', 'MARRIAGE', 
                                               'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'])
features.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,20000,24,3913,3102,689,0,0,0,0,689,...,1,0,0,0,0,0,0,0,0,0
1,120000,26,2682,1725,2682,3272,3455,3261,0,1000,...,0,0,0,1,0,0,0,0,0,0
2,90000,34,29239,14027,13559,14331,14948,15549,1518,1500,...,0,0,1,0,0,0,0,0,0,0
3,50000,37,46990,48233,49291,28314,28959,29547,2000,2019,...,0,0,1,0,0,0,0,0,0,0
4,50000,57,8617,5670,35835,20940,19146,19131,2000,36681,...,0,0,1,0,0,0,0,0,0,0


# Split the Data

In [15]:
x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 42)

# Scale the Data

In [16]:
x_train = scale(x_train)
x_test = scale(x_test)

# Select and Train the Model

In [17]:
model = SVC(random_state = 42)
model.fit(x_train, y_train)

SVC(random_state=42)

In [18]:
prediction = model.predict(x_test)

# Evaluate the Model

In [19]:
the_matrix = confusion_matrix(y_test, prediction)
print(the_matrix)

[[1888   77]
 [ 388  142]]


# Adjust the Hyperparameters

In [20]:
param_grid = [{'C': [0.5, 1, 10, 100], 'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}]
optimal_params = GridSearchCV(SVC(), param_grid, cv = 5, scoring = 'accuracy', verbose = 0)
optimal_params.fit(x_train, y_train)
print(optimal_params.best_params_)

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [21]:
model = SVC(random_state = 42, C = 1, gamma = 'scale')
model.fit(x_train, y_train)

SVC(C=1, random_state=42)

In [22]:
the_matrix = confusion_matrix(y_test, prediction)
print(the_matrix)

[[1888   77]
 [ 388  142]]
