In [29]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

import os

In [40]:
# Directory that holds Bankruptcy.csv. Defaults to the original author's
# local folder; set the BANKRUPTCY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir = os.environ.get("BANKRUPTCY_DATA_DIR",
                          r"C:\Hogwarts\machine_learning\Cases\Bankruptcy")

# The working directory is still switched (as before) so that any relative
# paths used elsewhere in the notebook keep resolving against the data folder.
os.chdir(data_dir)

# The first CSV column (shown as "NO" in the output) becomes the row index.
bankr = pd.read_csv("Bankruptcy.csv",
                    index_col = 0)
bankr.head()

Unnamed: 0_level_0,D,YR,R1,R2,R3,R4,R5,R6,R7,R8,...,R15,R16,R17,R18,R19,R20,R21,R22,R23,R24
NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,78,0.23,0.08,0.02,0.03,0.46,0.12,0.19,10.36,...,0.05,0.57,0.15,0.23,3.56,0.26,1.55,0.43,0.11,0.17
2,0,77,0.19,0.07,0.09,0.12,0.02,0.02,0.03,3.13,...,0.09,0.12,0.16,0.22,3.78,1.29,1.4,0.06,0.07,0.1
3,0,72,0.07,0.02,0.03,0.05,0.06,0.1,0.14,2.41,...,-0.03,0.02,0.02,0.04,13.29,1.61,1.43,0.03,0.05,0.07
4,0,80,0.07,0.03,0.04,0.04,0.04,0.06,0.06,5.55,...,-0.02,0.01,0.02,0.02,5.36,1.3,1.12,-0.06,-0.08,-0.09
5,0,81,0.09,0.02,0.03,0.04,0.06,0.08,0.11,2.85,...,0.02,0.07,0.1,0.14,7.74,1.48,1.41,0.03,0.04,0.06


In [41]:
# Target is the "D" column; features are every remaining column except "YR".
y = bankr["D"]
X = bankr.drop(columns = ["D", "YR"])

In [42]:
# Stratified hold-out split: 70% of the rows go to training, the rest to test.
# The fixed seed keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size = 0.7,
    stratify = y,
    random_state = 2022,
)

In [43]:
# Linear-kernel SVM. probability=True enables predict_proba on the fitted
# model, which the ROC AUC cell relies on.
svm = SVC(
    kernel = "linear",
    probability = True,
    random_state = 2022,
)
svm.fit(X_train, y_train)

SVC(kernel='linear', probability=True, random_state=2022)

In [44]:
# Hard class predictions on the hold-out rows, scored by plain accuracy.
y_pred = svm.predict(X_test)
print(accuracy_score(y_true = y_test, y_pred = y_pred))

0.825


In [45]:
# Column 1 of predict_proba is the probability of the second entry in
# svm.classes_ (presumably D == 1 -- TODO confirm); it is the score fed to ROC AUC.
y_pred_prob = svm.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_true = y_test, y_score = y_pred_prob))

0.7875


Grid Search CV

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
kfold = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 2022,
)

# Fresh, unfitted linear SVM used as the search template.
# Note: this rebinds the name `svm` that the earlier cells fitted.
svm = SVC(
    kernel = "linear",
    probability = True,
    random_state = 2022,
)

# 20 evenly spaced candidate values of C between 0.001 and 10 (inclusive).
params = {"C": np.linspace(0.001, 10, 20)}

gcv = GridSearchCV(
    estimator = svm,
    param_grid = params,
    scoring = "roc_auc",
    cv = kfold,
    verbose = 3,
)
# The search is run on the full X, y (not just the training split).
gcv.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ...........................C=0.001;, score=0.830 total time=   0.0s
[CV 2/5] END ...........................C=0.001;, score=0.621 total time=   0.0s
[CV 3/5] END ...........................C=0.001;, score=0.645 total time=   0.0s
[CV 4/5] END ...........................C=0.001;, score=0.734 total time=   0.0s
[CV 5/5] END ...........................C=0.001;, score=0.527 total time=   0.0s
[CV 1/5] END ..............C=0.5272631578947369;, score=0.830 total time=   0.0s
[CV 2/5] END ..............C=0.5272631578947369;, score=0.841 total time=   0.0s
[CV 3/5] END ..............C=0.5272631578947369;, score=0.781 total time=   0.0s
[CV 4/5] END ..............C=0.5272631578947369;, score=0.959 total time=   0.0s
[CV 5/5] END ..............C=0.5272631578947369;, score=0.876 total time=   0.0s
[CV 1/5] END ..............C=1.0535263157894736;, score=0.835 total time=   0.0s
[CV 2/5] END ..............C=1.0535263157894736

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2022, shuffle=True),
             estimator=SVC(kernel='linear', probability=True,
                           random_state=2022),
             param_grid={'C': array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])},
             scoring='roc_auc', verbose=3)

In [50]:
# Parameter setting (value of C) that scored best in the grid search.
print(gcv.best_params_)

{'C': 0.5272631578947369}


In [48]:
# Mean cross-validated score (ROC AUC, per the search's `scoring`) of the best parameter setting.
print(gcv.best_score_)

0.8571428571428573
