In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0])

In [3]:

df['bankruptcy'] = (df['class']==b'1')
del df['class']
df.columns = ['X{0:02d}'.format(k) for k in range(1,65)] + ['bankruptcy']

In [4]:
df.describe()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64
count,9791.0,9791.0,9791.0,9749.0,9771.0,9791.0,9791.0,9773.0,9792.0,9791.0,...,9792.0,9771.0,9791.0,9776.0,9791.0,9178.0,9760.0,9771.0,9749.0,9561.0
mean,0.043019,0.596404,0.130959,8.1366,64.65164,-0.059273,0.059446,19.884016,1.882296,0.38904,...,7686.33,-0.992263,0.035022,1.133287,0.856053,118.156064,25.19443,2015.157,8.660813,35.949619
std,0.359321,4.587122,4.559074,290.647281,14759.39,6.812754,0.533344,698.697015,17.67465,4.590299,...,76052.61,77.007971,8.945365,8.038201,26.393305,3230.316692,1099.260821,117146.1,60.838202,483.318623
min,-12.458,0.0,-445.91,-0.045319,-379460.0,-486.82,-12.458,-1.8482,-0.032371,-445.91,...,-713220.0,-7522.1,-597.42,-30.892,-284.38,0.0,-12.656,-14965.0,-0.02439,-1.5e-05
25%,0.001321,0.263145,0.020377,1.047,-51.217,-0.000578,0.003004,0.4283,1.006675,0.29444,...,21.84,0.003121,0.008768,0.885722,0.0,5.356325,4.2677,43.234,2.9388,2.0129
50%,0.041364,0.46774,0.19929,1.5918,-0.055576,0.0,0.04882,1.0887,1.1613,0.51045,...,950.33,0.043679,0.098026,0.958305,0.002129,9.482,6.28355,74.729,4.8489,4.0416
75%,0.11113,0.689255,0.41067,2.8804,55.732,0.065322,0.12694,2.691,1.970225,0.71429,...,4694.55,0.11717,0.24268,0.996163,0.21179,19.506,9.9382,123.345,8.3638,9.4135
max,20.482,446.91,22.769,27146.0,1034100.0,322.2,38.618,53209.0,1704.8,12.602,...,6123700.0,112.02,226.76,668.75,1661.0,251570.0,108000.0,10779000.0,5662.4,21153.0


In [5]:
df.head()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X56,X57,X58,X59,X60,X61,X62,X63,X64,bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.10899,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.089372,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.054286,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.10263,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,0.43988,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,False


In [6]:
sum(df.bankruptcy == True)

515

In [7]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_imp = imp_mean.fit_transform(df.values)

In [8]:
from sklearn.model_selection import train_test_split

X, y = X_imp[:, :-1], X_imp[:, -1]

X_train, X_test, y_train, y_test =\
    train_test_split(X, y, 
                     test_size=0.3, 
                     random_state=0, 
                     stratify=y)

In [9]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

## Find the 3 most important features

Extract 3 features using PCA method.

In [11]:
from sklearn.decomposition import PCA
PCA = PCA(n_components=3)
PCA = PCA.fit(X_train_std)
PCA_X_train = PCA.transform(X_train_std)
PCA_X_train

array([[-0.27866734, -0.12838674, -0.83934568],
       [-0.49169703, -0.03954937, -0.37901576],
       [ 0.25337529, -0.0080618 ,  0.52857137],
       ...,
       [ 0.61759842, -0.15246874, -0.70193453],
       [-1.55606583,  0.21775718,  3.17678217],
       [ 0.46442677, -0.09412127, -0.4600359 ]])

In [12]:
PCA.explained_variance_

array([11.19940342,  5.8387438 ,  5.19411944])

In [13]:
PCA.explained_variance_ratio_

array([0.17496515, 0.09121706, 0.08114628])

## Apply LR / SVM / Decision Tree below

In [21]:
#LR
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=3),LogisticRegression(penalty='l1',solver='liblinear'))
pipe_lr.fit(X_train_std, y_train)
y_pred = pipe_lr.predict(X_test)

print('Training accuracy:', pipe_lr.score(X_train_std, y_train))
print('Test accuracy:', pipe_lr.score(X_test_std, y_test))

Training accuracy: 0.946892325649256
Test accuracy: 0.9462219196732471


In [26]:
#SVM
from sklearn.svm import SVC

pipe_svm = make_pipeline(StandardScaler(),PCA(n_components=3),SVC(C=1,kernel = 'rbf', degree=3 ,random_state = None, gamma = 'auto'))
pipe_svm.fit(X_train_std, y_train)

print('Training accuracy:', pipe_svm.score(X_train_std,y_train))
print('Test accuracy:',pipe_svm.score(X_test_std, y_test))

Training accuracy: 0.9483513276918588
Test accuracy: 0.9472430224642614


In [29]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
pipe_tree = make_pipeline(StandardScaler(),PCA(n_components=3),DecisionTreeClassifier(criterion = 'gini',splitter='random', max_depth= 4, random_state = 1))
pipe_tree.fit(X_train_std, y_train)
print('Training accuracy:', pipe_tree.score(X_train_std,y_train))
print('Test accuracy:',pipe_tree.score(X_test_std, y_test))

Training accuracy: 0.9477677268748176
Test accuracy: 0.9458815520762424


## Grid search & Cross validation

Use grid search for finding optimal hyperparameters. (PML p199). In the search, apply 10-fold cross-validation.

In [37]:
from sklearn.model_selection import GridSearchCV

penalty_range = ['l2','l1','elasticnet']
c_range = [0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
solver_range = ['liblinear','sag','newton-cg']
param_grid = {'penalty':penalty_range,'C':c_range,'solver':solver_range}

lr_gs = GridSearchCV(estimator = LogisticRegression(random_state=1),param_grid = param_grid,scoring='accuracy',refit=True,cv=10,n_jobs=-1)
lr_gs = lr_gs.fit(PCA_X_train,y_train)
print(lr_gs.best_score_)
print(lr_gs.best_params_)

0.9474758996403567
{'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}


In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

c_range = [0.01,0.1,1.0,10.0]
gamma_range = [0.01,0.1,1,2,3,4,5]
kernel_range = ['rbf','sigmoid']
param_grid = {'C':c_range, 'gamma':gamma_range,'kernel':kernel_range}

svm_gs = GridSearchCV(estimator = SVC(random_state=1),param_grid = param_grid,scoring='accuracy',refit=True,cv=10,n_jobs=-1)
svm_gs = svm_gs.fit(PCA_X_train, y_train)
print(svm_gs.best_score_)
print(svm_gs.best_params_)

0.9474758996403567
{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}


In [42]:

from sklearn.model_selection import GridSearchCV

criterion_range = ['gini','entropy']
splitter_range = ['best','random']
max_depth_range = [1,2,3,4,5]
param_grid = {'criterion':criterion_range, 'splitter':splitter_range, 'max_depth':max_depth_range}

tree_gs = GridSearchCV(estimator = DecisionTreeClassifier(random_state=1),param_grid = param_grid,scoring='accuracy',refit=True,cv=10,n_jobs=-1)
tree_gs = tree_gs.fit(PCA_X_train, y_train)

print(tree_gs.best_score_)
print(tree_gs.best_params_)

0.9474758996403567
{'criterion': 'gini', 'max_depth': 1, 'splitter': 'best'}


In [35]:
#  Decision tree grid search and cross validation
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.tree import DecisionTreeClassifier

regressor = DecisionTreeClassifier(random_state=0)
parameters = {'max_depth': range(1, 6)}
scoring_fnc = make_scorer(accuracy_score)
kfold = KFold(n_splits=10)
 
grid = GridSearchCV(regressor, parameters, scoring_fnc, cv=kfold)
grid = grid.fit(X_train, y_train)
reg = grid.best_estimator_

print('best score: %f'%grid.best_score_)
print('best parameters:')
for key in parameters.keys():
    print('%s: %d'%(key, reg.get_params()[key]))
 
print('test score: %f'%reg.score(X_test, y_test))
 


best score: 0.958127
best parameters:
max_depth: 3
test score: 0.956093


In [20]:
import pandas as pd
pd.DataFrame(grid.cv_results_).T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.00072813,0.000929022,0.000948572,0.000641346,0.00056777
std_fit_time,0.000261842,0.000270745,0.000195379,0.000114176,9.91952e-05
mean_score_time,0.000328612,0.000389051,0.000380039,0.000282598,0.000241518
std_score_time,0.000162438,0.000112084,9.10421e-05,7.37296e-05,5.68254e-05
param_max_depth,1,2,3,4,5
params,{'max_depth': 1},{'max_depth': 2},{'max_depth': 3},{'max_depth': 4},{'max_depth': 5}
split0_test_score,0.444444,0.888889,0.888889,0.888889,0.888889
split1_test_score,0.666667,1,1,1,1
split2_test_score,0.777778,0.888889,1,1,1
split3_test_score,0.333333,1,1,1,1


In [21]:
#  SVM grid search and cross validation
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score
 
# grid search start
best_score = 0
for gamma in [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000,100000]:
    for c in [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000,100000]:
        # 对于每种参数可能的组合，进行一次训练
        svm = SVC(gamma=gamma,C=c)
        scores = cross_val_score(svm,X_trainval,y_trainval,cv=10)
        score = scores.mean()
        # 找到表现最好的参数
        if score > best_score:
            best_score = score
            best_parameters = {'gamma':gamma,"C":c}
 

svm = SVC(**best_parameters)
svm.fit(X_train,y_train)
 
# evalyation 模型评估
test_score = svm.score(X_test,y_test)
 
print('Best socre:{:.2f}'.format(best_score))
print('Best parameters:{}'.format(best_parameters))
print('Best score on test set:{:.2f}'.format(test_score))


Best socre:0.98
Best parameters:{'gamma': 1e-05, 'C': 100000}
Best score on test set:0.97
