# load data

In [1]:
import zipfile

import pandas as pd

# Show up to 500 columns when a frame is displayed.
pd.set_option('display.max_columns', 500)

# Read the CSV directly out of the zip archive. Both the archive and the member
# stream are opened as context managers so their file handles are released as
# soon as the frame is loaded (previously the member stream was left open and
# stayed alive as a notebook global).
with zipfile.ZipFile('KaggleCredit2.csv.zip', 'r') as z:
    with z.open('KaggleCredit2.csv') as f:
        # The first CSV column is used as the row index.
        data = pd.read_csv(f, index_col=0)
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45.0,2.0,0.802982,9120.0,13.0,0.0,6.0,0.0,2.0
1,0,0.957151,40.0,0.0,0.121876,2600.0,4.0,0.0,0.0,0.0,1.0
2,0,0.65818,38.0,1.0,0.085113,3042.0,2.0,1.0,0.0,0.0,0.0
3,0,0.23381,30.0,0.0,0.03605,3300.0,5.0,0.0,0.0,0.0,0.0
4,0,0.907239,49.0,1.0,0.024926,63588.0,7.0,0.0,1.0,0.0,0.0


In [2]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(112915, 11)

# drop na

In [3]:
# Number of missing values in each column.
data.isna().sum()

SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                     4267
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                              0
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      4267
dtype: int64

In [4]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns).
data = data.dropna()
data.shape

(108648, 11)

# create X and y

In [5]:
# Features: every column except the label. Target: the label column itself.
X = data.drop(columns=['SeriousDlqin2yrs'])
y = data.loc[:, 'SeriousDlqin2yrs']

In [6]:
# Mean of the target column; if the label is strictly 0/1 (as the sample rows
# suggest) this is the share of positive cases.
y.mean()

0.06742876076872101

# exercise 1
把数据切分成训练集和测试集

In [7]:
from sklearn.model_selection import train_test_split

# Hold out the last 30% of rows for testing. shuffle=False keeps the original
# row order, so the split is deterministic by itself; the former random_state
# argument was dropped because scikit-learn ignores it when shuffling is off.
# NOTE(review): the positive class is rare and the split is neither shuffled
# nor stratified, so the class balance of each part depends on file order.
# Consider shuffle=True with stratify=y if the rows are not already randomly
# ordered (this would change every number reported below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((76053, 10), (32595, 10), (76053,), (32595,))

# exercise 2 & 3
使用logistic regression/决策树/SVM/KNN...等sklearn分类算法进行分类

## Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Logistic-regression baseline. C is the inverse regularisation strength in
# scikit-learn, so C=1000.0 means the model is only weakly regularised.
lr = LogisticRegression(
    C=1000.0,
    max_iter=1000,
    random_state=0,
)
lr.fit(X_train, y_train)

In [9]:
# Hard class predictions on the held-out rows, followed by how many of them
# are non-zero.
y_pred_lr = lr.predict(X_test)
(y_pred_lr != 0).sum()

147

In [10]:
# Count of test rows where the logistic-regression prediction differs from the
# true label.
(y_pred_lr != y_test).sum()

2204

In [11]:
from sklearn.metrics import accuracy_score

# Share of test rows classified correctly, shown with two decimals.
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred_lr)))

Accuracy: 0.93


## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (at most 3 levels) using the entropy split criterion,
# with a fixed seed.
tree = DecisionTreeClassifier(
    max_depth=3,
    criterion='entropy',
    random_state=0,
)
tree.fit(X_train, y_train)

In [13]:
# Predict the held-out rows with the tree and count the misclassified ones.
y_pred_tree = tree.predict(X_test)
y_test.ne(y_pred_tree).sum()

2187

In [14]:
# Decision-tree accuracy on the test rows, shown with two decimals.
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred_tree)))

Accuracy: 0.93


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-criterion trees, fitted with 2 parallel jobs and
# a fixed seed.
forest = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=1,
    n_jobs=2,
)
forest.fit(X_train, y_train)

In [16]:
# Predict the held-out rows with the forest and count the misclassified ones.
y_pred_forest = forest.predict(X_test)
y_test.ne(y_pred_forest).sum()

2230

In [17]:
# Random-forest accuracy on the test rows, shown with two decimals.
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred_forest)))

Accuracy: 0.93


## SVM

In [18]:
# NOTE(review): the SVM experiment below is disabled and no visible later cell
# uses `svm`. The reason is not recorded — presumably the fit time of a
# linear-kernel SVC on the full training set; confirm, then either restore the
# cell or delete it.
# from sklearn.svm import SVC

# svm = SVC(kernel='linear', C=1.0, random_state=0)
# svm.fit(X_train, y_train)

## KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier; the Minkowski metric with p=2 is the
# Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(X_train, y_train)

In [20]:
# Predict the held-out rows with KNN and count the misclassified ones.
y_pred_knn = knn.predict(X_test)
y_test.ne(y_pred_knn).sum()

2263

In [21]:
# KNN accuracy on the test rows, shown with two decimals.
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred_knn)))

Accuracy: 0.93


# exercise 4

In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the KNN predictions (rows: true class, columns:
# predicted class).
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

In [23]:
# Pull the four cells out of the 2x2 matrix: the diagonal holds the correct
# predictions, the off-diagonal cells the two kinds of error.
TP, TN = confusion[1, 1], confusion[0, 0]
FP, FN = confusion[0, 1], confusion[1, 0]
for label, count in (("TP:", TP), ("TN:", TN), ("FP:", FP), ("FN:", FN)):
    print(label, count)

TP: 42
TN: 30290
FP: 68
FN: 2195


In [24]:
# Accuracy = correctly classified rows / all rows.
print('accuracy:', (TN + TP) / (TN + FP + FN + TP))

accuracy: 0.9305721736462648


In [25]:
# Recall = true positives / all actual positives.
print('recall:', TP / (FN + TP))

recall: 0.018775145283862316


# exercise 5

In [39]:
# Class-membership probabilities from the fitted logistic regression, one row
# per test sample and one column per class.
# NOTE(review): column order presumably follows lr.classes_ — confirm.
lr.predict_proba(X_test)

array([[0.88121479, 0.11878521],
       [0.96366915, 0.03633085],
       [0.94710752, 0.05289248],
       ...,
       [0.90858472, 0.09141528],
       [0.9271599 , 0.0728401 ],
       [0.96688444, 0.03311556]])

In [40]:
# Keep only the second probability column, i.e. the score for the second class
# (presumably class 1; verify against lr.classes_).
y_pred_prob = lr.predict_proba(X_test)[:, 1]

In [44]:
from sklearn.preprocessing import binarize
# Turn probabilities into hard labels: values strictly above the threshold
# become 1, everything else 0. binarize works on 2-D input, hence the reshape
# to a single column and the [:, 0] that flattens the result again.
# NOTE(review): 0.7 is stricter than the 0.5 cut-off behind lr.predict, so it
# can only remove positive predictions and recall cannot improve. If the aim of
# this exercise is to catch more positives, the threshold should be lowered
# instead — confirm intent.
y_pred_class = binarize(y_pred_prob.reshape(-1, 1), threshold=0.7)[:,0]

In [45]:
# Element-wise agreement between the thresholded predictions and the true
# labels (True where the prediction is correct).
y_pred_class == y_test

79082     False
79084      True
79085      True
79086      True
79087      True
          ...  
112910     True
112911     True
112912     True
112913     True
112914     True
Name: SeriousDlqin2yrs, Length: 32595, dtype: bool

In [46]:
# Re-evaluate using the thresholded logistic-regression labels.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_class)

# Diagonal cells are correct predictions, off-diagonal cells are the errors.
TP, TN = confusion[1, 1], confusion[0, 0]
FP, FN = confusion[0, 1], confusion[1, 0]
for label, count in (("TP:", TP), ("TN:", TN), ("FP:", FP), ("FN:", FN)):
    print(label, count)

# Accuracy = correct / all rows; recall = true positives / actual positives.
print('accuracy:', (TN + TP) / (TN + FP + FN + TP))
print('recall:', TP / (FN + TP))

TP: 37
TN: 30336
FP: 22
FN: 2200
accuracy: 0.9318300352814849
recall: 0.016540008940545373
