In [1]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score

In [2]:
bankdata  = pd.read_csv("cleanbankdata.csv")
testdata = pd.read_csv("cleantestdata.csv")
data = pd.concat([bankdata,testdata]).reset_index(drop=True)

In [3]:
data

Unnamed: 0.1,Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,...,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,issue_date_month,issue_date_dayofweek,earliesCreditMon,earliesCreditYear
0,0,1040418,240418,31818.181820,3,11.466,1174.91,3,3,13,...,5.0,4.0,3,9927,0.000000,0.0,10,5,12,2001
1,1,1025197,225197,28000.000000,5,16.841,670.69,3,3,13,...,45.0,22.0,0,0,0.000000,0.0,6,5,4,1990
2,2,1009360,209360,17272.727270,3,8.900,603.32,1,3,3,...,28.0,19.0,0,0,0.000000,0.0,1,2,10,1991
3,3,1039708,239708,20000.000000,3,4.788,602.30,1,1,10,...,15.0,9.0,0,0,0.000000,0.0,7,2,6,2001
4,4,1027483,227483,15272.727270,3,12.790,470.31,3,3,2,...,15.0,4.0,0,0,0.000000,0.0,7,4,5,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14993,4995,1008856,208856,9454.545455,5,12.015,183.47,3,3,2,...,14.0,8.0,0,0,0.000000,,1,5,7,2001
14994,4996,1016651,216651,5500.000000,3,7.970,172.28,1,3,8,...,5.0,3.0,3,1564,0.000000,,5,0,4,2001
14995,4997,1024140,224140,30545.454550,3,8.900,889.09,1,0,8,...,20.0,14.0,2,5456,1510.892308,,12,6,10,1986
14996,4998,1014316,214316,4090.909091,3,6.030,152.18,1,3,10,...,10.0,10.0,3,223,41.169231,,9,5,3,1999


In [4]:
train = data[data["isDefault"].notna()]
test = data[data["isDefault"].isna()]
y = train["isDefault"]

In [6]:
features = [f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault']]

In [7]:
ordata_preds = np.zeros(train.shape[0])
preds = np.zeros(test.shape[0])

In [8]:
kfold = StratifiedKFold(n_splits = 10,shuffle=True ,random_state=0)

In [15]:
for k, (train_index, valuation_index) in enumerate(kfold.split(train,y)):
    train_x, train_y = train[features].iloc[train_index], y.iloc[train_index]
    valuation_x, valuation_y = train[features].iloc[valuation_index], y.iloc[valuation_index]
    svm = SVC(random_state=0,gamma=0.10, C=10.0)    
    svm.fit(train_x,train_y)
    ordata_preds[valuation_index]  = svm.predict(valuation_x)
    print("Fold %2d AUC : %.6f" % (k + 1, roc_auc_score(valuation_y, ordata_preds[valuation_index])))
    preds  += svm.predict(test[features]) / kfold.n_splits
    del svm, train_x, train_y, valuation_x, valuation_y 
    gc.collect()
print(" Full AUC score %.6f" % roc_auc_score(y, ordata_preds)) 


Fold  1 AUC : 0.500000
Fold  2 AUC : 0.500000
Fold  3 AUC : 0.500000
Fold  4 AUC : 0.500000
Fold  5 AUC : 0.500000
Fold  6 AUC : 0.500000
Fold  7 AUC : 0.500000
Fold  8 AUC : 0.500000
Fold  9 AUC : 0.500000
Fold 10 AUC : 0.500000
 Full AUC score 0.500000


In [12]:
test["isDefault"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["isDefault"] = preds


In [13]:
results = test[["loan_id","isDefault"]]
results = results.rename(columns={"loan_id":"id"})
results

Unnamed: 0,id,isDefault
9998,1000575,0.0
9999,1028125,0.0
10000,1010694,0.0
10001,1026712,0.0
10002,1002895,0.0
...,...,...
14993,1008856,0.0
14994,1016651,0.0
14995,1024140,0.0
14996,1014316,0.0


^C
Note: you may need to restart the kernel to use updated packages.


In [14]:
results.to_csv("svmmodel.csv",index=False)