In [1]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score

In [2]:
bankdata  = pd.read_csv("cleanbankdata.csv")
testdata = pd.read_csv("cleantestdata.csv")
data = pd.concat([bankdata,testdata]).reset_index(drop=True)

In [3]:
train = data[data["isDefault"].notna()]
test = data[data["isDefault"].isna()]
y = train["isDefault"]

In [21]:
features = [f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault']]
ordata_preds = np.zeros(train.shape[0])
preds = np.zeros(test.shape[0])

In [22]:
forest = RandomForestClassifier(criterion='entropy',n_estimators=100,max_features="sqrt",random_state=1,n_jobs=2)
forest.fit(train[features],y)
preds = forest.predict(test[features])

In [18]:
kfold = StratifiedKFold(n_splits = 5,shuffle=True ,random_state=0)

In [19]:
for k, (train_index, valuation_index) in enumerate(kfold.split(train,y)):
    train_x, train_y = train[features].iloc[train_index], y.iloc[train_index]
    valuation_x, valuation_y = train[features].iloc[valuation_index], y.iloc[valuation_index]
    forest = RandomForestClassifier(criterion='entropy',n_estimators=200,max_features="sqrt",random_state=1,n_jobs=2)
    forest.fit(train_x,train_y)
    ordata_preds[valuation_index]  = forest.predict(valuation_x)
    print("Fold %2d AUC : %.6f" % (k + 1, roc_auc_score(valuation_y, ordata_preds[valuation_index])))
    preds  += forest.predict(test[features]) / kfold.n_splits
    del forest, train_x, train_y, valuation_x, valuation_y 
    gc.collect()
print(" Full AUC score %.6f" % roc_auc_score(y, ordata_preds)) 

Fold  1 AUC : 0.690920
Fold  2 AUC : 0.684309
Fold  3 AUC : 0.678708
Fold  4 AUC : 0.715561
Fold  5 AUC : 0.724204
 Full AUC score 0.698727


In [23]:
test["isDefault"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["isDefault"] = preds


In [24]:
results = test[["loan_id","isDefault"]]
results = results.rename(columns={"loan_id":"id"})
results

Unnamed: 0,id,isDefault
9998,1000575,0.0
9999,1028125,0.0
10000,1010694,0.0
10001,1026712,0.0
10002,1002895,0.0
...,...,...
14993,1008856,0.0
14994,1016651,0.0
14995,1024140,0.0
14996,1014316,0.0


In [25]:
results.to_csv("randomforest100.csv",index=False)