# Give me some credit

In [None]:
import numpy as np
import pandas as pd
import sklearn
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import re as re
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler


from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


%matplotlib inline

## Read in data

In [None]:
# Load the training and test CSVs from the local data/ directory.
# NOTE(review): read_csv is called without index_col, so if the files carry a
# leading unnamed id column it is loaded as an ordinary column — confirm.
train_df = pd.read_csv('data/cs-training.csv')
test_df = pd.read_csv('data/cs-test.csv')

## Check data

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

In [None]:
# Summary statistics for the training frame's columns.
train_df.describe()

In [None]:
# Column dtypes and non-null counts (shows which columns have missing values).
train_df.info()

In [None]:
# Class balance of the target: number of rows per SeriousDlqin2yrs value.
plt.figure()
# The column is passed through the `x` keyword: recent seaborn releases take
# `data` as the first positional parameter, so a positional column name fails.
sns.countplot(x='SeriousDlqin2yrs', data=train_df)

## Clean data

In [None]:
# Impute the training frame.
# Rows with age == 0 get the median age (computed before the assignment, so
# the zero rows still take part in that median).
train_df.loc[train_df['age'] == 0, 'age'] = train_df['age'].median()
# Missing incomes -> column mean; missing dependents -> column median.
# The results are assigned back instead of using fillna(inplace=True) on a
# selected column, which is chained assignment and is not guaranteed to
# modify train_df.
train_df['MonthlyIncome'] = train_df['MonthlyIncome'].fillna(train_df['MonthlyIncome'].mean())
train_df['NumberOfDependents'] = train_df['NumberOfDependents'].fillna(train_df['NumberOfDependents'].median())
train_df.info()

In [None]:
# Impute the test frame with the same recipe as the training frame.
# The age == 0 mask must come from test_df itself: a mask built from train_df
# is aligned on the training index and selects the wrong rows (or fails to
# align) when applied to test_df.
test_df.loc[test_df['age'] == 0, 'age'] = test_df['age'].median()
# Missing incomes -> column mean; missing dependents -> column median.
# Assigned back rather than fillna(inplace=True) on a selected column, which
# is chained assignment and is not guaranteed to modify test_df.
test_df['MonthlyIncome'] = test_df['MonthlyIncome'].fillna(test_df['MonthlyIncome'].mean())
test_df['NumberOfDependents'] = test_df['NumberOfDependents'].fillna(test_df['NumberOfDependents'].median())
test_df.info()

In [None]:
# Annotated heatmap of the pairwise correlations between training columns.
plt.figure(figsize=(19, 15))
corr = train_df.corr()
sns.heatmap(data=corr, annot=True, fmt='.2g')

## Data analysis

In [None]:
# Separate the target column from the predictors for both frames:
#   X / y -> training predictors / target
#   W / z -> test predictors / target column of the test file
y = train_df['SeriousDlqin2yrs']
X = train_df.drop(columns=['SeriousDlqin2yrs'])
z = test_df['SeriousDlqin2yrs']
W = test_df.drop(columns=['SeriousDlqin2yrs'])

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure, with the chance diagonal.

    Parameters
    ----------
    fpr, tpr : array-like
        False and true positive rates, plotted as x and y respectively.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given; without one the plot has no legend.
    """
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--")  # dashed diagonal as a reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        # Without this call the label handed to plt.plot is never rendered.
        plt.legend(loc="lower right")

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing a ``feature_importances_`` sequence.
    feature_names : sequence of str, optional
        Tick labels, one per importance value. Defaults to the columns of the
        notebook-level feature frame ``X``, so existing calls are unchanged.
    """
    importances = model.feature_importances_
    if feature_names is None:
        feature_names = X.columns
    # Size the chart from the model itself rather than from the global X.
    n_features = len(importances)

    plt.figure(figsize=(10,8))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.ylim(-1, n_features)

### Logistic Regression

In [None]:
# Logistic Regression (L1 penalty, balanced class weights) on standardised features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    class_weight='balanced',
    max_iter=500,
    random_state=111,
)
lr.fit(X_train_scaled, y_train)

# Scores are the second predict_proba column, evaluated on the training split.
lr_scores_proba = lr.predict_proba(X_train_scaled)
lr_scores = lr_scores_proba[:, 1]

# ROC curve and AUC below are therefore training-set figures.
fpr_lr, tpr_lr, thresh_lr = roc_curve(y_train, lr_scores)
plot_roc_curve(fpr_lr, tpr_lr)
print('AUC Score : ', roc_auc_score(y_train, lr_scores))

### Random Forest Classifier

In [None]:
# Balance the classes by random undersampling, then fit a random forest on a
# split of the resampled data.
print ('Original dataset shape :', Counter(y))
rus = RandomUnderSampler(random_state=111)
# fit_resample is the supported sampler API; the old fit_sample alias has been
# removed from imbalanced-learn.
X_resampled, y_resampled = rus.fit_resample(X, y)
print ('Resampled dataset shape:', Counter(y_resampled))
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_resampled, y_resampled, random_state=111)


forest = RandomForestClassifier(n_estimators=300, random_state=111, max_depth=5, class_weight='balanced')
forest.fit(X_train_rus, y_train_rus)
# Scores are the second predict_proba column, evaluated on the training split.
y_scores_prob = forest.predict_proba(X_train_rus)
y_scores = y_scores_prob[:, 1]
fpr, tpr, thresh = roc_curve(y_train_rus, y_scores)
plot_roc_curve(fpr, tpr)
print ('AUC score:', roc_auc_score(y_train_rus, y_scores))

In [None]:
# Evaluate the forest on the held-out part of the undersampled data.
y_test_proba = forest.predict_proba(X_test_rus)
# Second predict_proba column is used as the score.
y_scores_test = y_test_proba[:, 1]
fpr_test, tpr_test, thresh_test = roc_curve(y_test_rus, y_scores_test)
plot_roc_curve(fpr_test, tpr_test)
print ('AUC Score:', roc_auc_score(y_test_rus, y_scores_test))

In [None]:
# Bar chart of the fitted random forest's feature importances.
plot_feature_importances(forest)

### Gradient Boosting Classifier (GBC)

In [None]:
# Gradient boosting (300 trees, depth 8) fitted on the unscaled training split.
gbc_clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=8, random_state=112)
gbc_clf.fit(X_train, y_train)
# Scores are the second predict_proba column, evaluated on the training split,
# so the ROC curve and AUC below are training-set figures.
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
fpr_gbc, tpr_gbc, thres_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print ('AUC Score:', roc_auc_score(y_train, gbc_clf_scores))

In [None]:
# AUC of gbc_clf on the held-out split (X_test / y_test).
gbc_val_proba = gbc_clf.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]
print ('AUC score:', roc_auc_score(y_test, gbc_val_scores))

In [None]:
# Shallower gradient boosting model (200 trees, depth 4) used for the submission.
gbc_clf_submission = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
gbc_clf_submission.fit(X_train,y_train)
# Second predict_proba column is used as the score on both splits.
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
gbc_val_proba = gbc_clf_submission.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
# First line: training-split AUC; second line: held-out-split AUC.
print ('AUC Score :', roc_auc_score(y_train, gbc_clf_scores))
print ('AUC Score :', roc_auc_score(y_test, gbc_val_scores))
# Plot the importances of the model trained in this cell, not of the earlier
# gbc_clf estimator.
plot_feature_importances(gbc_clf_submission)

## Data Output

In [None]:
# Sanity-check the submission features: count missing / infinite values per column.
temp = W.replace([np.inf, -np.inf], np.nan, inplace=False)
# Printed explicitly: only the last expression of a notebook cell is displayed,
# so a bare `temp.isna().sum()` on this line would be silently discarded.
print(temp.isna().sum())
# The scaled array is only displayed, not stored; W itself stays unscaled.
scaler.transform(W)

In [None]:
# Score the test features with the submission model and write submission.csv.
submission_proba = gbc_clf_submission.predict_proba(W)
submission_scores = submission_proba[:, 1]
# 1-based row ids sized from the predictions, so the frame always lines up
# instead of relying on a hard-coded row count.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)

In [None]:
# AdaBoost over depth-4 decision trees, fitted on the unscaled training split.
# NOTE: this rebinds the name gbc_clf, which no longer refers to a gradient
# boosting model after this cell runs.
from sklearn.tree import DecisionTreeClassifier
base_estimator = DecisionTreeClassifier(max_depth=4, random_state=0)
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, but it is the first
# positional parameter in both old and new releases.
gbc_clf = sklearn.ensemble.AdaBoostClassifier(base_estimator, n_estimators=32,
                         random_state=0, learning_rate=0.1)
gbc_clf.fit(X_train, y_train)
# Training-split scores (second predict_proba column), ROC curve and AUC.
gbc_clf_scores = gbc_clf.predict_proba(X_train)[:, 1]
fpr_gbc, tpr_gbc, thres_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print ('AUC Score:', roc_auc_score(y_train, gbc_clf_scores))

In [None]:
# AUC on the held-out split for whichever estimator gbc_clf currently names.
gbc_val_proba = gbc_clf.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]
print ('AUC score:', roc_auc_score(y_test, gbc_val_scores))

In [None]:
# Score the test features with the estimator currently bound to gbc_clf.
# NOTE: this writes to the same 'submission.csv' path as the gradient boosting
# submission cell, so running both leaves only the later file.
submission_proba = gbc_clf.predict_proba(W)
submission_scores = submission_proba[:, 1]
# 1-based row ids sized from the predictions, so the frame always lines up
# instead of relying on a hard-coded row count.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)

In [None]:
# Export the estimator bound to gbc_clf as JavaScript source via sklearn-porter.
from sklearn_porter import Porter

porter = Porter(gbc_clf, language='js')
# embed_data=True is forwarded to the exporter.
# NOTE(review): presumably this inlines the model parameters in the generated
# source — confirm against the sklearn-porter docs.
output = porter.export(embed_data=True)

print(output)

# Persist the generated source next to the notebook.
with open('AdaBoostClassifier.js', 'w') as f:
    f.write(output)