In [None]:
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from IPython.display import display
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Default figure size for every plot in the notebook.
matplotlib.rc('figure', figsize=[10,5])
import matplotlib.pyplot as plt
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was later removed, so use whichever name this install ships.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use([seaborn_style])
# A forward slash resolves on every OS; the previous "~\\..." form only worked
# on Windows because the backslash is not a path separator elsewhere.
bank_df = pd.read_csv("~/banking_data.csv")

def plot_learning_curve(train_scores, val_scores, indices, title):
    """Draw train and validation scores against training-set size.

    Both series are plotted on the current pyplot figure: the training scores
    in green and the validation scores in blue, each as a '+'-marked line.

    Args:
        train_scores: Scores measured on the training portion, one per size.
        val_scores: Scores measured on the validation portion, one per size.
        indices: X-axis values (number of training instances used).
        title: Text placed above the plot.
    """
    curves = (
        (train_scores, "g-+", "train"),
        (val_scores, "b-+", "validation"),
    )
    for scores, line_format, legend_label in curves:
        plt.plot(indices, scores, line_format, linewidth=2, label=legend_label)

    plt.title(title)
    plt.legend()
    plt.xlabel('Number of Training Instances Used')
    # Faint black grid lines to make values easier to read off.
    grid_style = dict(color='black', linestyle='-', linewidth=0.5, alpha=0.3)
    plt.grid(**grid_style)
        
def plot_complexity(train_scores, val_scores, indices, title, inverse_x=True):
    """Plot train and validation scores against a hyper-parameter value.

    Both series are drawn on the current pyplot figure: training scores in
    green and validation scores in blue, each as a '+'-marked line.

    Args:
        train_scores: Scores measured on the training data, one per value.
        val_scores: Scores measured on the validation data, one per value.
        indices: X-axis values (the hyper-parameter settings that were tried).
        title: Text placed above the plot.
        inverse_x: When True (the default, which keeps the behaviour this
            function had before the parameter existed) the x-axis is
            reversed. Pass False to keep the natural left-to-right order.
    """
    plt.plot(indices, train_scores, "g-+", linewidth=2, label="train")
    plt.plot(indices, val_scores, "b-+", linewidth=2, label="validation")
    plt.title(title)
    plt.legend()
    plt.grid(color='black', linestyle='-', linewidth=0.5, alpha=0.3)
    if inverse_x:
        plt.gca().invert_xaxis()
    

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone    

def stratFold(train_X_all, train_y_all, classifier, metric, average=None,
              fold=5, step=100):
    """Compute learning-curve scores with stratified k-fold cross-validation.

    For every training-set size ``m`` in ``range(step, len(train_X_all), step)``
    the first ``m`` rows are split into ``fold`` stratified folds. A fresh
    clone of ``classifier`` is fitted on each training portion and scored with
    ``metric`` on that portion and on the held-out fold; the per-fold scores
    are averaged to give one train and one validation value per size.

    Args:
        train_X_all: Feature table supporting ``len`` and ``.iloc`` (e.g. a
            pandas DataFrame).
        train_y_all: Labels aligned row-for-row with ``train_X_all`` (a pandas
            Series or single-column DataFrame); flattened to 1-D internally.
        classifier: Estimator exposing ``fit``/``predict``. It is cloned for
            every fit, so the object passed in is never refitted.
        metric: Callable ``metric(y_true, y_pred, **kwargs)`` returning a
            number.
        average: Forwarded to ``metric`` as ``average=`` only when truthy,
            because not every metric accepts that keyword.
        fold: Number of stratified folds.
        step: Increment between successive training-set sizes (also the
            smallest size tried).

    Returns:
        Tuple ``(train_metric, val_metric, indices)`` of equal-length lists:
        mean training score, mean validation score and the size ``m`` used.

    Raises:
        ValueError: If ``step`` is not a positive number.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    metric_kwargs = {"average": average} if average else {}
    # Built once: the splitter does not depend on the subset size.
    skfolds = StratifiedKFold(n_splits=fold)

    train_metric, val_metric, indices = [], [], []
    for m in range(step, len(train_X_all), step):
        train_X = train_X_all.iloc[:m]
        train_y = train_y_all.iloc[:m]

        fold_train_scores, fold_val_scores = [], []
        for train_index, test_index in skfolds.split(train_X, train_y):
            # Use plain arrays for both fit and predict so the estimator never
            # sees a mix of named (DataFrame) and unnamed (ndarray) features.
            X_fit = train_X.iloc[train_index].values
            y_fit = train_y.iloc[train_index].values.ravel()
            X_val = train_X.iloc[test_index].values
            y_val = train_y.iloc[test_index].values.ravel()

            # Fit a copy so the caller's estimator is left untouched.
            model = clone(classifier)
            model.fit(X_fit, y_fit)

            fold_val_scores.append(
                metric(y_val, model.predict(X_val), **metric_kwargs))
            fold_train_scores.append(
                metric(y_fit, model.predict(X_fit), **metric_kwargs))

        val_metric.append(sum(fold_val_scores) / len(fold_val_scores))
        train_metric.append(sum(fold_train_scores) / len(fold_train_scores))
        indices.append(m)

    return train_metric, val_metric, indices

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit

TARGET_COL = "Bankrupt"

x = bank_df.values # convert to numpy array
print(x.shape)

# Choose the train/test rows BEFORE scaling so the scaler's min/max are
# learned from training rows only. Fitting it on every row would leak
# information about the held-out set into the features.
data_split = StratifiedShuffleSplit(n_splits=1 , test_size=0.3, random_state=30)
train_ind, test_ind = next(data_split.split(bank_df, bank_df[TARGET_COL]))

feature_cols = bank_df.columns.drop(TARGET_COL)
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(bank_df.iloc[train_ind][feature_cols].values)

# Rebuild the frame in its original column order: features scaled with the
# train-fitted scaler, target left untouched (class labels are not features).
scaled_df = pd.DataFrame(
    min_max_scaler.transform(bank_df[feature_cols].values),
    columns=feature_cols,
)
scaled_df[TARGET_COL] = bank_df[TARGET_COL].values
bank_df = scaled_df[list(bank_df.columns)]

x_scaled = bank_df.values
print(x_scaled.shape)

# The split yields positional row numbers, hence .iloc.
strat_train_set = bank_df.iloc[train_ind]
strat_test_set = bank_df.iloc[test_ind]

train_set = strat_train_set
test_set = strat_test_set

# Labels are kept as single-column DataFrames; callers flatten when needed.
train_y = train_set[[TARGET_COL]]
train_X = train_set.drop(TARGET_COL, axis=1)
test_y = test_set[[TARGET_COL]]
test_X = test_set.drop(TARGET_COL, axis=1)

In [None]:
from sklearn.svm import SVC

# Baseline SVM: only gamma and class weighting are set explicitly, every
# other hyper-parameter is left at the library default.
bank_svm_classifier = SVC(class_weight='balanced', gamma='auto')
# The single-column label frame is flattened to 1-D before fitting.
bank_svm_classifier.fit(train_X, train_y.to_numpy().ravel())

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# In-sample check: score the fitted SVM on the rows it was trained on.
bank_train_predictions = bank_svm_classifier.predict(train_X)
train_report = classification_report(train_y, bank_train_predictions)
# Accuracy is stored but not displayed by this cell.
accuracy = accuracy_score(train_y, bank_train_predictions)
print(train_report)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated macro-F1 of the baseline SVM on the training split.
cv_settings = dict(scoring="f1_macro", cv=10)
scores = cross_val_score(
    bank_svm_classifier,
    train_X,
    train_y.values.ravel(),
    **cv_settings,
)
# Mean over the folds is the cell's displayed result.
scores.mean()

In [None]:
# perf_counter is a monotonic clock meant for measuring elapsed time; unlike
# time.time() it cannot jump if the system clock is adjusted mid-run.
start = time.perf_counter()


from sklearn.metrics import f1_score
# stratFold returns (train scores, validation scores, training-set sizes).
f1_scores = stratFold(train_X, train_y, bank_svm_classifier, f1_score, average="macro")
plot_learning_curve(*f1_scores,
                    title="F1 MACRO score learning curve ")

elapsed = (time.perf_counter() - start)

# Elapsed seconds for the whole learning-curve computation and plot.
print(elapsed)

In [None]:
from sklearn.metrics import roc_curve, precision_score, accuracy_score, average_precision_score, recall_score, f1_score, classification_report, confusion_matrix
# Refit on the whole training split. Labels are flattened to 1-D, as in every
# other fit in this notebook, instead of passing a single-column DataFrame
# (which scikit-learn accepts only with a DataConversionWarning).
bank_svm_classifier.fit(train_X, train_y.values.ravel())
test_predictions = bank_svm_classifier.predict(test_X)
# zero_division=1 is the documented numeric spelling of the previous True.
print(classification_report(test_y, test_predictions,digits=4,zero_division=1))
print(confusion_matrix(test_y, test_predictions))

In [None]:
from sklearn.model_selection import validation_curve
# Linear-kernel SVM used as the template for the sweep; the C given here is
# overridden by each value in param_range.
bank_svc = SVC(C=4.5, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0016025641025641025,
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# 100 evenly spaced values of the regularisation parameter C in [1, 500].
param_range = np.linspace(1, 500, 100)

# 5-fold cross-validated macro-F1 for every C; rows = C values, columns = folds.
train_scores, test_scores = validation_curve(
    bank_svc, train_X, train_y.values.ravel(), param_name="C", 
    param_range=param_range, scoring='f1_macro', verbose=1, cv=5, n_jobs=-1
)

print(train_scores.mean(axis=1), test_scores.mean(axis=1), param_range)
# The title now names what is actually plotted (macro-F1 against C); the old
# text referred to ROC and ccp_alpha, neither of which appears in this cell.
plot_complexity(train_scores.mean(axis=1), test_scores.mean(axis=1), param_range, 
                title='F1 macro complexity curve on C (linear kernel SVM)', inverse_x=False)

In [None]:
# perf_counter is a monotonic clock meant for measuring elapsed time; unlike
# time.time() it cannot jump if the system clock is adjusted mid-run.
start = time.perf_counter()

# Linear-kernel SVM with C=105.
best_linear_svc = SVC(C=105, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0016025641025641025,
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# stratFold returns (train scores, validation scores, training-set sizes).
f1_scores = stratFold(train_X, train_y, best_linear_svc, f1_score, 
                      average='macro')

plot_learning_curve(*f1_scores,
                    title="F1 MACRO score with linear kernel SVM")

elapsed = (time.perf_counter() - start)

# Elapsed seconds for model construction, learning curve and plot.
print(elapsed)

In [None]:
from sklearn.metrics import roc_curve, precision_score, accuracy_score, average_precision_score, recall_score, f1_score, classification_report, confusion_matrix
# Fit on the whole training split. Labels are flattened to 1-D, as in every
# other fit in this notebook, instead of passing a single-column DataFrame
# (which scikit-learn accepts only with a DataConversionWarning).
best_linear_svc.fit(train_X, train_y.values.ravel())
test_predictions = best_linear_svc.predict(test_X)
# zero_division=1 is the documented numeric spelling of the previous True.
print(classification_report(test_y, test_predictions,digits=4,zero_division=1))
print(confusion_matrix(test_y, test_predictions))