In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, KFold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
import random
from sklearn.svm import SVC
import sklearn.metrics as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import fbeta_score

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
validation = pd.read_csv("data/validation.csv")

In [3]:
columns_tree = pd.read_csv("data/cols_Decission_tree.csv")

In [4]:
columns_tree = columns_tree["0"].to_list()

In [5]:
corr_matrix = train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

In [6]:
to_drop = to_drop + columns_tree

In [7]:
def clean(df):
    df = df.drop(columns=["Unnamed: 0"])
    df = df.drop(columns=to_drop)
    df = df.rename(columns=lambda x: x.lstrip(" "))
    df = df.rename(columns=lambda x: x.lower())
    df = df.rename(columns=lambda x: x.replace(" ", "_"))
    return df

In [8]:
train = clean(train)
test = clean(test)
validation = clean(validation)

In [9]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
train.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
roa(c)_before_interest_and_depreciation_before_interest,3426.0,0.5064349,0.06126178,0.066933,0.4769171,0.504485,0.5377078,0.9715302
operating_gross_margin,3426.0,0.6081864,0.01838928,0.0,0.6006284,0.6062353,0.6142493,1.0
persistent_eps_in_the_last_four_seasons,3426.0,0.2292676,0.03113928,0.078567,0.2149003,0.2252529,0.2397655,0.6621915
cash_flow_per_share,3426.0,0.323834,0.0185392,0.169449,0.3178185,0.3226993,0.3289595,1.0
realized_sales_gross_profit_growth_rate,3426.0,0.02255897,0.0168836,0.009889,0.02206536,0.02210285,0.02215335,1.0
operating_profit_growth_rate,3426.0,0.847779,0.01479187,0.0,0.8479853,0.8480424,0.8481219,0.8877039
after-tax_net_profit_growth_rate,3426.0,0.6891099,0.01424621,0.0,0.6892779,0.6894409,0.6896457,0.837205
total_asset_growth_rate,3426.0,5560759000.0,2876862000.0,0.0001,4990000000.0,6430000000.0,7420000000.0,9990000000.0
cash_reinvestment_%,3426.0,0.3797999,0.02027012,0.0,0.374837,0.3806676,0.3868312,0.7599475
current_ratio,3426.0,0.01544379,0.02725959,0.000193,0.007579072,0.01058655,0.01625502,1.0


In [10]:
pd.set_option('display.max_rows', 10)

In [11]:
train["bankrupt?"].value_counts()

0    3323
1     103
Name: bankrupt?, dtype: int64

In [12]:
y_train = train["bankrupt?"]
X_train = train.loc[:, train.columns != 'bankrupt?']
y_test = test["bankrupt?"]
X_test = test.loc[:, test.columns != 'bankrupt?']
y_val = train["bankrupt?"]
X_val = train.loc[:, train.columns != 'bankrupt?']

In [13]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors = 3, random_state = 42)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)
y_train_SMOTE.value_counts(normalize = True)

0    0.5
1    0.5
Name: bankrupt?, dtype: float64

In [14]:
sc = StandardScaler()

In [15]:
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

In [16]:
X_train_SMOTE = sc.fit_transform(X_train_SMOTE)

In [17]:
kernel = "linear"
x_data = [X_train, X_train_SMOTE]
y_data = [y_train, y_train_SMOTE]
x_predict = [X_val, X_test]
y_predict = [y_val, y_test]

In [18]:
def SVM(kernel,X,y,xp,yp):
    lsvclassifier = SVC(kernel=kernel)
    lsvclassifier.fit(X,y)
    accuracies = cross_val_score(estimator = lsvclassifier, X = X, y = y, cv = 5)
    mean_svm_linear=accuracies.mean()
    std_svm_linear=accuracies.std()
    
    print('After 5 fold cross validation:')
    print('Mean of Accuracies: ',mean_svm_linear*100,end='\n')
    print('Standard deviation of Accuracies',std_svm_linear*100,end='\n')
    
    y_pred = lsvclassifier.predict(xp)
    
    print('Test Output:')
    print('Confusion Matrix:')
    print(sk.confusion_matrix(yp,y_pred))
    print('Classification Report:')
    print(sk.classification_report(yp,y_pred))
    print('Accuracy: ',sk.accuracy_score(yp, y_pred, normalize=True, sample_weight=None))
    for i in [0.5, 1, 2]:
        print("beta = ", i ,fbeta_score(yp, y_pred, average = None , beta = i))

In [19]:
for i in range(len(x_data)):
    for m in range(len(x_predict)):
        SVM(kernel,x_data[i],y_data[i],x_predict[m], y_predict[m])
        print("|||||||||||||||||||||||||")
    print("-------------------")

After 5 fold cross validation:
Mean of Accuracies:  96.96439743780724
Standard deviation of Accuracies 0.19354498524236116
Test Output:
Confusion Matrix:
[[3322    1]
 [  99    4]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3323
           1       0.80      0.04      0.07       103

    accuracy                           0.97      3426
   macro avg       0.89      0.52      0.53      3426
weighted avg       0.97      0.97      0.96      3426

Accuracy:  0.9708114419147694
beta =  0.5 [0.97665667 0.16260163]
beta =  1 [0.985172   0.07407407]
beta =  2 [0.99383713 0.04796163]
|||||||||||||||||||||||||
After 5 fold cross validation:
Mean of Accuracies:  96.96439743780724
Standard deviation of Accuracies 0.19354498524236116
Test Output:
Confusion Matrix:
[[2170    0]
 [  79    2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2

In [20]:
from sklearn.utils import compute_class_weight

weights = compute_class_weight('balanced', classes=train['bankrupt?'].unique(), y=train['bankrupt?'])
weights

array([ 0.51549804, 16.63106796])

In [21]:
#playing around with the pruning to get the best boosting tree
# Applying Grid Search to find the best model and the best parameters
classifiersvc = SVC(random_state=1, class_weight = "balanced")
classifiersvc.fit(X_train_SMOTE, y_train_SMOTE)
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [1,100,300,1000]},
             {'degree': [2,3,4]},
             {'decision_function_shape': ["ovo", "ovr"]}] 
for item in parameters:
    grid_search = GridSearchCV(estimator = classifiersvc,
                               param_grid = item,
                               scoring = 'accuracy',
                               cv = 10,
                               n_jobs = -1)
    grid_search = grid_search.fit(X_train_SMOTE, y_train_SMOTE)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_

    print('Accuracy: ',best_accuracy,end='\n')
    print('Best Parameters: ',best_parameters,end='\n')

Accuracy:  0.9858558293323669
Best Parameters:  {'C': 1000}
Accuracy:  0.9470359634024821
Best Parameters:  {'degree': 2}
Accuracy:  0.9470359634024821
Best Parameters:  {'decision_function_shape': 'ovo'}


# Predictions on test

In [22]:
classifiersvc = SVC(random_state=1, class_weight = "balanced", C = 1000, degree = 2, decision_function_shape = "ovo")
classifiersvc.fit(X_train_SMOTE,y_train_SMOTE)
y_pred = classifiersvc.predict(X_test)
print(sk.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.83      0.89      2170
           1       0.09      0.48      0.16        81

    accuracy                           0.81      2251
   macro avg       0.54      0.65      0.53      2251
weighted avg       0.95      0.81      0.87      2251



In [23]:
classifiersvc = SVC(random_state=1, class_weight = "balanced")
classifiersvc.fit(X_train_SMOTE,y_train_SMOTE)
y_pred = classifiersvc.predict(X_test)
print(sk.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.61      0.75      2170
           1       0.07      0.81      0.13        81

    accuracy                           0.61      2251
   macro avg       0.53      0.71      0.44      2251
weighted avg       0.96      0.61      0.73      2251



In [24]:
for i in [0.5, 1, 2]:
    print("beta = ", i ,fbeta_score(y_test, y_pred, average = None , beta = i))

beta =  0.5 [0.87803576 0.08774262]
beta =  1 [0.7517852  0.13186813]
beta =  2 [0.657277   0.26527331]


In [25]:
print("beta = ", 0.001 ,fbeta_score(y_test, y_pred, average = None , beta = 0.001))

beta =  0.001 [0.98872965 0.0717392 ]


In [26]:
# TomekLink
from imblearn.under_sampling import TomekLinks

In [27]:
tl = TomekLinks(sampling_strategy='majority') # play around with sampling_strategy_
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [28]:
print(X_train.shape)
print(X_train_tl.shape) # created synthetic data

(3426, 37)
(3398, 37)


In [29]:
y_train_tl.value_counts()

0    3295
1     103
Name: bankrupt?, dtype: int64

In [30]:
#playing around with the pruning to get the best boosting tree
# Applying Grid Search to find the best model and the best parameters
from sklearn.ensemble import GradientBoostingClassifier
classifiergb = GradientBoostingClassifier(random_state=1)
classifiergb.fit(X_train, y_train)
from sklearn.model_selection import GridSearchCV

parameters = [{'n_estimators': [50,100,200]},
             {'learning_rate': [0.01,0.1,1]},
             {'subsample': [0.1,0.5,1]}] 
for item in parameters:
    grid_search = GridSearchCV(estimator = classifiergb,
                               param_grid = item,
                               scoring = 'accuracy',
                               cv = 10,
                               n_jobs = -1)
    grid_search = grid_search.fit(X_train, y_train)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_

    print('Accuracy: ',best_accuracy,end='\n')
    print('Best Parameters: ',best_parameters,end='\n')

Accuracy:  0.9664305321125944
Best Parameters:  {'n_estimators': 50}
Accuracy:  0.9693528037781528
Best Parameters:  {'learning_rate': 0.01}
Accuracy:  0.9649711012224438
Best Parameters:  {'subsample': 1}


In [31]:
classifiergb = GradientBoostingClassifier(random_state=1, n_estimators = 50, learning_rate = 0.01, subsample = 1)
classifiergb.fit(X_train_SMOTE, y_train_SMOTE)
y_predgb = classifiergb.predict(X_test)
print(sk.classification_report(y_test, y_predgb))

              precision    recall  f1-score   support

           0       0.99      0.43      0.60      2170
           1       0.06      0.94      0.11        81

    accuracy                           0.45      2251
   macro avg       0.53      0.68      0.35      2251
weighted avg       0.96      0.45      0.58      2251

