In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score

In [4]:
data=pd.read_csv('Financial Distress.csv')

In [5]:
data.head()

Unnamed: 0,Company,Time,x1,x2,x3,x4,x5,x6,x7,x8,...,x75,x76,x77,x78,x79,x80,x81,x82,x83,Financial Distress
0,1,1,1.281,0.022934,0.87454,1.2164,0.06094,0.18827,0.5251,0.018854,...,27.07,26.102,16.0,16.0,0.2,22,0.06039,30,49,0
1,1,2,1.27,0.006454,0.82067,1.0049,-0.01408,0.18104,0.62288,0.006423,...,31.31,30.194,17.0,16.0,0.4,22,0.010636,31,50,0
2,1,3,1.0529,-0.059379,0.92242,0.72926,0.020476,0.044865,0.43292,-0.081423,...,36.07,35.273,17.0,15.0,-0.2,22,-0.45597,32,51,0
3,1,4,1.1131,-0.015229,0.85888,0.80974,0.076037,0.091033,0.67546,-0.018807,...,39.8,38.377,17.167,16.0,5.6,22,-0.32539,33,52,1
4,2,1,1.0623,0.10702,0.8146,0.83593,0.19996,0.0478,0.742,0.12803,...,27.07,26.102,16.0,16.0,0.2,29,1.251,7,27,0


In [5]:
data.columns

Index(['Company', 'Time', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9',
       'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
       'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29',
       'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39',
       'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49',
       'x50', 'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59',
       'x60', 'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69',
       'x70', 'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79',
       'x80', 'x81', 'x82', 'x83', 'Financial Distress'],
      dtype='object')

In [6]:
data['Financial Distress'].value_counts()

0    3536
1     136
Name: Financial Distress, dtype: int64

In [7]:
X, y = data.loc[:,data.columns!='Financial Distress'], data.loc[:,'Financial Distress'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=123,stratify=data['Financial Distress'])

In [8]:
# We have used stratified above to split the data distribution in equal manner
print(pd.value_counts(y_train)/y_train.size * 100)
print(pd.value_counts(y_test)/y_test.size * 100)

0    96.303502
1     3.696498
dtype: float64
0    96.279492
1     3.720508
dtype: float64


In [10]:
# Importing KNN module from PyOD
from pyod.models.knn import KNN

In [11]:
# Train kNN detector
clf = KNN(contamination=0.02, n_neighbors=5)
clf.fit(X_train)

KNN(algorithm='auto', contamination=0.02, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [12]:
# Get the prediction labels of the training data
y_train_pred = clf.labels_ 

# Outlier scores
y_train_scores = clf.decision_scores_  

In [13]:
from pyod.utils import evaluate_print

# Evaluate on the training data
evaluate_print('KNN', y_train, y_train_scores)

KNN ROC:0.4627, precision @ rank n:0.0211


In [14]:
unique, counts = np.unique(y_train_pred, return_counts=True)

In [15]:
unique,counts

(array([0, 1]), array([2518,   52]))

Accuracy is not the best metric to use when evaluating imbalanced datasets as it can be misleading. Metrics that can provide better insight include:

    Confusion Matrix: a talbe showing correct predictions and types of incorrect predictions.
    Precision: the number of true positives divided by all positive predictions. Precision is also called Positive Predictive Value. It is a measure of a classifier's exactness. Low precision indicates a high number of false positives.
    Recall: the number of true positives divided by the number of positive values in the test data. Recall is also called Sensitivity or the True Positive Rate. It is a measure of a classifier's completeness. Low recall indicates a high number of false negatives.
    F1: Score: the weighted average of precision and recall.

Since our main objective with the dataset is to prioritize accuraltely classifying financial instability cases the recall score can be considered our main metric to use for evaluating outcomes.

## Trying different algorithm

In [16]:
from sklearn.ensemble import RandomForestClassifier
# train model
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

# predict on test set
rfc_pred = rfc.predict(X_test)

accuracy_score(y_test, rfc_pred)

0.9637023593466425

In [17]:
# f1 score
f1_score(y_test, rfc_pred, average='weighted')


0.95452821664783

In [18]:
# confusion matrix
pd.DataFrame(confusion_matrix(y_test, rfc_pred))

Unnamed: 0,0,1
0,1055,6
1,34,7


In [19]:
# recall score
recall_score(y_test, rfc_pred)

0.17073170731707318

Oversampling Minority Class

Oversampling can be defined as adding more copies of the minority class. Oversampling can be a good choice when you don't have a ton of data to work with. A con to consider when undersampling is that it can cause overfitting and poor generalization to your test set.

We will use the resampling module from Scikit-Learn to randomly replicate samples from the minority class.
Important Note

Always split into test and train sets BEFORE trying any resampling techniques! Oversampling before splitting the data can allow the exact same observations to be present in both the test and train sets! This can allow our model to simply memorize specific data points and cause overfitting.

In [20]:
from sklearn.utils import resample


In [21]:
X['class']=y

In [22]:
X.columns

Index(['Company', 'Time', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9',
       'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
       'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29',
       'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39',
       'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49',
       'x50', 'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59',
       'x60', 'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69',
       'x70', 'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79',
       'x80', 'x81', 'x82', 'x83', 'class'],
      dtype='object')

In [23]:
# separate minority and majority classes
not_distress = X[X['class']==0]
distress = X[X['class']==1]

# upsample minority
fraud_upsampled = resample(distress,
                          replace=True, # sample with replacement
                          n_samples=len(not_distress), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_distress, fraud_upsampled])

# check new class counts
upsampled['class'].value_counts()

1    3536
0    3536
Name: class, dtype: int64

In [24]:
# trying logistic regression again with the balanced dataset
y_train = upsampled['class']
X_train = upsampled.drop('class', axis=1)

upsampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)

upsampled_pred = upsampled.predict(X_test)



In [25]:
# Checking accuracy
accuracy_score(y_test, upsampled_pred)

0.8112522686025408

In [26]:
# f1 score
f1_score(y_test, upsampled_pred, average='weighted')

0.8676836378769709

In [27]:
# confusion matrix
pd.DataFrame(confusion_matrix(y_test, upsampled_pred))

Unnamed: 0,0,1
0,864,197
1,11,30


In [28]:
recall_score(y_test, upsampled_pred)

0.7317073170731707

Undersampling Majority Class

Undersampling can be defined as removing some observations of the majority class. Undersampling can be a good choice when you have a ton of data -think millions of rows. But a drawback to undersampling is that we are removing information that may be valuable.

We will again use the resampling module from Scikit-Learn to randomly remove samples from the majority class.

In [29]:
# downsample majority
not_fraud_downsampled = resample(not_distress,
                                replace = False, # sample without replacement
                                n_samples = len(distress), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, distress])

# checking counts
downsampled['class'].value_counts()

1    136
0    136
Name: class, dtype: int64

In [30]:
y_train = downsampled['class']
X_train = downsampled.drop('class', axis=1)

undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)

undersampled_pred = undersampled.predict(X_test)

In [31]:
# Checking accuracy
accuracy_score(y_test, undersampled_pred)

0.7468239564428312

In [32]:
# f1 score
f1_score(y_test, undersampled_pred, average='weighted')

0.8254815387502177

In [33]:
# confusion matrix
pd.DataFrame(confusion_matrix(y_test, undersampled_pred))

Unnamed: 0,0,1
0,785,276
1,3,38


In [34]:
recall_score(y_test, undersampled_pred)

0.926829268292683

Generate Synthetic Samples

SMOTE or Synthetic Minority Oversampling Technique is a popular algorithm to creates sythetic observations of the minority class.

In [35]:
from imblearn.over_sampling import SMOTE

# Separate input features and target
#y = data['Financial Distress']
#X = data.drop('Financial Distress', axis=1)

# setting up testing and training sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27, ratio=1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [36]:
smote = LogisticRegression(solver='liblinear').fit(X_train, y_train)

smote_pred = smote.predict(X_test)

# Checking accuracy
accuracy_score(y_test, smote_pred)


0.7468239564428312

In [37]:
# f1 score
f1_score(y_test, smote_pred, average='weighted')

0.8254815387502177

In [38]:
# confustion matrix
pd.DataFrame(confusion_matrix(y_test, smote_pred))

Unnamed: 0,0,1
0,785,276
1,3,38


In [39]:
recall_score(y_test, smote_pred)

0.926829268292683

### Using class weights

In [40]:
from sklearn.utils import class_weight

In [44]:
#In order to calculate the class weight do the following
#class_weights = class_weight.compute_class_weight('imbalanced',
                                                # np.unique(y_train),
                                                # y_train)
#class_weights={0:1,1:200} even can assign manual weights

In [49]:
from collections import Counter

def get_class_weights(y):
    counter = Counter(y)
    majority = max(counter.values())
    return  {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}


class_weights = get_class_weights(data['Financial Distress'].values)
print(class_weights)

{0: 1.0, 1: 26.0}


In [79]:
class_weights={0:1.0,1:26.0}

In [80]:
model = LogisticRegression(solver='liblinear',  class_weight=class_weights)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 1.0, 1: 26.0}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [81]:
y_pred=model.predict(X_test)

In [82]:
f1_score(y_test, y_pred, average='weighted')

0.23878764935780006

In [83]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,148,913
1,0,41


In [84]:
recall_score(y_test, y_pred)

1.0