In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,  classification_report
import pandas as pd

In [2]:
train_data = pd.read_csv('train_data_updated_with_bankrupt.csv')

In [3]:
data = train_data.drop('Index', axis=1)

### Part 3.1

In [4]:
X = data.drop('cluster', axis=1)
y = data['cluster']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train the RandomForest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Metrics for cluster 1 companys:')
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_pred))


Metrics for cluster 1 companys:
Accuracy: 0.8890
Confusion Matrix:
[[ 42  10   2  17   4]
 [  0 508   4  26   4]
 [  0   6  96   0   1]
 [  1  27   0 329   0]
 [  0  22   4   1  58]]
              precision    recall  f1-score   support

           0       0.98      0.56      0.71        75
           1       0.89      0.94      0.91       542
           2       0.91      0.93      0.92       103
           3       0.88      0.92      0.90       357
           4       0.87      0.68      0.76        85

    accuracy                           0.89      1162
   macro avg       0.90      0.81      0.84      1162
weighted avg       0.89      0.89      0.89      1162



In [5]:
data_0 = data[data['cluster']==0]
data_1 = data[data['cluster']==1]

In [6]:
data_0 = data_0.drop('Unnamed: 0',axis = 1)

In [7]:
data_1 = data_1.drop('Unnamed: 0',axis = 1)

In [8]:
data_new = data.drop('Unnamed: 0',axis = 1)

In [9]:
data_0['Bankrupt?'].unique()

array([0, 1], dtype=int64)

### Part 3.2

### Cluster 0 Models

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score

X = data_0.drop(['cluster','Bankrupt?'],axis = 1)
y = data_0['Bankrupt?']
           
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Define the base learners
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define the meta learner
meta_learner = GaussianNB()

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = stacking_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Metrics for cluster 0 companys:')
print(f'Accuracy Score: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_pred))


for name, model in base_learners:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    recall = recall_score(y_test, y_pred)  # 'macro' average for binary classification
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print the model's performance metrics
    print(f'{name} Model Evaluation:')
    print(f'Recall Score: {recall:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print()

Metrics for cluster 0 companys:
Accuracy Score: 0.8992
Confusion Matrix:
[[104   6]
 [  7  12]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       110
           1       0.67      0.63      0.65        19

    accuracy                           0.90       129
   macro avg       0.80      0.79      0.79       129
weighted avg       0.90      0.90      0.90       129

rf Model Evaluation:
Recall Score: 0.3684
Confusion Matrix:
[[110   0]
 [ 12   7]]

knn Model Evaluation:
Recall Score: 0.3684
Confusion Matrix:
[[104   6]
 [ 12   7]]

svc Model Evaluation:
Recall Score: 0.0526
Confusion Matrix:
[[110   0]
 [ 18   1]]



### Cluster 1 Models

In [11]:

X = data_1.drop(['cluster','Bankrupt?'],axis = 1)
y = data_1['Bankrupt?']
           
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the base learners
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define the meta learner
meta_learner = GaussianNB()

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = stacking_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Metrics')
print(f'Accuracy Score: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_pred))


for name, model in base_learners:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    recall = recall_score(y_test, y_pred)  # 'macro' average for binary classification
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print the model's performance metrics
    print(f'{name} Model Evaluation:')
    print(f'Recall Score: {recall:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print()

Metrics
Accuracy Score: 0.0644
Confusion Matrix:
[[ 51 740]
 [  1   0]]
              precision    recall  f1-score   support

           0       0.98      0.06      0.12       791
           1       0.00      0.00      0.00         1

    accuracy                           0.06       792
   macro avg       0.49      0.03      0.06       792
weighted avg       0.98      0.06      0.12       792

rf Model Evaluation:
Recall Score: 0.0000
Confusion Matrix:
[[791   0]
 [  1   0]]

knn Model Evaluation:
Recall Score: 0.0000
Confusion Matrix:
[[791   0]
 [  1   0]]

svc Model Evaluation:
Recall Score: 0.0000
Confusion Matrix:
[[791   0]
 [  1   0]]



In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, confusion_matrix
# Example feature matrix X and target vector y
X = data_new.drop(['Bankrupt?'],axis = 1)
y = data_new['Bankrupt?']


# Standardize features if X is a DataFrame
if isinstance(X, pd.DataFrame):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)  # Convert back to DataFrame
else:
    X_scaled = X

# Reset index to align with numpy array style indexing
X_scaled = X_scaled.reset_index(drop=True)
y = y.reset_index(drop=True)

# Define the models
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100,random_state = 42, class_weight='balanced'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVC": SVC(class_weight='balanced'),
    "Gausssian": GaussianNB()
}

overall_accuracy = {name: [] for name in models} 
model_predictions = {name: [] for name in models}
model_actuals = {name: [] for name in models}

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=3)
print('Metrics:')
# Cross-validation loop
for name, model in models.items():
    tp = 0
    fn = 0
    print(f"Training {name}:")
    for train_index, test_index in skf.split(X_scaled, y):
        X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE 
        if name in ["KNN", "SVC","RandomForest"]:
            smote = SMOTE()
            X_train, y_train = smote.fit_resample(X_train, y_train)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        
        model_predictions[name].extend(y_pred)
        model_actuals[name].extend(y_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        overall_accuracy[name].append(accuracy)
        print(cm)
        tp = tp+cm[1,1]
        fn = fn+cm[1,0]
        print(f'Accuracy Score: {accuracy:.4f}')
        print(classification_report(y_test, y_pred))
    
    print('Overall Recall: ', round(tp/3)/(round(tp/3)+round(fn/3)))
    print('tp: ',round(tp/3))
    print('fn: ',round(fn/3))
    print('--------------------------')

# Calculate and print the average accuracy for each model
# Print overall recall for each model
for name in models:
    overall_recall = recall_score(model_actuals[name], model_predictions[name])  # 'macro' average computes recall independently for each class and then takes the average
    print(f"Overall recall for {name}: {overall_recall:.2f}")


Metrics:
Training RandomForest:
[[1795   75]
 [  36   30]]
Accuracy Score: 0.9427
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1870
           1       0.29      0.45      0.35        66

    accuracy                           0.94      1936
   macro avg       0.63      0.71      0.66      1936
weighted avg       0.96      0.94      0.95      1936

[[1812   58]
 [  38   28]]
Accuracy Score: 0.9504
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1870
           1       0.33      0.42      0.37        66

    accuracy                           0.95      1936
   macro avg       0.65      0.70      0.67      1936
weighted avg       0.96      0.95      0.95      1936

[[1812   57]
 [  36   30]]
Accuracy Score: 0.9519
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1869
           1       0.34      0.45      0.39        66

