# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns

# Mount Google Drive so the dataset stored there becomes readable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Location of the Excel dataset inside the mounted Drive.
# NOTE: adjust this path so it points at the Excel file in your own Google Drive.
file_path = "/content/drive/MyDrive/ΜΕΘΟΔΟΙ ΚΑΙ ΕΡΓΑΛΕΙΑ ΤΕΧΝΗΤΗΣ ΝΟΗΜΟΣΥΝΗΣ - Εργασία – Classification problems/Dataset2Use_Assignment2.xlsx"

# Read the spreadsheet into the DataFrame used by the rest of the script
df = pd.read_excel(io=file_path)

# Figure 1: number of healthy and bankrupt companies per year.
# Rows are counted per (year, status) pair and the status level is pivoted
# into columns, so every year gets one bar per status (grouped side by side).
status_counts_per_year = (
    df.groupby(['ΕΤΟΣ', 'ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)'])
    .size()
    .unstack(fill_value=0)  # a year with no rows for a status counts as 0
)

# DataFrame.plot opens a figure of its own unless it is handed an Axes.
# Creating the figure/axes here and passing `ax` keeps the requested size and
# avoids leaving an empty extra figure behind.
fig, ax = plt.subplots(figsize=(10, 6))
status_counts_per_year.plot(kind='bar', ax=ax)
ax.set_title('Figure 1: Number of Healthy and Bankrupt Companies per Year')
ax.set_ylabel('Number of Companies')
ax.set_xlabel('Year')
# NOTE(review): the legend labels assume the status columns come out in the
# order 1 (healthy), 2 (bankrupt) — confirm if other status codes can occur.
ax.legend(['Healthy (1)', 'Bankrupt (2)'])
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Figure 2: Comparison of Indicators Between Healthy and Bankrupt Companies Using Boxplots
# Every column except the status label and the year is treated as an indicator.
indicators = df.columns.drop(['ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)', 'ΕΤΟΣ'])

# Two subsets of the data:
# - 'healthy' holds the rows whose status label is 1
# - 'bankrupt' holds the rows whose status label is 2
healthy = df[df['ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)'] == 1]
bankrupt = df[df['ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)'] == 2]

# One figure per indicator, comparing its distribution between the two groups:
# - one box per group, coloured differently
# - the group mean drawn as a red point
# - the min-max range drawn as a blue dashed vertical line
# - min, max and mean of each group printed below the figure
for indicator in indicators:
    # Drop missing values once and compute each statistic a single time;
    # the same numbers feed both the plot and the printed summary.
    healthy_values = healthy[indicator].dropna()
    bankrupt_values = bankrupt[indicator].dropna()
    healthy_min, healthy_max, healthy_mean = (
        healthy_values.min(), healthy_values.max(), healthy_values.mean()
    )
    bankrupt_min, bankrupt_max, bankrupt_mean = (
        bankrupt_values.min(), bankrupt_values.max(), bankrupt_values.mean()
    )

    plt.figure(figsize=(10, 6))

    # Both groups share one plot; boxes are drawn at x positions 1 and 2
    box = plt.boxplot([healthy_values, bankrupt_values], patch_artist=True)
    # Tick labels are set explicitly because boxplot's `labels` keyword is
    # deprecated since Matplotlib 3.9 (renamed to `tick_labels`).
    plt.xticks([1, 2], ['Healthy', 'Bankrupt'])

    # Fill colours, in the same order as the boxes
    for patch, color in zip(box['boxes'], ('lightgreen', 'lightcoral')):
        patch.set_facecolor(color)

    # Group means as points on top of the boxes
    plt.scatter([1, 2], [healthy_mean, bankrupt_mean],
                color='red', zorder=3, label='Mean')

    # Min-max range of each group; only the first line carries the legend label
    plt.vlines(1, healthy_min, healthy_max,
               colors='blue', linestyles='dashed', label='Min/Max')
    plt.vlines(2, bankrupt_min, bankrupt_max,
               colors='blue', linestyles='dashed')

    plt.title(f'Figure 2: Comparison of indicator {indicator}\nbetween healthy and bankrupt companies')
    plt.ylabel('Indicator value')
    plt.xlabel('Company status')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Printed summary of the same statistics
    print(f"\nStatistics for indicator {indicator}:")
    print("Healthy companies:")
    print(f"Min: {healthy_min:.2f}, Max: {healthy_max:.2f}, Mean: {healthy_mean:.2f}")
    print("Bankrupt companies:")
    print(f"Min: {bankrupt_min:.2f}, Max: {bankrupt_max:.2f}, Mean: {bankrupt_mean:.2f}")
    print("-"*50)

# Missing-value check:
# - count the missing entries in every column of the dataset
# - if any column has gaps, list those columns with their counts
# - otherwise report that the dataset is complete
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("\nNo missing values found in the dataset.")
else:
    print("\nWarning: There are missing values in the dataset:")
    print(columns_with_gaps)

# Data preparation
# - Features (X): every column except the status label and the year.
# - Target (y): the status column recoded to binary,
#   1 (healthy) -> 0 and 2 (bankrupt) -> 1.
X = df.drop(['ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)', 'ΕΤΟΣ'], axis=1)
y = df['ΕΝΔΕΙΞΗ ΑΣΥΝΕΠΕΙΑΣ (=2) (ν+1)'].replace({1: 0, 2: 1})

# Normalization
# - Min-Max scaling rescales every feature column to the range [0, 1].
# NOTE(review): the scaler is fitted on the whole dataset before the
# cross-validation folds are split, so the test rows of each fold contribute
# to the min/max used for scaling — confirm this is intended.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Verify normalization
# - Show the per-column minimum and maximum of the scaled features
column_minima = X_scaled.min(axis=0)
column_maxima = X_scaled.max(axis=0)
print("\nNormalization verification:")
print(f"Minimum values: {column_minima}")
print(f"Maximum values: {column_maxima}")


# Shared seed so that the fold split and every seeded model are reproducible
RANDOM_STATE = 42

# Collects one metrics record per (model, fold, train/test set)
results = []

# Cross-validation setup:
# - Stratified K-Fold with 4 splits, so every fold keeps a similar
#   healthy/bankrupt class distribution.
# - Rows are shuffled before splitting, using the fixed seed above.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

# Classifiers to compare, keyed by the display name used in plots and results.
# SVC is created with probability=True so that it offers predict_proba.
models = {
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Trees": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forests": RandomForestClassifier(random_state=RANDOM_STATE),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    "Naïve Bayes": GaussianNB(),
    "Support Vector Machines": SVC(probability=True, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Fold tracking: one entry per row of df, later filled with the number of the
# fold in which that row is part of the test set.
# The Series is seeded with 0 on purpose: built without data it is filled with
# NaN, which an integer dtype cannot hold, so pandas silently falls back to
# float64 and the fold numbers would be stored as floats.
fold_column = pd.Series(0, index=df.index, dtype=int)

# For each fold of the StratifiedKFold (4 folds):
# - Record which rows form the test set of the fold.
# - Split the data into training and test sets according to the current fold.
# - Print the class distribution (healthy vs bankrupt) in train and test sets.
# - Under-sample the training set when it is more imbalanced than 3:1.
# - Fit every model and evaluate it on the training and the test set.
for fold, (train_idx, test_idx) in enumerate(skf.split(X_scaled, y), 1):
    # Update the fold tracker for the rows tested in this fold
    fold_column.iloc[test_idx] = fold

    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    # skf.split yields positional indices, so the target is selected by
    # position; plain y[...] is label-based and is only equivalent while the
    # index happens to be 0..n-1.
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    n_train_healthy = int((y_train == 0).sum())
    n_train_bankrupt = int((y_train == 1).sum())
    n_test_healthy = int((y_test == 0).sum())
    n_test_bankrupt = int((y_test == 1).sum())

    print(f"\n--- Fold {fold} ---")
    print(f"Train set: Healthy = {n_train_healthy}, Bankrupt = {n_train_bankrupt}")
    print(f"Test set: Healthy = {n_test_healthy}, Bankrupt = {n_test_bankrupt}")

    # Balance the training set if the ratio of healthy to bankrupt companies
    # exceeds 3:1: random under-sampling shrinks the healthy class until the
    # bankrupt/healthy ratio is 1/3. Otherwise the training set is used as is.
    # The comparison is written as a product so that no division is needed.
    if n_train_healthy > 3 * n_train_bankrupt:
        rus = RandomUnderSampler(sampling_strategy=1/3, random_state=42)
        X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)
        print("\nAfter balancing (3:1):")
        print(f"Train set: Healthy = {int((y_train_bal == 0).sum())}, Bankrupt = {int((y_train_bal == 1).sum())}")
        balanced = "balanced"
    else:
        X_train_bal, y_train_bal = X_train, y_train
        balanced = "unbalanced"

    # Identical for every model of this fold, so computed once
    n_train_samples = len(X_train_bal)
    n_train_non_healthy = int((y_train_bal == 1).sum())

    # For each model:
    # - Fit on the (possibly balanced) training set.
    # - Evaluate on both the training and the test set:
    #   - predict classes and build the confusion matrix (TN, FP, FN, TP),
    #   - compute ROC-AUC on the test set only,
    #   - store the metrics in the results list,
    #   - plot the confusion matrix and print the test ROC-AUC.
    for name, model in models.items():
        model.fit(X_train_bal, y_train_bal)

        for set_type, X_data, y_data in [("train", X_train_bal, y_train_bal), ("test", X_test, y_test)]:
            y_pred = model.predict(X_data)

            # Fixing the label order guarantees a 2x2 matrix even when a set
            # or the predictions contain only one class, so the four-way
            # unpacking below cannot fail.
            cm = confusion_matrix(y_data, y_pred, labels=[0, 1])
            tn, fp, fn, tp = cm.ravel()

            if set_type == "test":
                # Scores are only needed for the test ROC-AUC, so they are
                # not computed for the training set.
                if hasattr(model, "predict_proba"):
                    y_score = model.predict_proba(X_data)[:, 1]
                elif hasattr(model, "decision_function"):
                    y_score = model.decision_function(X_data)
                else:
                    # Constant scores: keeps the pipeline running for a model
                    # without any scoring method.
                    y_score = [0] * len(y_data)
                auc = roc_auc_score(y_data, y_score)
            else:
                # Placeholder, not a real score: ROC-AUC is not evaluated on
                # the training set.
                auc = 0

            results.append({
                "Classifier Name": name,
                "Fold": fold,
                "Training or test set": set_type,
                "Balanced or unbalanced train set": balanced,
                "Number of training samples": n_train_samples,
                "Number of non-healthy companies in training sample": n_train_non_healthy,
                "True positives TP": tp,
                "True negatives TN": tn,
                "False positives FP": fp,
                "False negatives FN": fn,
                "ROC-AUC": auc
            })

            plt.figure(figsize=(5, 4))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=['Healthy', 'Bankrupt'],
                        yticklabels=['Healthy', 'Bankrupt'])
            plt.title(f'Confusion Matrix - {name}\n{set_type.capitalize()} set (Fold {fold})')
            plt.ylabel('Actual Value')
            plt.xlabel('Predicted Value')
            plt.tight_layout()
            plt.show()

            print(f"\n{name} - {set_type.capitalize()} set (Fold {fold}):")
            if set_type == "test":
                print(f"AUC-ROC: {auc:.2f}")

# Attach the fold tracker to the original dataframe as a new "Fold" column
df["Fold"] = fold_column

# Destination of the results file (CSV) on Google Drive
save_path = "/content/drive/MyDrive/ΜΕΘΟΔΟΙ ΚΑΙ ΕΡΓΑΛΕΙΑ ΤΕΧΝΗΤΗΣ ΝΟΗΜΟΣΥΝΗΣ - Εργασία – Classification problems/balancedDataOutcomes2.csv"

# Turn the collected metric records into a table (one row per record)
# and write it out as CSV without the row index
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf=save_path, index=False)

print(f"\nThe results have been saved to: {save_path}")


# Output hidden; open in https://colab.research.google.com to view.