<a href="https://colab.research.google.com/github/hassnaakharboush/soft/blob/main/linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyswarms  # installs pyswarms (imported below for PSO); comment this line out if it is already installed

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

import pyswarms as ps  # for PSO

# --- Load and preprocess dataset ---
df = pd.read_csv('/content/sample_data/data.csv')  # change path if needed

# Drop highly correlated features (threshold 0.9).
# Only the strict upper triangle of the absolute correlation matrix is kept,
# so for every pair above the threshold the later column of the pair is dropped.
# NOTE(review): correlations are computed on all rows, before the train/test
# split; the target is not involved, but confirm this is acceptable.
corr_matrix = df.drop('Bankrupt?', axis=1).corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
df = df.drop(columns=to_drop)

print(f"Dropped features: {to_drop}")

# Separate features and target
X = df.drop('Bankrupt?', axis=1)
y = df['Bankrupt?']

# Train/test split comes first so that every fitted preprocessing step below
# only ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Handle missing values: column means are learned from the training rows only
# and then applied unchanged to the test rows (fitting on all rows would leak
# test-set statistics into training).
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)  # NumPy array with no NaNs
X_test = imputer.transform(X_test)

# Scale features (statistics again come from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------
# 1) ML baseline: Logistic Regression with all features
# -------------------------
# Fit on the scaled training matrix, then predict labels for the held-out rows.
model_ml = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_ml = model_ml.predict(X_test)

# Metrics function
def print_metrics(y_true, y_pred, title=""):
    """Print a classification report followed by four headline metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        title: Text shown in the ``--- title ---`` banner above the report.

    Accuracy, precision, recall and F1 are printed with four decimals; every
    scorer that accepts it is called with ``zero_division=0``.
    """
    banner = f"\n--- {title} ---"
    print(banner)

    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)

    # Each score is computed immediately before it is printed.
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    precision = precision_score(y_true, y_pred, zero_division=0)
    print(f"Precision: {precision:.4f}")

    recall = recall_score(y_true, y_pred, zero_division=0)
    print(f"Recall: {recall:.4f}")

    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f"F1 Score: {f1:.4f}")

# Report the all-features baseline on the held-out test split.
print_metrics(y_test, y_pred_ml, title="Logistic Regression (All features)")

Dropped features: [' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' Realized Sales Gross Margin', ' Pre-tax net Interest Rate', ' After-tax net Interest Rate', ' Continuous interest rate (after tax)', ' Net Value Per Share (A)', ' Net Value Per Share (C)', ' Per Share Net profit before tax (Yuan ¥)', ' Regular Net Profit Growth Rate', ' Net worth/Assets', ' Operating profit/Paid-in capital', ' Net profit before tax/Paid-in capital', ' Current Liabilities/Equity', ' Working capitcal Turnover Rate', ' Cash Flow to Sales', ' Current Liability to Liability', ' Current Liability to Equity', ' Net Income to Total Assets', ' Gross Profit to Sales', ' Liability to Equity']

--- Logistic Regression (All features) ---
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       592
           1       0.50      0.24      0.32        34

    accuracy                           0.95       626
   macro avg  

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
import random

def fitness_function(chromosome, X_train, y_train, scoring='accuracy', cv=3):
    """Score a feature subset by cross-validated logistic regression.

    Args:
        chromosome: Sequence or array with one entry per column of ``X_train``;
            entries equal to 1 (or ``True``) mark selected features.
        X_train: 2-D NumPy array of training features.
        y_train: Training labels aligned with the rows of ``X_train``.
        scoring: Scorer name forwarded to ``cross_val_score``. Defaults to
            ``'accuracy'``. NOTE(review): the test-split support printed in
            this notebook (592 vs 34) suggests a strongly imbalanced target,
            where accuracy barely separates subsets; consider ``'f1'``.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (default 3).

    Returns:
        Mean cross-validation score for the selected columns, or ``0.0`` when
        no feature is selected.
    """
    # np.asarray makes the comparison element-wise even for plain lists,
    # where ``chromosome == 1`` would collapse to a single ``False``.
    mask = np.asarray(chromosome) == 1
    if not mask.any():
        return 0.0
    X_subset = X_train[:, mask]
    model = LogisticRegression(max_iter=1000, random_state=42)
    scores = cross_val_score(model, X_subset, y_train, cv=cv, scoring=scoring)
    return scores.mean()

def run_ga_feature_selection(X_train, y_train, n_pop=20, n_gen=10, mutation_rate=0.1,
                             random_state=None):
    """Search for a feature subset with a simple genetic algorithm.

    Each individual is a 0/1 vector with one gene per column of ``X_train``.
    Every generation the population is scored with ``fitness_function``,
    parents are picked by two-way tournaments, and children are produced by
    single-point crossover followed by per-gene bit-flip mutation. The best
    individual seen in any scored generation is remembered separately, so it
    is not lost when the population is replaced.

    Args:
        X_train: 2-D NumPy array of training features.
        y_train: Training labels aligned with the rows of ``X_train``.
        n_pop: Population size; must be at least 4 so that the ``n_pop // 2``
            tournament winners contain two parents to sample.
        n_gen: Number of generations to score; must be at least 1.
        mutation_rate: Probability of flipping each gene of a child.
        random_state: Optional integer seed. When given, private generators
            seeded with it are used and the run is repeatable. When ``None``
            (default) the global ``random`` and ``np.random`` generators are
            used, as before.

    Returns:
        Boolean NumPy array of length ``n_features`` marking the selected
        columns. It can be all ``False`` if no scored individual selected any
        feature.

    Raises:
        ValueError: If ``n_pop < 4``, ``n_gen < 1`` or ``X_train`` has no columns.
    """
    n_features = X_train.shape[1]
    if n_pop < 4:
        raise ValueError("n_pop must be at least 4 to sample two parents per crossover")
    if n_gen < 1:
        raise ValueError("n_gen must be at least 1")
    if n_features < 1:
        raise ValueError("X_train must have at least one feature column")

    # The module objects expose the same sample/randint/random API as the
    # generator instances, so the unseeded path keeps drawing from global state.
    py_rng = random if random_state is None else random.Random(random_state)
    np_rng = np.random if random_state is None else np.random.RandomState(random_state)

    population = [np_rng.randint(0, 2, n_features) for _ in range(n_pop)]

    best_chromosome = None
    # -inf guarantees the first scored generation always records a best
    # individual, even when every fitness value is 0.
    best_fitness = -np.inf

    for gen in range(n_gen):
        fitness_scores = np.array([fitness_function(ind, X_train, y_train) for ind in population])

        # Select best
        best_idx = np.argmax(fitness_scores)
        if fitness_scores[best_idx] > best_fitness:
            best_fitness = fitness_scores[best_idx]
            best_chromosome = population[best_idx].copy()

        # Selection (tournament): the fitter of two distinct random individuals wins
        selected = []
        for _ in range(n_pop // 2):
            i1, i2 = py_rng.sample(range(n_pop), 2)
            winner = population[i1] if fitness_scores[i1] > fitness_scores[i2] else population[i2]
            selected.append(winner)

        # Crossover + Mutation
        children = []
        while len(children) < n_pop:
            p1, p2 = py_rng.sample(selected, 2)
            # A single-gene chromosome has no interior cut point; copy p2 then.
            point = py_rng.randint(1, n_features - 1) if n_features > 1 else 0
            # concatenate builds a new array, so mutating the child never
            # touches the parents.
            child = np.concatenate([p1[:point], p2[point:]])
            # mutation
            for i in range(n_features):
                if py_rng.random() < mutation_rate:
                    child[i] = 1 - child[i]
            children.append(child)
        population = children
        print(f"GA Generation {gen+1}: Best fitness = {best_fitness:.4f}")

    return best_chromosome.astype(bool)

# run_ga_feature_selection draws from the global `random` and `np.random`
# generators; seeding both makes the selected subset repeatable across runs.
random.seed(42)
np.random.seed(42)
best_chromosome = run_ga_feature_selection(X_train, y_train)

# Train logistic regression on GA selected features
X_train_ga = X_train[:, best_chromosome]
X_test_ga = X_test[:, best_chromosome]
model_ga = LogisticRegression(max_iter=1000, random_state=42)
model_ga.fit(X_train_ga, y_train)
y_pred_ga = model_ga.predict(X_test_ga)

print_metrics(y_test, y_pred_ga, "Logistic Regression + GA Feature Selection")


GA Generation 1: Best fitness = 0.9485
GA Generation 2: Best fitness = 0.9485
GA Generation 3: Best fitness = 0.9485
GA Generation 4: Best fitness = 0.9485
GA Generation 5: Best fitness = 0.9509
GA Generation 6: Best fitness = 0.9509
GA Generation 7: Best fitness = 0.9509
GA Generation 8: Best fitness = 0.9509
GA Generation 9: Best fitness = 0.9509
GA Generation 10: Best fitness = 0.9509

--- Logistic Regression + GA Feature Selection ---
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       592
           1       0.54      0.21      0.30        34

    accuracy                           0.95       626
   macro avg       0.75      0.60      0.64       626
weighted avg       0.93      0.95      0.94       626

Accuracy: 0.9473
Precision: 0.5385
Recall: 0.2059
F1 Score: 0.2979


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np
import pyswarms as ps

# Function to evaluate model
def evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and score its predictions on ``X_test``.

    Args:
        X_train: Feature matrix used to fit the classifier.
        X_test: Feature matrix to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels that the predictions on ``X_test`` are scored against.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; the last three are computed with ``zero_division=0``.
    """
    classifier = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    metrics = {"accuracy": accuracy_score(y_test, predictions)}
    # The remaining scorers share one call signature, so fill them in a loop.
    for key, scorer in (("precision", precision_score),
                        ("recall", recall_score),
                        ("f1", f1_score)):
        metrics[key] = scorer(y_test, predictions, zero_division=0)
    return metrics

# PSO Fitness Function
def pso_fitness(mask):
    """Cost function for BinaryPSO: negative cross-validated F1 per particle.

    The score is estimated with 3-fold cross-validation on the training data
    only. Scoring candidates on ``X_test``/``y_test`` would select features
    that fit the test split and bias the final test metrics upwards.

    Args:
        mask: 2-D array with one row per particle and one 0/1 entry per
            column of the global ``X_train``.

    Returns:
        1-D NumPy array with one cost per particle (negated mean F1). A
        particle that selects no feature gets cost 0, the worst possible value.

    Relies on the globals ``X_train`` and ``y_train`` and on
    ``cross_val_score``, which is imported in the GA cell of this notebook.
    This runs three fits per particle per iteration, so expect a longer
    runtime than a single fit per particle.
    """
    scores = []
    for particle in mask:
        selected = np.where(particle == 1)[0]
        if len(selected) == 0:
            scores.append(0)
            continue
        model = LogisticRegression(max_iter=1000, random_state=42)
        cv_f1 = cross_val_score(model, X_train[:, selected], y_train, cv=3, scoring='f1')
        scores.append(cv_f1.mean())
    return -np.array(scores)  # PSO minimizes

# Run PSO
# Seed NumPy's global generator so the swarm search is repeatable.
# NOTE(review): this assumes pyswarms draws from the global np.random state;
# confirm for the installed pyswarms version.
np.random.seed(42)
n_features = X_train.shape[1]
options = {'c1': 2.0, 'c2': 2.0, 'w': 0.5, 'k': 5, 'p': 2}
optimizer = ps.discrete.BinaryPSO(n_particles=60, dimensions=n_features, options=options)

cost, pos = optimizer.optimize(pso_fitness, iters=100)

# Evaluate Final Model with PSO-selected features
selected_pso = np.where(pos == 1)[0]
results_pso = evaluate_model(X_train[:, selected_pso], X_test[:, selected_pso], y_train, y_test)

# Print PSO Summary
print("\n--- PSO Feature Selection Summary ---")
print(f"Selected Features (indices): {selected_pso}")
print(f"Accuracy:  {results_pso['accuracy']:.4f}")
print(f"Precision: {results_pso['precision']:.4f}")
print(f"Recall:    {results_pso['recall']:.4f}")
print(f"F1 Score:  {results_pso['f1']:.4f}")


2025-05-17 17:44:12,635 - pyswarms.discrete.binary - INFO - Optimize for 100 iters with {'c1': 2.0, 'c2': 2.0, 'w': 0.5, 'k': 5, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|100/100, best_cost=-0.458
2025-05-17 17:47:45,188 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.4583333333333333, best pos: [1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0 1 0
 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 1 0 1]



--- PSO Feature Selection Summary ---
Selected Features (indices): [ 0  2  3  5  7  8  9 11 12 14 16 18 21 24 26 27 29 30 31 32 35 37 42 43
 44 51 52 55 58 61 63 65 66 67 71 73]
Accuracy:  0.9585
Precision: 0.7857
Recall:    0.3235
F1 Score:  0.4583


In [None]:
import pandas as pd

# Collect results in a dictionary.
# The baseline and GA models are scored from their stored test predictions;
# the PSO row reuses the metrics already held in `results_pso`.
stored_predictions = (y_pred_ml, y_pred_ga)

comparison_results = {
    "Model": [
        "Logistic Regression (All features)",
        "Logistic Regression + GA Feature Selection",
        "Logistic Regression + PSO Feature Selection",
    ],
    "Accuracy": [accuracy_score(y_test, preds) for preds in stored_predictions]
                + [results_pso['accuracy']],
}
for column, scorer, pso_key in (
    ("Precision", precision_score, 'precision'),
    ("Recall", recall_score, 'recall'),
    ("F1 Score", f1_score, 'f1'),
):
    comparison_results[column] = (
        [scorer(y_test, preds, zero_division=0) for preds in stored_predictions]
        + [results_pso[pso_key]]
    )

# Create a DataFrame
df_comparison = pd.DataFrame(comparison_results)

print("\n--- Model Comparison ---")
print(df_comparison)



--- Model Comparison ---
                                         Model  Accuracy  Precision    Recall  \
0           Logistic Regression (All features)  0.945687   0.500000  0.235294   
1   Logistic Regression + GA Feature Selection  0.947284   0.538462  0.205882   
2  Logistic Regression + PSO Feature Selection  0.958466   0.785714  0.323529   

   F1 Score  
0  0.320000  
1  0.297872  
2  0.458333  
