# Pre-Processing the Features

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Column names are supplied explicitly through `names=`, so pandas treats the
# first row of the file as data rather than as a header.
cols = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
        'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']

# Location of the dataset. Defaults to the original local copy; set the
# MAGIC_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "MAGIC_DATA_PATH",
    "/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/Dataset/magic04.data",
)

df = pd.read_csv(DATA_PATH, names=cols)

# Features are every column except the last one; the target is the 'class' column.
X = df.iloc[:, :-1]
y = df["class"]

In [2]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the g:h class
# ratio the same in both splits, which matters when the classes are imbalanced;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

In [3]:
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.pipeline import Pipeline

def _make_scaled_pipeline(estimator):
    """Wrap ``estimator`` in the shared preprocessing pipeline.

    Every call builds fresh transformer instances, so the returned pipelines
    never share fitted state with one another.

    Parameters
    ----------
    estimator
        Unfitted classifier to place in the final ``"clf"`` step.

    Returns
    -------
    Pipeline
        Steps ``"power"`` (Yeo-Johnson ``PowerTransformer``), ``"scaler"``
        (``StandardScaler``) and ``"clf"`` (``estimator``), in that order.
    """
    return Pipeline([
        ("power", PowerTransformer(method="yeo-johnson")),
        # PowerTransformer already standardizes its output by default, so this
        # step is largely redundant; it is kept so the step names and order
        # match the original pipelines.
        ("scaler", StandardScaler()),
        ("clf", estimator),
    ])


logreg_pipe = _make_scaled_pipeline(LogisticRegression(max_iter=1000))
svm_pipe = _make_scaled_pipeline(SVC())
gnb_pipe = _make_scaled_pipeline(GaussianNB())

# The random forest is used as-is, without the power-transform / scaling steps.
rf_pipe = RandomForestClassifier(random_state=42)


- Why PowerTransformer?
    - Features like fLength and fWidth are heavily skewed. Gaussian NB assumes each feature is roughly Gaussian within a class, and SVM / Logistic Regression tend to behave better when features are more symmetric. The Yeo–Johnson power transform reduces skewness without requiring positive-only data.

- Why scale before SVM / LogisticRegression?
    - SVM is distance-based and LogisticRegression is regularized. If one feature has a much larger scale than the others, it dominates the distance / penalty. StandardScaler puts all features on the same scale.
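- What does C mean in SVM / LogisticRegression?
    - C is the inverse of the regularization strength: it controls how heavily misclassifications are penalized relative to the regularization term.
        - Large C → weaker regularization; fits the training data more tightly (low bias, high variance)
        - Small C → stronger regularization (high bias, low variance)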

In [4]:
# Candidate classifiers, keyed by the label printed above each report.
improved_models = {
    "GNB_improved": gnb_pipe,
    "LogReg_improved": logreg_pipe,
    "SVM_improved": svm_pipe,
    "RF_improved": rf_pipe,
}

# Fit every candidate on the training split, then print its per-class
# precision / recall / F1 on the held-out test split.
for name in improved_models:
    clf = improved_models[name]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"=== {name} ===", classification_report(y_test, y_pred), sep="\n")


=== GNB_improved ===
              precision    recall  f1-score   support

           g       0.74      0.92      0.82      3700
           h       0.74      0.41      0.53      2006

    accuracy                           0.74      5706
   macro avg       0.74      0.66      0.67      5706
weighted avg       0.74      0.74      0.72      5706

=== LogReg_improved ===
              precision    recall  f1-score   support

           g       0.84      0.91      0.88      3700
           h       0.81      0.69      0.74      2006

    accuracy                           0.83      5706
   macro avg       0.83      0.80      0.81      5706
weighted avg       0.83      0.83      0.83      5706

=== SVM_improved ===
              precision    recall  f1-score   support

           g       0.87      0.96      0.91      3700
           h       0.91      0.73      0.81      2006

    accuracy                           0.88      5706
   macro avg       0.89      0.85      0.86      5706
weighted