<a href="https://colab.research.google.com/github/klaxman23/August_pratice/blob/main/Kyphosis_Project_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


In [None]:
# 2. Locate Excel File Automatically
# os.listdir() returns entries in arbitrary order, so the matches are sorted to
# make the chosen workbook reproducible across runs and machines.
# Names starting with "~$" are lock files Excel creates while a workbook is
# open; they are not readable workbooks, so they are skipped.
# The extension check is case-insensitive so "DATA.XLSX" is found as well.
files = sorted(
    f for f in os.listdir()
    if f.lower().endswith(".xlsx") and not f.startswith("~$")
)

assert len(files) > 0, " No Excel file found. Please upload the Kyphosis Excel file."

# With several candidates, the alphabetically first one is used.
file_path = files[0]
print(" Using file:", file_path)



In [None]:
# 3. Load Dataset
# Read the workbook located in the previous cell into a DataFrame.
df = pd.read_excel(file_path)

print("Dataset Shape:", df.shape)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

In [None]:
# 4. Check Target Column
# Fail fast if the label column is missing, before any later cell relies on it.
assert 'Kyphosis' in df, "Column 'Kyphosis' not found in dataset."

# Header line followed by the number of rows in each target class.
print("\nClass Distribution:", df['Kyphosis'].value_counts(), sep="\n")

In [None]:
# 5. Visualize Class Distribution
# Bar chart of row counts per target class; the title is applied to the axes
# object that seaborn returns.
sns.countplot(data=df, x='Kyphosis').set_title("Kyphosis Class Distribution")
plt.show()

In [None]:
# 6. Encode Target Variable
# LabelEncoder numbers the classes in sorted order, which gives
# absent → 0, present → 1 for those two labels.
le = LabelEncoder()
le.fit(df['Kyphosis'])
# Replace the original labels in place with their integer codes.
df['Kyphosis'] = le.transform(df['Kyphosis'])

In [None]:
# 7. Feature / Target Split
# Target vector is the encoded label column; every other column is a feature.
y = df['Kyphosis']
X = df.drop(columns='Kyphosis')

In [None]:
# 8. Train-Test Split (Stratified)
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

In [None]:
# 9. Feature Scaling
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 10. Train Models
# Candidate classifiers; each is fitted on the same scaled training split and
# scored on the same test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Maps model name -> {"Accuracy": ..., "F1 Score": ...} for the comparison table.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 makes the undefined-metric case explicit: if a model
    # predicts no samples for a class, precision/F1 are reported as 0 without
    # an UndefinedMetricWarning. The reported values are the same as with the
    # default setting.
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0)
    }

    print(f"\n{name}")
    print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# 11. Model Comparison
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print("\nModel Comparison:", results_df, sep="\n")

In [None]:
# 12. Final Model – Logistic Regression
# Fit a fresh logistic regression on the training split (fit() returns the
# estimator itself), then predict labels for the held-out test split.
final_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_final_pred = final_model.predict(X_test)

In [None]:
# 13. Confusion Matrix
cm = confusion_matrix(y_test, y_final_pred)

# Annotated heatmap of the integer counts. The PNG is written to the working
# directory before the figure is shown.
plt.figure(figsize=(6, 4))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d').set(
    xlabel="Predicted",
    ylabel="Actual",
    title="Confusion Matrix – Logistic Regression",
)
plt.savefig("confusion_matrix.png")
plt.show()

# 14. Final Metrics
# Test-set scores of the final logistic regression model.
print("Final Accuracy:", accuracy_score(y_true=y_test, y_pred=y_final_pred))
print("Final F1 Score:", f1_score(y_true=y_test, y_pred=y_final_pred))