# In [None]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Step 2: Load Dataset
# Point the path below at your own CSV file before running.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# Step 3: Data Exploration and Analysis
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()  # Overview of the dataset
print(df.describe())  # Summary statistics
print(df.isnull().sum())  # Check for missing values

# Visualizations
sns.pairplot(df)  # Pairplot to visualize relationships
plt.show()

# Correlation is only defined for numeric data, and at this point text columns
# have not been label-encoded yet. Recent pandas releases raise when corr() is
# called on a frame containing text columns, so restrict to numeric/bool ones.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')  # Correlation heatmap
plt.show()

# Step 4: Data Preprocessing
# Handle missing values.
# Numeric columns are filled with their column mean. numeric_only=True is
# needed because averaging text columns raises a TypeError in pandas 2.x.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Encode categorical variables if any
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    # Text columns are not touched by the mean fill above, and LabelEncoder
    # rejects a mix of NaN and strings, so fill the gaps with the most
    # frequent value first.
    if df[column].isnull().any():
        modes = df[column].mode()
        # mode() is empty only when the column has no non-null values at all.
        fill_value = modes.iloc[0] if not modes.empty else 'missing'
        df[column] = df[column].fillna(fill_value)
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    # Keep the fitted encoder per column for later use (e.g. inverse_transform).
    label_encoders[column] = le

# Feature-target split
# Replace 'target_column' with your target variable name
y = df['target_column']
X = df.drop(columns=['target_column'])

# Train-test split: 20% of the rows are held out for testing, and the seed is
# fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training rows only and then
# applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Define Models
# Candidate classifiers keyed by display name. All use library defaults,
# except the SVM, which additionally enables probability estimates.
models = {}
models['KNN'] = KNeighborsClassifier()
models['SVM'] = SVC(probability=True)
models['Decision Tree'] = DecisionTreeClassifier()
models['Random Forest'] = RandomForestClassifier()
models['AdaBoost'] = AdaBoostClassifier()
models['Gradient Boosting'] = GradientBoostingClassifier()

# Step 6: Train and Evaluate Models
# Fit every candidate on the training split, print its test-set metrics, and
# record the accuracy for the comparison chart below.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.2f}")

    print("Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    results[name] = acc

# Plot accuracy comparison (one bar per model, in training order)
model_names = list(results)
accuracies = list(results.values())
plt.bar(model_names, accuracies, color='skyblue')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.show()

# Step 7: Hyperparameter Tuning with GridSearchCV
# Example for Random Forest: 3 x 4 x 3 = 36 parameter combinations, each
# scored by accuracy with 5-fold cross-validation on the training split.
param_grid_rf = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Accuracy for Random Forest:", grid_search_rf.best_score_)

# Step 8: Model Formulation (Using Best Parameters)
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True) and exposes that fitted model as best_estimator_.
# Reusing it avoids training yet another forest, and guarantees the model
# evaluated here is the one the search actually selected and refit rather
# than a freshly, differently-seeded forest.
final_model = grid_search_rf.best_estimator_
final_predictions = final_model.predict(X_test)

# Final evaluation on the held-out test split
print("Final Model Accuracy:", accuracy_score(y_test, final_predictions))
print("Final Classification Report:")
print(classification_report(y_test, final_predictions))
