In [8]:
# ml_import

# Data manipulation
import pandas as pd
import numpy as np

# Plotting and Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Profiling for EDA (Exploratory Data Analysis)
import ydata_profiling  # or use 'import pandas_profiling' if you prefer

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Modeling
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier, VotingRegressor
from xgboost import XGBClassifier, XGBRegressor

# Metrics and Evaluation
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_curve, auc

# Warnings to avoid unwanted messages
import warnings
warnings.filterwarnings('ignore')



# Display helper for Jupyter (to display the dataframe)
from IPython.display import display

class tools:
    @staticmethod
    def display_dataframe_to_user(name, dataframe):
        print(f"\n{name}:")
        display(dataframe)



In [9]:
# ml_visualize

# Data Profiling for Exploratory Data Analysis (EDA)
def data_profiling(df):
    # Using ydata_profiling to generate a detailed profile report
    profile = ydata_profiling.ProfileReport(df, title="Data Profiling Report", explorative=True)
    profile.to_file("data_profiling_report.html")
    print("Data profiling report generated as 'data_profiling_report.html'")

# Visualizing data and detecting columns
def data_visualization(df, target_column=None):
    """
    Visualizes data and detects columns.
    If target_column is None, it defaults to the last column.
    """
    # 1. Determine the target column immediately
    if target_column is None:
        target_column = df.columns[-1]
        print(f"Target column not specified. Auto-selecting last column: '{target_column}'")
    else:
        print(f"Target column manually set to: '{target_column}'")

    # 2. Detect Numeric and Categorical Columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = df.select_dtypes(include=[object]).columns.tolist()
    
    # Remove high-cardinality categorical columns (likely IDs)
    cat_col_del = []
    for cat_col in categorical_columns:
        if df[cat_col].nunique() >= 0.5 * len(df[cat_col]):
            df.drop(columns=[cat_col], inplace=True)
            cat_col_del.append(cat_col)
    
    categorical_columns = [col for col in categorical_columns if col not in cat_col_del]       
    
    # 3. Generate Profile
    data_profiling(df)
    
    # 4. Plots
    # Box plot for outliers
    if numeric_columns:
        plt.figure(figsize=(12, 6))
        n_cols = len(numeric_columns)
        n_rows = (n_cols // 3) + (1 if n_cols % 3 > 0 else 0)
        for i, col in enumerate(numeric_columns):
            plt.subplot(n_rows, 3, i+1)
            sns.boxplot(x=df[col])
            plt.title(f"Boxplot of {col}")
        plt.tight_layout()
        plt.show()

    # Histograms for distribution
    if numeric_columns:
        plt.figure(figsize=(12, 6))
        for i, col in enumerate(numeric_columns):
            plt.subplot(n_rows, 3, i+1)
            sns.histplot(df[col], kde=True)
            plt.title(f"Histogram of {col}")
        plt.tight_layout()
        plt.show()

    # Bar plot for categorical columns
    if categorical_columns:
        plt.figure(figsize=(12, 6))
        n_cols_cat = len(categorical_columns)
        n_rows_cat = (n_cols_cat // 3) + (1 if n_cols_cat % 3 > 0 else 0)
        for i, col in enumerate(categorical_columns):
            plt.subplot(n_rows_cat, 3, i+1)
            sns.countplot(x=df[col])
            plt.title(f"Bar plot of {col}")
        plt.tight_layout()
        plt.show()

    # Correlation Matrix
    if numeric_columns:
        plt.figure(figsize=(10, 8))
        corr_matrix = df[numeric_columns].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
        plt.title('Correlation Matrix')
        plt.show()

    return target_column, categorical_columns, numeric_columns

# [Keep the other plotting functions: plot_regression_results, plot_confusion_matrix, etc. as they were]
# ... (Paste the rest of your original plotting functions here if needed) ...
def plot_regression_results(y_true, y_pred):
    plt.figure(figsize=(10,6))
    plt.scatter(y_true, y_pred, color='blue', alpha=0.6)
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', linestyle='--')
    plt.title('Regression: Actual vs Predicted')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

def plot_confusion_matrix(y_true, y_pred, labels=None):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def plot_roc_curve(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(10,6))
    plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()
    
def plot_feature_importance(model, feature_names):
    importance = model.feature_importances_
    feature_importance = pd.Series(importance, index=feature_names).sort_values(ascending=False)
    plt.figure(figsize=(10,6))
    feature_importance.plot(kind='bar', color='skyblue')
    plt.title('Feature Importance')
    plt.ylabel('Importance')
    plt.show()

def plot_class_distribution(y):
    plt.figure(figsize=(8,6))
    sns.countplot(x=y, palette='Set2')
    plt.title('Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.show()

# --- NEW: Time Series Evaluation ---
def evaluate_time_series_model(model, X_test, y_test):
    """
    Plots the forecast against the actual values in a timeline format.
    """
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.4f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R^2 Score: {r2:.4f}")

    # Plotting Timeline
    plt.figure(figsize=(12, 6))
    plt.plot(range(len(y_test)), y_test.values, label='Actual', color='blue')
    plt.plot(range(len(y_pred)), y_pred, label='Predicted', color='orange', linestyle='--')
    plt.title('Time Series Forecasting: Actual vs Predicted')
    plt.xlabel('Time Steps')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

In [10]:
# ml_preprocess

# Function to preprocess data: Handling missing values, scaling, encoding
def preprocess_data(df, target_column, categorical_columns, numeric_columns, time_series=False, date_column=None):
    """
    Preprocesses the input data. If time_series=True, it splits data chronologically.
    """
    # If it's a time series, handle the date column
    if time_series and date_column:
        print(f"Sorting data by time column: {date_column}")
        df[date_column] = pd.to_datetime(df[date_column])
        df = df.sort_values(by=date_column)
        # We usually drop the date column from features for standard ML models 
        # as they can't process timestamp objects directly, 
        # but the order is preserved in the index/rows.
        df = df.drop(columns=[date_column])
        
        # Remove date_column from lists if present
        if date_column in categorical_columns: categorical_columns.remove(date_column)
        if date_column in numeric_columns: numeric_columns.remove(date_column)

    # Exclude the target column
    categorical_columns = [col for col in categorical_columns if col != target_column]
    numeric_columns = [col for col in numeric_columns if col != target_column]
    
    # Split data into features (X) and target (y)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    numeric_features = numeric_columns
    categorical_features = categorical_columns
    
    # Preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    if not categorical_columns:
        preprocessor = ColumnTransformer(
            transformers=[('num', numeric_transformer, numeric_features)]
        )
    else:
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )
    
    # Apply transformations
    X_processed = preprocessor.fit_transform(X)
    
    # --- TIME SERIES SPLIT LOGIC ---
    if time_series:
        # split WITHOUT shuffling to preserve order
        print("Splitting data chronologically (Time Series mode)...")
        X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, shuffle=False)
    else:
        # Standard random split
        X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, preprocessor

In [11]:
# ml_pipeline

# Function to train and evaluate multiple models with hyperparameter tuning
def train_models(X_train, X_test, y_train, y_test, task_type='classification', search_type='grid', time_series=False):
    """
    Trains and evaluates multiple models based on task type (classification or regression) with hyperparameter tuning.
    """
    
    models = {}
    best_models = {}
    best_params = {}
    best_scores = {}

    # REMOVED: Internal Label Encoding. 
    # We now assume y_train and y_test are already encoded before being passed here.

    if task_type == 'classification':
        models = {
            'Logistic Regression': LogisticRegression(),
            'SVM': SVC(),
            'Decision Tree': DecisionTreeClassifier(),
            'Random Forest': RandomForestClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'XGBoost': XGBClassifier(),
            'Voting Classifier': VotingClassifier(estimators=[
                ('lr', LogisticRegression()),
                ('rf', RandomForestClassifier()),
                ('svm', SVC())
            ])
        }
        
        param_grids = {
            'Logistic Regression': {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear']
            },
            'SVM': {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto']
            },
            'Decision Tree': {
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'Random Forest': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'Gradient Boosting': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.05, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            },
            'XGBoost': {
                'n_estimators': [100, 200],
                'max_depth': [3, 4, 5],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 1.0],
                'colsample_bytree': [0.8, 1.0]
            },
            'Voting Classifier': {
                'voting': ['hard', 'soft']
            }
        }
        
    elif task_type == 'regression':
        models = {
            'Linear Regression': LinearRegression(),
            'SVR': SVR(),
            'Decision Tree Regressor': DecisionTreeRegressor(),
            'Random Forest Regressor': RandomForestRegressor(),
            'Gradient Boosting Regressor': GradientBoostingRegressor(),
            'XGBoost Regressor': XGBRegressor(),
            'Voting Regressor': VotingRegressor(estimators=[
                ('lr', LinearRegression()),
                ('rf', RandomForestRegressor()),
                ('svr', SVR())
            ])
        }
        
        param_grids = {
            'Linear Regression': {},
            'SVR': {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto']
            },
            'Decision Tree Regressor': {
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'Random Forest Regressor': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'Gradient Boosting Regressor': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.05, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            },
            'XGBoost Regressor': {
                'n_estimators': [100, 200],
                'max_depth': [3, 4, 5],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 1.0],
                'colsample_bytree': [0.8, 1.0]
            },
            'Voting Regressor': {
                'voting': ['hard', 'soft']
            }
        }

    # Use TimeSeriesSplit for time series data
    if time_series:
        tscv = TimeSeriesSplit(n_splits=5)  # Can adjust the number of splits

    # Train each model with hyperparameter tuning
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        
        # Get the appropriate param grid for each model
        param_grid = param_grids.get(model_name, {})
        
        # Use GridSearchCV or RandomizedSearchCV based on the search_type parameter
        if search_type == 'grid':
            search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv if time_series else 5, n_jobs=-1, verbose=2)
        elif search_type == 'random':
            search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, cv=tscv if time_series else 5, n_jobs=-1, verbose=2, random_state=42)
        
        # Fit the model
        search.fit(X_train, y_train)
        
        # Get the best model
        best_model = search.best_estimator_
        
        # Predict on the test set
        if task_type == 'classification':
            y_pred = best_model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
        else:
            y_pred = best_model.predict(X_test)
            score = mean_squared_error(y_test, y_pred)
        
        print(f"Best Hyperparameters: {search.best_params_}")
        print(f"Best Score: {score:.4f}")
        
        best_models[model_name] = best_model
        best_params[model_name] = search.best_params_
        best_scores[model_name] = score

    return best_models, best_params, best_scores

In [12]:
# ml_evaluation

# Function to evaluate models for classification tasks
def evaluate_classification_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Function to evaluate models for regression tasks
def evaluate_regression_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.4f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R^2 Score: {r2:.4f}")
    
    plt.figure(figsize=(10,6))
    plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Regression: Actual vs Predicted")
    plt.show()

# --- NEW: Time Series Evaluation ---
def evaluate_time_series_model(model, X_test, y_test):
    """
    Plots the forecast against the actual values in a timeline format.
    """
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.4f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R^2 Score: {r2:.4f}")

    # Plotting Timeline
    plt.figure(figsize=(12, 6))
    plt.plot(range(len(y_test)), y_test.values, label='Actual', color='blue')
    plt.plot(range(len(y_pred)), y_pred, label='Predicted', color='orange', linestyle='--')
    plt.title('Time Series Forecasting: Actual vs Predicted')
    plt.xlabel('Time Steps')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

In [13]:
# ml_final_report

# Function to summarize and select the best model
def summarize_best_models(best_models, best_params, best_scores, task_type='classification'):
    # Prepare the data for a summary DataFrame
    summary = {
        'Model': [],
        'Best Hyperparameters': [],
        'Performance Score': []
    }
    
    # Iterate through the models to summarize performance
    for model_name in best_models:
        summary['Model'].append(model_name)
        summary['Best Hyperparameters'].append(best_params[model_name])
        if task_type == 'classification':
            summary['Performance Score'].append(best_scores[model_name])  # Accuracy for classification
        else:
            summary['Performance Score'].append(best_scores[model_name])  # MSE or R^2 for regression
    
    # Convert summary into a DataFrame
    summary_df = pd.DataFrame(summary)
    
    # Sort models based on performance (highest score for classification, lowest MSE for regression)
    if task_type == 'classification':
        summary_df = summary_df.sort_values(by='Performance Score', ascending=False)
    else:
        summary_df = summary_df.sort_values(by='Performance Score', ascending=True)
    
    return summary_df

In [None]:
# ml_run

def run_ml_pipeline(data_path, task_type='classification', search_type='grid', time_series=False, date_column=None, target_column=None):
    # Step 1: Load Data
    print("Loading data...")
    if 'csv' in data_path:
        df = pd.read_csv(data_path)
    elif 'xlsx' in data_path:
        df = pd.read_excel(data_path)
    print("Data loaded successfully!")
    
    # Step 2: Data Profiling (Passing target_column now)
    print("Generating data profiling report and visualizations...")
    # --- UPDATED LINE ---
    target_column, categorical_columns, numeric_columns = data_visualization(df, target_column=target_column) 
    # --------------------

    # Step 3: Preprocess Data
    print(f"Preprocessing data... (Target: {target_column})")
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(
        df, target_column, categorical_columns, numeric_columns, 
        time_series=time_series, date_column=date_column
    )
    print("Data preprocessing completed!")
    
    # Classification Encoding
    if task_type == 'classification':
        print("Encoding target labels...")
        label_encoder = LabelEncoder()
        y_train = label_encoder.fit_transform(y_train)
        y_test = label_encoder.transform(y_test)
        print(f"Target labels encoded. Classes: {label_encoder.classes_}")

    # Step 4: Train Models
    print(f"Training models (Time Series Mode: {time_series})...")
    best_models, best_params, best_scores = train_models(
        X_train, X_test, y_train, y_test, 
        task_type=task_type, search_type=search_type, time_series=time_series
    )
    print("Model training completed!")
    
    # Step 5: Evaluate Models
    print("Evaluating models...")
    for model_name, model in best_models.items():
        print(f"\nEvaluating {model_name}...")
        if task_type == 'classification':
            evaluate_classification_model(model, X_test, y_test)
        elif task_type == 'regression':
            if time_series:
                evaluate_time_series_model(model, X_test, y_test)
            else:
                evaluate_regression_model(model, X_test, y_test)
    
    # Step 6: Generate Final Report
    print("Generating final model report...")
    final_summary = summarize_best_models(best_models, best_params, best_scores, task_type=task_type)
    tools.display_dataframe_to_user(name=f"{task_type.capitalize()} Models Summary", dataframe=final_summary)
    print("Final model report generated!")

# Example of using the pipeline for classification task
data_path = r'D:\Programming\Machine Learning Works\Projects\ML Syntax Automation\framingham.csv' 

# Run the pipeline
run_ml_pipeline(data_path, task_type='classification', search_type='random', time_series=False, target_column='TenYearCHD')


Loading data...
Data loaded successfully!
Generating data profiling report and visualizations...
Target column manually set to: 'TenYearCHD'


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:00<00:00, 1230.63it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Data profiling report generated as 'data_profiling_report.html'
Preprocessing data... (Target: TenYearCHD)
Data preprocessing completed!
Encoding target labels...
Target labels encoded. Classes: [0 1]
Training models (Time Series Mode: False)...

Training Logistic Regression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 1}
Best Score: 0.8561

Training SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 1}
Best Score: 0.8550

Training Decision Tree...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Hyperparameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10}
Best Score: 0.7972

Training Random Forest...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}
Best Score: 

Unnamed: 0,Model,Best Hyperparameters,Performance Score
6,Voting Classifier,{'voting': 'hard'},0.85967
5,XGBoost,"{'subsample': 0.8, 'n_estimators': 200, 'max_d...",0.857311
0,Logistic Regression,"{'solver': 'liblinear', 'penalty': 'l1', 'C': 1}",0.856132
1,SVM,"{'kernel': 'rbf', 'gamma': 'auto', 'C': 1}",0.854953
3,Random Forest,"{'n_estimators': 100, 'min_samples_split': 10,...",0.854953
4,Gradient Boosting,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.851415
2,Decision Tree,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.79717


Final model report generated!
