<a href="https://colab.research.google.com/github/kipsangmarion/ml-pipelines/blob/main/Example_Usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Classification Example

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Iris dataset
iris = pd.read_csv('https://bit.ly/3Sn7blU')

# Extract features and labels
iris_X = iris.drop('Species', axis=1)
iris_y = iris['Species']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# define classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

# Define hyperparameter grids for each classifier
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'Logistic Regression': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Decision Tree': {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    'Gradient Boosting': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]},
    'AdaBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}
}

# Dictionary to store the best models
best_models = {}

# Perform GridSearchCV on each classifier
for classifier_name, classifier in classifiers.items():
    grid_search = GridSearchCV(classifier, param_grids[classifier_name], cv=5, scoring='accuracy', n_jobs=-1)

    # fit to training dataset
    # grid_search.fit(X_train, y_train)
    grid_search.fit(X_train, y_train)

    # Get the best model from Grid Search
    best_model = grid_search.best_estimator_

    # Perform cross-validation on best_model
    # cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy', n_jobs=-1)
    cv_scores = cross_val_score(best_model, iris_X, iris_y, cv=5, scoring='accuracy', n_jobs=-1)

    # Store the best model in the dictionary
    best_models[classifier_name] = {
        'model': best_model,
        'cross_val_scores': cv_scores,
        'mean_accuracy': np.mean(cv_scores),
        'best_parameters': grid_search.best_params_
    }

# Select the best model based on mean cross-validation accuracy
best_classifier = max(best_models, key=lambda k: best_models[k]['mean_accuracy'])

# Print information about the best model
print(f"Best Classifier: {best_classifier}")
print(f"Cross-Validation Scores: {best_models[best_classifier]['cross_val_scores']}")
print(f"Mean Accuracy: {best_models[best_classifier]['mean_accuracy']:.4f}")
print(f"Best Parameters: {best_models[best_classifier]['best_parameters']}")

# 2. Regression Example

In [8]:
# Load the dataset from the CSV URL
from sklearn.impute import SimpleImputer

boston_df = pd.read_csv('https://bit.ly/data_boston_housing')

# Handle missing values using mean imputation.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(boston_df), columns=boston_df.columns)

# Extract features and outcomes
boston_X = df_imputed.drop('MEDV', axis=1)
boston_y = df_imputed['MEDV']

# Split data
X_train, X_test, y_train, y_test = train_test_split(boston_X, boston_y, test_size=0.2)

In [10]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import numpy as np

# Define regressors
regressors = {
    'Random Forest': RandomForestRegressor(),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor()
}

# Define hyperparameter grids for each regressor
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]},
    'SVR': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'Linear Regression': {'fit_intercept': [True, False], 'positive': [True, False], 'copy_X': [True, False], 'n_jobs': [None, -1]},
    'Gradient Boosting': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]},
    'AdaBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}
}

# Dictionary to store the best models
best_models = {}

# Perform GridSearchCV and cross-validation on each regressor
for regressor_name, regressor in regressors.items():
    grid_search = GridSearchCV(regressor, param_grids[regressor_name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

    # fit to training dataset
    # grid_search.fit(X_train, y_train)
    grid_search.fit(X_train, y_train)

    # Get the best model from Grid Search
    best_model = grid_search.best_estimator_

    # Perform cross-validation
    # cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    cv_scores = cross_val_score(best_model, boston_X, boston_y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

    # Store the best model in the dictionary
    best_models[regressor_name] = {
        'model': best_model,
        'cross_val_scores': cv_scores,
        'mean_mse': np.mean(cv_scores),
        'best_parameters': grid_search.best_params_
    }

# Select the best model based on mean cross-validation MSE
best_regressor = min(best_models, key=lambda k: best_models[k]['mean_mse'])

# Print information about the best regressor
print(f"Best Regressor: {best_regressor}")
print(f"Cross-Validation Mean MSE: {best_models[best_regressor]['mean_mse']:.4f}")
print(f"Best Parameters: {best_models[best_regressor]['best_parameters']}")

Best Regressor: K-Nearest Neighbors
Cross-Validation Mean MSE: -75.9860
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
