In [8]:
import os
import pickle
from typing import Any, Dict, Optional, Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Registry mapping each supported model-type name to the estimator class
# that is instantiated for it.
model_dict = dict(
    logistic_regression=LogisticRegression,
    svm=LinearSVC,
    random_forest=RandomForestClassifier,
    knn=KNeighborsClassifier,
    decision_tree=DecisionTreeClassifier,
)


# Default hyperparameters per model type. Every key set here must be a valid
# constructor argument of the class registered under the same name in
# model_dict.
default_params_dict = {
    'logistic_regression': {'C': 1, 'max_iter': 100},
    # 'svm' is backed by LinearSVC, which has no `kernel` argument (it is
    # always a linear SVM), so only C is configurable here.
    'svm': {'C': 1},
    'random_forest': {'n_estimators': 100, 'max_depth': None},
    'knn': {'n_neighbors': 5, 'algorithm': 'auto'},
    'decision_tree': {'max_depth': None},
}

# Directory (relative to the current working directory) where trained
# pipelines are pickled. It is created at import time if it is missing.
MODEL_DIR = 'models'
os.makedirs(name=MODEL_DIR, exist_ok=True)


def split_train_test(
        X: pd.DataFrame,
        y: pd.Series,
        test_size: float = 0.2,
        random_state: int = 42,
        stratify: Optional[pd.Series] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Splits the training data into a training and validation set.

    Thin wrapper around scikit-learn's ``train_test_split`` with fixed
    defaults so repeated runs produce the same split.

    :param X: Feature matrix.
    :param y: Target variable.
    :param test_size: Proportion of data to be used as the validation set.
    :param random_state: Random seed for reproducibility.
    :param stratify: Optional labels to stratify the split on (typically ``y``
        for classification). ``None`` (the default) performs a plain shuffled
        split, which is the previous behaviour.
    :return: Train and validation sets (X_train, X_val, y_train, y_val).
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )


def create_model(model_type: str,
                 model_params: Optional[Dict[str, Any]] = None) -> Pipeline:
    """
    Creates a pipeline with the specified model.

    The pipeline consists of a column preprocessor followed by the estimator
    registered for ``model_type`` in ``model_dict``.

    :param model_type: The type of model; must be a key of ``model_dict``
        ('logistic_regression', 'svm', 'random_forest', etc.).
    :param model_params: A dictionary of hyperparameters passed as keyword
        arguments to the estimator's constructor. ``None`` means the
        estimator's own defaults.
    :return: An unfitted scikit-learn pipeline.
    :raises ValueError: If ``model_type`` is not a supported model.
    """
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a bad name into an opaque KeyError.
    if model_type not in model_dict:
        raise ValueError(f"Model {model_type} is not supported.")

    estimator = model_dict[model_type](**(model_params or {}))

    # The input frame is expected to contain 'Pclass', 'Embarked' and 'Sex'
    # columns; every other column is forwarded to the estimator unchanged.
    preprocessor = ColumnTransformer([
        ('ohe', OneHotEncoder(), ['Pclass', 'Embarked']),
        ('binarizer', OrdinalEncoder(), ['Sex']),
    ], remainder='passthrough')

    return make_pipeline(preprocessor, estimator)


def train_and_save_model(X: pd.DataFrame, y: pd.Series, model_type: str, model_params: Optional[Dict[str, Any]] = None):
    """
    Fits a pipeline for the requested model type and pickles it to disk.

    The pipeline is built with ``create_model`` and written to
    ``<MODEL_DIR>/<model_type>_model.pkl``, overwriting any existing file.

    :param X: Feature data for training.
    :param y: Target labels for training.
    :param model_type: Type of model ('logistic_regression', 'svm', 'random_forest', etc.).
    :param model_params: Hyperparameters for the model.
    :return: The fitted pipeline.
    """
    pipeline = create_model(model_type, model_params)
    pipeline.fit(X, y)

    # One file per model type; retraining replaces the previous pickle.
    target_path = os.path.join(MODEL_DIR, f"{model_type}_model.pkl")
    with open(target_path, 'wb') as out:
        pickle.dump(pipeline, out)

    print(f"Model trained and saved as {target_path}")
    return pipeline


def load_model(model_type: str) -> Optional[Any]:
    """
    Loads a trained model from the saved file.

    Reads ``<MODEL_DIR>/<model_type>_model.pkl``, the location written by
    ``train_and_save_model``.

    :param model_type: Type of model ('logistic_regression', 'svm', etc.).
    :return: The unpickled model, or ``None`` if no file exists for
        ``model_type`` (a message is printed in either case).
    """
    model_file = os.path.join(MODEL_DIR, f"{model_type}_model.pkl")

    # Open first and handle absence, rather than checking os.path.exists and
    # then opening: the file could disappear between the check and the open.
    try:
        f = open(model_file, 'rb')
    except FileNotFoundError:
        print(f"Model {model_type} does not exist.")
        return None

    # SECURITY NOTE: unpickling can execute arbitrary code. Only load files
    # from a MODEL_DIR whose contents are trusted.
    with f:
        model = pickle.load(f)

    print(f"Model {model_type} loaded successfully.")
    return model


NameError: name 'validate_and_prepare_data' is not defined

In [None]:
# Example usage
# NOTE: validate_and_prepare_data is not defined in this section; it must be
# defined or imported before this block runs.
if __name__ == '__main__':
    # Load your dataset
    train_file = '../data/titanic/titanic_train.csv'
    test_file =  '../data/titanic/titanic_test.csv'

    X_train, y_train = validate_and_prepare_data(train_file, train=True)
    X_train_split, X_val_split, y_train_split, y_val_split = split_train_test(X_train, y_train)
    X_test = validate_and_prepare_data(test_file, train=False)

    # Note: every model below is fitted on the full training set (X_train),
    # not on the train/validation split created above.

    # Train and save Logistic Regression model
    log_reg_model = train_and_save_model(X_train, y_train, 'logistic_regression', model_params={'C': 0.5, 'max_iter': 200})

    # Train and save SVM model.
    # 'svm' is backed by LinearSVC, which is always linear and rejects a
    # `kernel` argument, so only C is passed.
    svm_model = train_and_save_model(X_train, y_train, 'svm', model_params={'C': 1})

    # Train and save Random Forest model
    rf_model = train_and_save_model(X_train, y_train, 'random_forest', model_params={'n_estimators': 100, 'max_depth': 10})

    # Train and save KNN model
    knn_model = train_and_save_model(X_train, y_train, 'knn', model_params={'n_neighbors': 3, 'algorithm': 'auto'})

    # Train and save Decision Tree model
    dt_model = train_and_save_model(X_train, y_train, 'decision_tree', model_params={'max_depth': 5})

    # Load the Logistic Regression model
    log_reg_model_loaded = load_model('logistic_regression')

    # Load the SVM model
    svm_model_loaded = load_model('svm')

    # Load the Random Forest model
    rf_model_loaded = load_model('random_forest')

    # Load the KNN model
    knn_model_loaded = load_model('knn')

    # Load the Decision Tree model
    dt_model_loaded = load_model('decision_tree')