In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

from scipy.stats import zscore
import seaborn as sns


from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, TargetEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

import optuna
import optuna.visualization as vis
import plotly

import sys
import os

import time

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def main(model,num,cat):
    start_time = time.time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Now fitting", model, cat, num)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    # Hyperparameter fields 
    train_pct = 0.8
    MIN_FREQ_CAT = 1000  
    MAX_CAT = 10

    # 1. INPUT PARSING -------------------------------------------------



    # Read input files 
    # Check the files exist and can be read
    def check_and_read_csv(filepath):
        # Check if file exists at specified location
        if not os.path.exists(filepath):
            print("Error: the file '" + filepath + "' does not exist", file= sys.stderr)
            sys.exit(1)
        
        # Check if file is empty
        if os.path.getsize(filepath) == 0:
            print(f"Error: The file '{filepath}' is empty.")
            sys.exit(1)

        try:
            # Read the file into a DataFrame
            df = pd.read_csv(filepath)
            return df
        except Exception as e:
            print(f"Error reading file '{filepath}': {e}")
            sys.exit(1)

    train_input_file = "./Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv"
    train_labels_file = "./Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv"
    test_input_file = "./Pump_it_Up_Data_Mining_the_Water_Table_-_Test_set_values.csv"


    # Import data from specificed locations
    train_values = check_and_read_csv(train_input_file)
    train_labels = check_and_read_csv(train_labels_file)
    test_values = check_and_read_csv(test_input_file)

    # User Input Fields
    numerical_preprocessing = num
    categorical_preprocessing = cat
    model_type = model
    test_output_file = "predicted-values.csv"


    # Confirming that training values and labels match, 
    n_train_samples = len(train_values.index) 
    n_train_labels = len(train_labels.index)
    if n_train_samples != n_train_labels: 
        print("Error: number of training samples and labels not equal.")
        sys.exit(1)
    # Check features exist
    if len(train_values.columns) <= 1:
        print("Error: not enough features in training data.")
        sys.exit(1)
    elif len(test_values.columns) <= 1: 
        print("Error: not enough features in testing data.")
        sys.exit(1)
    # Confirm training and testing features are the same
    if set(train_values.columns) != set(test_values.columns): 
        print("Error: training and testing features do not match.")
        sys.exit(1)




    # 2. Data Preprocessing ----------------------------------------------------

    # Converting 'date_recorded' into a numerical feature: 
    #train_values["date_recorded"] = pd.to_datetime(train_values.date_recorded, format="%Y-%m-%d") 

    # Creating transformer for datetime 
    def transform_date_sin_cos(df): 
        df["date_recorded"] = pd.to_datetime(df.date_recorded, format="%Y-%m-%d") 
        df["day"] = df["date_recorded"].dt.day
        df["month"] = df["date_recorded"].dt.month
        df["year"] = df["date_recorded"].dt.year 

        df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
        df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

        df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
        df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

        df["year_sin"] = np.sin(2 * np.pi * (df["year"] % 10) / 3)  
        df["year_cos"] = np.cos(2 * np.pi * (df["year"] % 10) / 3)

        df.drop(columns=["day", "month", "year", "date_recorded"], inplace=True)
        return df

    date_transformer = FunctionTransformer(transform_date_sin_cos, validate=False)



    numeric_cols = train_values.select_dtypes(include=["int64", "float64"], exclude=["object", "datetime"]).drop(columns=["id"]).columns
    categoric_cols = train_values.select_dtypes(include=["object"], exclude=["int64", "float64", "datetime"]).columns
    #datetime_cols = train_values.select_dtypes(include=["datetime"], exclude=["int64", "float64", "object"]).columns


    if categorical_preprocessing == "OneHotEncoder":
        encoder = OneHotEncoder(
            min_frequency= MIN_FREQ_CAT
            #   , max_categories = MAX_CAT
            , 
            handle_unknown='infrequent_if_exist'
            #   , drop= "first"
        , sparse_output= False # Linear regression performs poorly on sparse data
        )   
    elif categorical_preprocessing == "OrdinalEncoder":
        encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value"
            , unknown_value=-1
            , encoded_missing_value= -1 #TODO esto esta bien???
            #   , dtype=float
              , min_frequency = MIN_FREQ_CAT
            # , max_categories = MAX_CAT
        )
    elif categorical_preprocessing == "TargetEncoder":
        encoder = TargetEncoder(
            target_type = "multiclass"
        )


    # Numerical preprocessing
    if numerical_preprocessing == "StandardScaler" :
        scaler = StandardScaler()
    else:
        scaler = "passthrough"

    

    # Transformer object with scaler and encoder
    preprocessor = ColumnTransformer(
    transformers = [
            ('date', date_transformer,["date_recorded"] ),
            ('num', scaler, numeric_cols),
            ('cat', encoder, categoric_cols)],
    verbose=False)


    # Split the data into train and test sets 
    train_values.drop(columns=["id"], inplace = True)
    train_labels.drop(columns=["id"], inplace = True)
    X_train, X_val, y_train, y_val = train_test_split(train_values, train_labels, train_size = train_pct)

    # Apply to the training data 
    if categorical_preprocessing == "TargetEncoder": 
        X_train_transformed = preprocessor.fit_transform(X_train, y_train["status_group"])
    else: 
        X_train_transformed = preprocessor.fit_transform(X_train)
    

    # 3. Model Training and Evaluation -------------------------------------------------------


    if model_type == "LogisticRegression": 
        model = LogisticRegression()
    elif model_type == "RandomForestClassifier": 
        model = RandomForestClassifier()
    elif model_type == "GradientBoostingClassifier": 
        model = GradientBoostingClassifier( )
    elif model_type == "HistGradientBoostingClassifier":
        model = HistGradientBoostingClassifier()
    elif model_type == "MLPClassifier":
        model = MLPClassifier()


    model.fit(X_train_transformed, y_train.values.ravel())

    # Cross Validation on the training set
    folds = KFold(n_splits=5, random_state=100, shuffle=True)
    cv = cross_val_score(estimator=model,
                        X=X_train_transformed,
                        y=y_train.values.ravel(),
                        cv=folds,
                        scoring='accuracy')
    
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print(model, cat, num)
    print("The cross validation accuracy ")
    print(cv)
    print("Mean of the cross validation scores is: ", cv.mean())
    print("Standard dev of the cross validation scores is: ", cv.std())


    # calculate classification accuracy of the trained model on the validation set
    X_val_preprocessed = preprocessor.transform(X_val) # first we need to preprocess the input
    y_val_pred = model.predict(X_val_preprocessed) # then make the predictions
    acc_val = accuracy_score(y_pred=y_val_pred, y_true=y_val) # calculate the score
    print(f"classification accuracy on the validation set: {acc_val:.4f}")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~")


    # 4. Prediction Generation -------------------------------------------------
    # Preprocessing
    # Outlier Handling in Numeric Fields through imputation
    # Remove row where construction year is 0 -> missing data
    # mask = test_values['construction_year'] != 0
    # test_values = test_values[mask].reset_index(drop=True)

    # # Removing amount_tsh column from training data due to high # of NaNs
    # test_values.drop(columns=["amount_tsh"])

    # Transform test data with same encoder
    X_test = preprocessor.transform(test_values)

    # Make prediction
    y_test = model.predict(X_test)
    output_test = pd.DataFrame({"id": test_values["id"].values, "status_group": y_test})

    # Write prediction to file 
    output_test.to_csv(test_output_file, index=False)

    end_time = time.time()
    execution_time = end_time - start_time
    minutes, seconds = divmod(execution_time, 60)

    print(f"Execution Time: {int(minutes)} minutes and {seconds:.2f} seconds")


In [18]:
supported_models = ["LogisticRegression", 
                    "RandomForestClassifier", 
                    "GradientBoostingClassifier", 
                    "HistGradientBoostingClassifier",
                    "MLPClassifier"]
supported_numerical = ["None", "StandardScaler"]
supported_categorical = ["TargetEncoder", "OneHotEncoder", "OrdinalEncoder"]

for model in supported_models: 
    for num in supported_numerical: 
        for cat in supported_categorical: 
            main(model,num,cat)

~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression TargetEncoder None
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() TargetEncoder None
The cross validation accuracy 
[0.54713805 0.56060606 0.5535564  0.55218855 0.5547138 ]
Mean of the cross validation scores is:  0.5536405723905723
Standard dev of the cross validation scores is:  0.004337467843600323
classification accuracy on the validation set: 0.5559
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 2 minutes and 6.49 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression OneHotEncoder None
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() OneHotEncoder None
The cross validation accuracy 
[0.58322811 0.56365741 0.55702862 0.56155303 0.59553872]
Mean of the cross validation scores is:  0.5722011784511785
Standard dev of the cross validation scores is:  0.014716112365171423
classification accuracy on the validation set: 0.5604
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 2 minutes and 56.50 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression OrdinalEncoder None
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() OrdinalEncoder None
The cross validation accuracy 
[0.597117   0.57828283 0.58364899 0.58007155 0.58301768]
Mean of the cross validation scores is:  0.5844276094276095
Standard dev of the cross validation scores is:  0.006638500980582138
classification accuracy on the validation set: 0.5684
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 0 minutes and 48.61 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression TargetEncoder StandardScaler
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() TargetEncoder StandardScaler
The cross validation accuracy 
[0.75873316 0.7706229  0.7645202  0.7572601  0.7662037 ]
Mean of the cross validation scores is:  0.7634680134680134
Standard dev of the cross validation scores is:  0.0049140697537750876
classification accuracy on the validation set: 0.7655
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 2 minutes and 9.94 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression OneHotEncoder StandardScaler
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() OneHotEncoder StandardScaler
The cross validation accuracy 
[0.73148148 0.73611111 0.73779461 0.73537458 0.74484428]
Mean of the cross validation scores is:  0.7371212121212121
Standard dev of the cross validation scores is:  0.004381250464495836
classification accuracy on the validation set: 0.7363
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 2 minutes and 56.16 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting LogisticRegression OrdinalEncoder StandardScaler
~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

~~~~~~~~~~~~~~~~~~~~~~~~~~
LogisticRegression() OrdinalEncoder StandardScaler
The cross validation accuracy 
[0.63015572 0.64541246 0.64099327 0.64930556 0.63846801]
Mean of the cross validation scores is:  0.6408670033670034
Standard dev of the cross validation scores is:  0.0065172528345551475
classification accuracy on the validation set: 0.6440
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 0 minutes and 50.70 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting RandomForestClassifier TargetEncoder None
~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~
RandomForestClassifier() TargetEncoder None
The cross validation accuracy 
[0.78356481 0.79934764 0.79619108 0.79029882 0.80071549]
Mean of the cross validation scores is:  0.7940235690235691
Standard dev of the cross validation scores is:  0.006341686263244906
classification accuracy on the validation set: 0.8090
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 1 minutes and 6.62 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting RandomForestClassi



~~~~~~~~~~~~~~~~~~~~~~~~~~
MLPClassifier() TargetEncoder StandardScaler
The cross validation accuracy 
[0.78356481 0.7829335  0.77714646 0.78009259 0.78261785]
Mean of the cross validation scores is:  0.7812710437710437
Standard dev of the cross validation scores is:  0.002376549007051586
classification accuracy on the validation set: 0.7767
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 2 minutes and 31.11 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting MLPClassifier OneHotEncoder StandardScaler
~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~
MLPClassifier() OneHotEncoder StandardScaler
The cross validation accuracy 
[0.76694024 0.76820286 0.77272727 0.77977694 0.77209596]
Mean of the cross validation scores is:  0.7719486531986531
Standard dev of the cross validation scores is:  0.004495000724811765
classification accuracy on the validation set: 0.7735
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 3 minutes and 4.97 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~
Now fitting MLPClassifier OrdinalEncoder StandardScaler
~~~~~~~~~~~~~~~~~~~~~~~~~~




~~~~~~~~~~~~~~~~~~~~~~~~~~
MLPClassifier() OrdinalEncoder StandardScaler
The cross validation accuracy 
[0.75883838 0.77451599 0.7684133  0.76073232 0.76578283]
Mean of the cross validation scores is:  0.7656565656565657
Standard dev of the cross validation scores is:  0.005600727612727777
classification accuracy on the validation set: 0.7088
~~~~~~~~~~~~~~~~~~~~~~~~~~
Execution Time: 1 minutes and 8.82 seconds
