In [1]:
import os

In [2]:
import requests

In [3]:
import numpy as np
import pandas as  pd

## 2. Load Dataset

In [4]:
URL = "https://raw.githubusercontent.com/Sarthak-1408/Water-Potability/refs/heads/main/water_potability.csv"
DATASET_FILEPATH = "./water_potability.csv"

# Download the dataset only when no local copy exists, so re-running this cell
# does not hit the network again.
if not os.path.isfile(DATASET_FILEPATH):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=30)
    # Check if the download was successful
    if response.status_code != 200:
        raise Exception(f"Failed to download file. Status code: {response.status_code}")
    # Write to the same path that is checked above and read below, so the three
    # stay in sync if DATASET_FILEPATH is ever changed.
    with open(DATASET_FILEPATH, 'wb') as file:
        file.write(response.content)
    print("CSV file downloaded successfully.")

df = pd.read_csv(DATASET_FILEPATH)

## 3. Transform Columns

In [5]:
# Normalise every column label to lower case
df = df.rename(columns=str.lower)


In [6]:
# Snapshot of all column names (target included at this point)
COLUMNS = df.columns.tolist()
print(COLUMNS)

['ph', 'hardness', 'solids', 'chloramines', 'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes', 'turbidity', 'potability']


In [7]:
TARGET_COLUMN = 'potability'

# Fail fast if the label column is missing from the loaded frame.
assert TARGET_COLUMN in df.columns, f"missing target column: {TARGET_COLUMN}"

# Rebuild the feature-name list from the frame instead of calling
# COLUMNS.remove(), which raises ValueError when this cell is executed twice.
COLUMNS = [column for column in df.columns if column != TARGET_COLUMN]

In [8]:
# Preview the first rows (head() returns 5 rows by default)
df.head()

Unnamed: 0,ph,hardness,solids,chloramines,sulfate,conductivity,organic_carbon,trihalomethanes,turbidity,potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [9]:
# Print per-column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   hardness         3276 non-null   float64
 2   solids           3276 non-null   float64
 3   chloramines      3276 non-null   float64
 4   sulfate          2495 non-null   float64
 5   conductivity     3276 non-null   float64
 6   organic_carbon   3276 non-null   float64
 7   trihalomethanes  3114 non-null   float64
 8   turbidity        3276 non-null   float64
 9   potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


## 3. Fill missing values

In [10]:
# Replace every missing value with 0.
# NOTE(review): 0 is used as the fill value for all columns — confirm this is
# a sensible placeholder for each measurement before relying on the models.
df = df.fillna(0)

## 4. Create Dataset

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then take 25% of the remainder for
# validation (i.e. 20% of the full dataset).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Give each split a fresh 0..n-1 index.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the feature frame.
y_train = df_train.pop(TARGET_COLUMN).astype('int').values
y_val = df_val.pop(TARGET_COLUMN).astype('int').values
y_test = df_test.pop(TARGET_COLUMN).astype('int').values

In [12]:
from sklearn.feature_extraction import DictVectorizer

# Convert each split into a list of {column: value} records.
train_dict, val_dict, test_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val, df_test)
)

# Fit the vectorizer on the training records only; the validation and test
# records are transformed with that fitted vectorizer.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)



## 5. Training models

### 5.0. Utility functions

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [14]:
def evaluate_model(model, X_train, y_train, X_val, y_val, params):
    """Fit a scaled pipeline for one hyperparameter set and score it on validation data.

    Args:
        model: Estimator class (not an instance); it is instantiated as
            ``model(**params)``.
        X_train, y_train: Data the pipeline is fitted on.
        X_val, y_val: Data the fitted pipeline is scored on.
        params: Keyword arguments used to construct the estimator.

    Returns:
        Tuple ``(accuracy, pipeline)``: the validation accuracy and the fitted
        pipeline (scaler + estimator).
    """
    steps = [
        ('scaler', StandardScaler()),
        # The estimator step is always named 'logistic', whatever class is passed.
        ('logistic', model(**params)),
    ]
    fitted_pipeline = Pipeline(steps).fit(X_train, y_train)
    validation_accuracy = accuracy_score(y_val, fitted_pipeline.predict(X_val))
    return validation_accuracy, fitted_pipeline

In [29]:
from itertools import product

def find_best_model(Model, parameter_grid, X_train, y_train, X_val, y_val, extra_parameters=None, verbose=False):
    """Exhaustively search a hyperparameter grid and keep the best validation accuracy.

    Every combination in the Cartesian product of ``parameter_grid`` values is
    passed to ``evaluate_model``. Combinations for which fitting raises
    ``ValueError`` are skipped.

    Args:
        Model: Estimator class handed to ``evaluate_model``.
        parameter_grid: Mapping of parameter name -> iterable of candidate values.
        X_train, y_train: Training data.
        X_val, y_val: Validation data used to rank the combinations.
        extra_parameters: Optional mapping of fixed keyword arguments applied to
            every candidate. Grid values take precedence on a name clash.
            Defaults to no extra arguments.
        verbose: When true, print each combination, its accuracy, and the
            message of any skipped ``ValueError``.

    Returns:
        Tuple ``(best_model, best_params, best_accuracy)``. ``best_params`` holds
        only the grid values of the winning combination. If every combination
        is skipped the result is ``(None, None, -inf)``. On ties the first
        combination encountered is kept.
    """
    # Copy so that neither a shared default nor the caller's dict can be mutated.
    fixed_parameters = dict(extra_parameters) if extra_parameters else {}

    best_accuracy = -np.inf
    best_params = None
    best_model = None

    parameter_labels = list(parameter_grid.keys())

    for combination in product(*parameter_grid.values()):
        params = dict(zip(parameter_labels, combination))
        if verbose:
            print()
            print(params)

        # Evaluate this combination; invalid ones are skipped rather than
        # aborting the whole search.
        try:
            accuracy, model = evaluate_model(
                Model, X_train, y_train, X_val, y_val,
                {**fixed_parameters, **params},
            )
        except ValueError as ve:
            if verbose:
                print(ve)
            continue

        if verbose:
            print(f"accuracy: {accuracy}")

        # Keep the best model seen so far (strict '>' keeps the first on ties).
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_model = model

    return best_model, best_params, best_accuracy

In [30]:
from sklearn.linear_model import LogisticRegression

# Candidate values for each LogisticRegression hyperparameter. Combinations
# that raise ValueError during fitting are skipped inside find_best_model.
param_grid = dict(
    penalty=['l1', 'l2', 'elasticnet', None],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
    max_iter=[100, 1000, 2500, 5000],
)

best_model, best_params, val_accuracy = find_best_model(
    LogisticRegression, param_grid, X_train, y_train, X_val, y_val
)



In [31]:
# Print the outcome of the hyperparameter search
print("Mejores Hiperparámetros:")
for param in best_params:
    value = best_params[param]
    print(f"{param}: {value}")
print(f"\nAccuracy en Validación: {val_accuracy:.4f}")

Mejores Hiperparámetros:
penalty: l1
C: 0.001
solver: liblinear
max_iter: 100

Accuracy en Validación: 0.6336


In [32]:
# Evaluate the selected pipeline on the held-out test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Accuracy en Prueba: {test_accuracy:.4f}")

Accuracy en Prueba: 0.5686


In [33]:
# Additional evaluation details
from sklearn.metrics import (
    classification_report, 
    confusion_matrix
)

print("\nInforme de Clasificación:")
# zero_division=0 reports precision/recall/F1 as 0 for a class that is never
# predicted, without emitting an UndefinedMetricWarning for each such metric.
# The numbers are the same as with the default setting; only the warnings go.
print(classification_report(y_test, y_test_pred, zero_division=0))

print("\nMatriz de Confusión:")
print(confusion_matrix(y_test, y_test_pred))


Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.57      1.00      0.72       373
           1       0.00      0.00      0.00       283

    accuracy                           0.57       656
   macro avg       0.28      0.50      0.36       656
weighted avg       0.32      0.57      0.41       656


Matriz de Confusión:
[[373   0]
 [283   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 5.1. Configuring MLflow

In [17]:
# MLflow tracking setup, currently disabled: the whole snippet is wrapped in a
# string literal, so none of it is executed (the cell just echoes the string).
# NOTE(review): the snippet calls mlflow.set_experiment but only imports
# MlflowClient; no `import mlflow` is visible in this notebook — add one
# before re-enabling it.
"""
from mlflow.tracking import MlflowClient

TRACKING_SERVER_HOST = "localhost"
client = MlflowClient(f"http://{TRACKING_SERVER_HOST}:5000")

mlflow.set_experiment("wqm-exp-1")
"""

'\nfrom mlflow.tracking import MlflowClient\n\nTRACKING_SERVER_HOST = "localhost"\nclient = MlflowClient(f"http://{TRACKING_SERVER_HOST}:5000")\n\nmlflow.set_experiment("wqm-exp-1")\n'

### 5.2 Model 1: Logistic regression 

### 5.3 Model 2: Random Forest

In [24]:
# Hyperparameter search space for the Random Forest model
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

### 5.4. Model 3: Support Vector Machine

### 5.5 Model 4: Naive Bayes