In [1]:
import math
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

random_seed = 33

In [2]:
df = sns.load_dataset('titanic')

# Separación en conjuntos

In [3]:
train_val, test = train_test_split(df, test_size = 0.2, random_state = random_seed)
train, val = train_test_split(train_val, test_size = 0.1, random_state = random_seed)

x_train, y_train = train.drop(columns=["survived"]), train["survived"]
x_train_val, y_train_val = train_val.drop(columns=["survived"]), train_val["survived"]
x_val, y_val = val.drop(columns=["survived"]), val["survived"]
x_test, y_test = test.drop(columns=["survived"]), test["survived"]

# Prepro

In [4]:
def binary_categorizer(dataframe, column, code_map: dict = None, cols: int = None):
  # resultados
  result = []

  # puede ser que me obliguen a que haya un número determinado de columnas
  if not cols:
    cols = math.ceil(math.log2(len(dataframe[column].unique()))) # aplico la fórmula de log_2_n y lo aproximo al número más grande

  # puede ser que no se conozca el mapa y tenga que inferirlo
  if not code_map:
    code_map = {value: key for key, value in enumerate(dataframe[column].unique())} # creo el mapa de forma genérica si no existe

  # realizo la codificación a binario, comprobando que puedo hacerlo para todos los valores
  for value in dataframe[column]:
    code = code_map[value] # recojo el código asignado
    b_code = format(code, "b") # lo convierto a binario

    if len(b_code) > cols: # cols no puede ser más pequeño que el código
      raise Exception(f"El número de columnas ({cols}) es demasiado pequeño para empaquetar la información ({len(b_code)}). Modifica el valor del atributo cols.")

    # realizo la codificación
    b_code_a = b_code.rjust(cols, "0") # lo formateo hasta tamaño cols rellenando con 0
    _value = list(b_code_a) # lo convierto a lista: cada elemento en una posición diferente 00 -> ["0", "0"]
    result.append(list(map(lambda v: int(v), _value))) # convierto la lista en una lista de enteros ["0", "0"] -> [0, 0]

  # defino las nuevas columnas y las añado a mi df
  new_columns_name = [f"{column}_{i}" for i in range(len(list(result[0])))] # les daré nombre a las nuevas columnas
  result_df = pd.DataFrame(result, index=dataframe.index, columns=new_columns_name) # creo un nuevo df con los resultados
  dataframe = pd.concat([dataframe, result_df], axis=1) # lo añado en el eje X respetando el orden
  return dataframe.drop(columns=[column]), code_map # también devuelvo el mapa de códigos, me será útil

In [5]:
def data_cleaner(df, other):
    # columnas sin sentido
    df = df.drop(columns=["pclass", "sibsp", "parch", "embarked", "who", "adult_male", "deck", "alive"])
    other = other.drop(columns=["pclass", "sibsp", "parch", "embarked", "who", "adult_male", "deck", "alive"])

    # nulos
    input = df.age.to_numpy().reshape(-1, 1)
    imputer = SimpleImputer().fit(input)
    df["age"] = imputer.transform(input)
    other["age"] = imputer.transform(other.age.to_numpy().reshape(-1, 1))

    df["embark_town"] = df["embark_town"].apply(lambda v: v if v is not None else "Unknown")
    other["embark_town"] = other["embark_town"].apply(lambda v: v if v is not None else "Unknown")

    # codificacion
    df.sex = df.sex.apply(lambda v: 0 if v == "male" else 1 if v == "female" else v)
    other.sex = other.sex.apply(lambda v: 0 if v == "male" else 1 if v == "female" else v)

    df["class"] = df["class"].apply(lambda v: 1 if v == "First" else 2 if v == "Second" else 3 if v == "Third" else v)
    other["class"] = other["class"].apply(lambda v: 1 if v == "First" else 2 if v == "Second" else 3 if v == "Third" else v)

    df, embark_codemap = binary_categorizer(df, "embark_town")
    other, _ = binary_categorizer(other, "embark_town", embark_codemap)

    df.alone = df.alone.apply(lambda v: int(v))
    other.alone = other.alone.apply(lambda v: int(v))

    # estandarización
    scaler = StandardScaler().fit(df)
    df = scaler.transform(df)
    other = scaler.transform(other)

    # PCA
    pca = PCA(n_components = 0.90).fit(df)
    df = pca.transform(df)
    other = pca.transform(other)

    return df, other

In [6]:
c_train, c_val = data_cleaner(x_train, x_val)
c_train_val, c_test = data_cleaner(x_train_val, x_test)

# Modelos

In [7]:
# prompt: Create a method to evaluate accuracy score given y_true and y_pred
def accuracy_score(y_true, y_pred):
  return np.mean(y_true == y_pred)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [9]:
model = KNeighborsClassifier()
model = model.fit(c_train, y_train)
pred_train = model.predict(c_train)
pred_val = model.predict(c_val)

accuracy_score(y_train, pred_train), accuracy_score(y_val, pred_val)

(0.8578125, 0.75)

In [10]:
# prompt: Use gridsearch to optimize the knn model

import numpy as np
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': range(1, 50, 2),  # Example values for n_neighbors
    'weights': ['uniform', 'distance'],  # Example values for weights
    'metric': ['euclidean', 'manhattan']  # Example values for metric
}

# Create a KNN classifier
model = KNeighborsClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the data
grid_search.fit(c_train, y_train)

In [11]:
# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Train a new model with the best parameters
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(c_train, y_train)

# Make predictions using the best model
pred_train = best_knn.predict(c_train)
pred_val = best_knn.predict(c_val)

# Evaluate the best model
accuracy_score(y_train, pred_train), accuracy_score(y_val, pred_val)

Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}


(0.8828125, 0.75)

In [12]:
knn2 = KNeighborsClassifier(n_neighbors=21, weights='distance', metric='manhattan').fit(c_train, y_train)
knn3 = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='manhattan').fit(c_train, y_train)

In [13]:
def ensemble(arr, labels):
    preds = []
    for row in arr:
        pred1 = best_knn.predict_proba([row])
        pred2 = knn2.predict_proba([row])
        pred3 = knn3.predict_proba([row])

        score = 0
        count = 0
        for pred in [pred1, pred2, pred3]:
            if np.max(pred) >= 0.8:
                score += np.argmax(pred)
                count += 1

        if count == 0:
            preds.append(np.mean(np.array([pred1, pred2, pred3])).astype(int))
        else:
            preds.append(round(score / count, 0))

    return preds

In [14]:
preds = ensemble(c_train, y_train)
accuracy_score(y_train, preds)

0.98125

In [15]:
preds = ensemble(c_val, y_val)
accuracy_score(y_val, preds)

0.7916666666666666