In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from pycaret.clustering import load_model, predict_model

def clean_age(x: str) -> str:
    """Return ``x`` unchanged unless it looks like a non-numeric age.

    A value counts as non-numeric when it is purely alphabetic or contains a
    space or a hyphen; in that case the placeholder ``'0.00'`` is returned.
    """
    invalid = x.isalpha() or any(token in x for token in (' ', '-'))
    return '0.00' if invalid else x


def clean_df(df: pd.DataFrame, cols=['age', 'gender']) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with a numeric age and an encoded gender.

    The input frame is left untouched. Writing into ``df`` directly made the
    function non-idempotent: calling it twice on the same frame (for example
    by re-running a notebook cell) pushed the already encoded gender values
    through the ``{'F': 0, 'M': 1}`` mapping again and turned them all into NaN.

    Args:
        df: Frame containing the two columns named in ``cols``.
        cols: Two column names; ``cols[0]`` is the age column and ``cols[1]``
            the gender column. The default list is only read, never mutated.

    Returns:
        A new DataFrame in which the age column is ``float32`` (missing
        values, purely alphabetic entries and entries containing a space or a
        hyphen become 0) and the gender column is mapped with
        ``{'F': 0, 'M': 1}`` (any other value becomes NaN).
    """
    age_col, gender_col = cols[0], cols[1]
    cleaned = df.copy()

    def _sanitize(value: str) -> str:
        # Purely alphabetic text, or anything containing a space or a hyphen,
        # is replaced by the '0' placeholder before the float conversion.
        if value.isalpha() or ' ' in value or '-' in value:
            return '0'
        return value

    cleaned[age_col] = (
        cleaned[age_col]
        .fillna('0')
        .astype('str')
        .apply(_sanitize)
        .astype('float32')
    )
    cleaned[gender_col] = cleaned[gender_col].map({'F': 0, 'M': 1})
    return cleaned


def transform_outlier_mzscore(df: pd.DataFrame, params: dict, col: str = 'age') -> pd.DataFrame:
    """Replace modified-z-score outliers in ``col`` with the fitted median.

    The score is ``0.6745 * (x - median) / mad``. Rows whose absolute score
    exceeds ``params["threshold"]`` are overwritten with ``params["median"]``.

    Args:
        df: Input frame; it is copied, never modified.
        params: Mapping with the keys ``"median"``, ``"mad"`` and
            ``"threshold"``.
        col: Name of the numeric column to treat.

    Returns:
        A copy of ``df`` with the outliers in ``col`` replaced. When
        ``params["mad"]`` is 0 the score is undefined, so the copy is returned
        unchanged instead of dividing by zero (which would score every
        non-median value as infinite and overwrite it with the median).
    """
    df = df.copy()
    median = params["median"]
    mad = params["mad"]
    threshold = params["threshold"]
    if mad == 0:
        # No spread information: nothing can be classified as an outlier.
        return df
    data = df[col].values
    z_score = 0.6745 * (data - median) / mad
    outliers_mask = np.abs(z_score) > threshold
    df.loc[outliers_mask, col] = median
    return df


def transform_imputar_genero_ponderado(df: pd.DataFrame, distrib: dict, col: str = 'gender', random_state=None) -> pd.DataFrame:
    """Fill the null values of ``col`` by sampling from a given distribution.

    Args:
        df: Input frame; it is copied, never modified.
        distrib: Mapping ``value -> probability``. The probabilities are passed
            as-is to the sampler's ``p`` argument.
        col: Name of the column to impute.
        random_state: Optional seed (or ``numpy.random.Generator``) that makes
            the imputation reproducible. With the default ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        A copy of ``df`` in which every null entry of ``col`` has been replaced
        by a value drawn from ``distrib``. Non-null entries are left as they are.
    """
    df = df.copy()
    mask = df[col].isnull()
    if mask.any():
        values = list(distrib.keys())
        probs = [distrib[v] for v in values]
        n_nulls = mask.sum()
        # Both the legacy module and a Generator expose a compatible ``choice``.
        rng = np.random if random_state is None else np.random.default_rng(random_state)
        df.loc[mask, col] = rng.choice(values, size=n_nulls, p=probs)
    return df


def transform_feature_engineering(df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Apply the fitted feature-engineering steps to ``df``.

    Two steps run in order: outlier replacement driven by ``params["age"]``,
    then null imputation of the gender column driven by ``params["gender"]``.
    The result of the second step is returned.
    """
    without_outliers = transform_outlier_mzscore(df, params=params["age"])
    return transform_imputar_genero_ponderado(without_outliers, distrib=params["gender"])


def _scaler_columns(scaler, fallback: list) -> list:
    """Return the column names ``scaler`` was fitted on, or ``fallback`` if it recorded none."""
    names = getattr(scaler, "feature_names_in_", None)
    return list(fallback) if names is None else list(names)


def transform_scalers(df: pd.DataFrame, scalers: dict) -> pd.DataFrame:
    """Scale ``df`` with the already fitted scalers.

    ``scalers["standard"]`` and ``scalers["minmax"]`` are each applied to the
    columns they were fitted on, taken from their ``feature_names_in_``
    attribute. Selecting by name keeps the result correct even if the column
    order of ``df`` differs from the order seen at fit time. If a scaler does
    not expose ``feature_names_in_``, the previous positional convention is
    used: the column at position 2 for the standard scaler and every column
    from position 3 onwards for the min-max scaler.

    Args:
        df: Input frame; it is copied, never modified.
        scalers: Mapping with the keys ``"standard"`` and ``"minmax"`` whose
            values provide a ``transform`` method.

    Returns:
        A copy of ``df`` with the selected columns replaced by their scaled
        values; all other columns are left as they are.
    """
    df = df.copy()
    cols_standard = _scaler_columns(scalers["standard"], fallback=[df.columns[2]])
    cols_minmax = _scaler_columns(scalers["minmax"], fallback=list(df.columns[3:]))
    df[cols_standard] = scalers["standard"].transform(df[cols_standard])
    df[cols_minmax] = scalers["minmax"].transform(df[cols_minmax])
    return df

def remove_gradyear(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the ``gradyear`` column.

    If the column is absent, a plain copy is returned. The input frame is
    never modified.
    """
    column = "gradyear"
    result = df.copy()
    return result.drop(columns=[column]) if column in result.columns else result


def transform_variance_threshold(df: pd.DataFrame, selected_columns: list) -> pd.DataFrame:
    """Return an independent copy of ``df`` restricted to ``selected_columns``.

    The columns appear in the order given by ``selected_columns``.
    """
    subset = df[selected_columns]
    return subset.copy()


def predict_with_birch(data: pd.DataFrame, model_path: str = "../data/06_models/best_model_birch") -> pd.DataFrame:
    """Load the saved clustering model and run it on ``data``.

    Args:
        data: Prepared feature frame to pass to ``predict_model``.
        model_path: Location of the saved model, given without a file
            extension. The traceback recorded in this notebook shows the
            loader looking for ``<model_path>.pkl``. A relative path is
            resolved against the current working directory, so pass an
            explicit path when running from elsewhere.

    Returns:
        Whatever ``predict_model`` returns for ``data``. Later cells read a
        ``Cluster`` column from it.
    """
    model = load_model(model_path)
    # Printed so the column set given to the model can be checked by eye.
    print("Columnas antes de predecir:", data.columns)
    predictions = predict_model(model, data=data)
    return predictions

In [30]:
# Load the training split from the primary data layer; it is only inspected in the next cells.
preprocessed_train_data = pd.read_parquet("../data/03_primary/train_data.parquet")

In [31]:
preprocessed_train_data.columns

Index(['gradyear', 'gender', 'age', 'NumberOffriends', 'basketball',
       'football', 'soccer', 'softball', 'volleyball', 'swimming',
       'cheerleading', 'baseball', 'tennis', 'sports', 'cute', 'sex', 'sexy',
       'hot', 'kissed', 'dance', 'band', 'marching', 'music', 'rock', 'god',
       'church', 'jesus', 'bible', 'hair', 'dress', 'blonde', 'mall',
       'shopping', 'clothes', 'hollister', 'abercrombie', 'die', 'death',
       'drunk', 'drugs'],
      dtype='object')

In [32]:
preprocessed_train_data['drugs']

2683     0
1264     0
14649    0
3765     0
2474     0
        ..
5279     0
13651    0
5483     0
867      1
7385     0
Name: drugs, Length: 11787, dtype: int64

In [33]:
# Load the test split from the primary data layer; the transforms below are applied to it.
test_data = pd.read_parquet("../data/03_primary/test_data.parquet")

In [38]:
test_data.columns

Index(['gradyear', 'gender', 'age', 'NumberOffriends', 'basketball',
       'football', 'soccer', 'softball', 'volleyball', 'swimming',
       'cheerleading', 'baseball', 'tennis', 'sports', 'cute', 'sex', 'sexy',
       'hot', 'kissed', 'dance', 'band', 'marching', 'music', 'rock', 'god',
       'church', 'jesus', 'bible', 'hair', 'dress', 'blonde', 'mall',
       'shopping', 'clothes', 'hollister', 'abercrombie', 'die', 'death',
       'drunk', 'drugs'],
      dtype='object')

In [39]:
# Convert age to float32 and encode gender (F -> 0, M -> 1) with clean_df.
clean_test_data = clean_df(test_data)

In [40]:
# Feature-engineering parameters; transform_feature_engineering reads the "age" and "gender" entries.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
fe_params = pd.read_pickle("../data/04_feature/fe_params.pkl")

In [41]:
# Apply age outlier replacement and weighted gender imputation using the loaded parameters.
fe_test_data = transform_feature_engineering(clean_test_data, fe_params)

In [42]:
fe_test_data

Unnamed: 0,gradyear,gender,age,NumberOffriends,basketball,football,soccer,softball,volleyball,swimming,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
5761,2007,1.0,16.862000,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13346,2008,0.0,16.862000,24,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4098,2008,1.0,17.656000,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3503,2009,0.0,15.548000,16,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
14832,2007,0.0,17.650999,54,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2656,2008,0.0,16.136999,23,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5546,2009,1.0,15.510000,80,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5650,2006,1.0,18.034000,38,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10754,2006,0.0,19.190001,25,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [43]:
# Scalers used by transform_scalers, which reads the "standard" and "minmax" entries.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
scaler_params = pd.read_pickle("../data/04_feature/scalers_params.pkl")

In [47]:
scaler_params['minmax'].feature_names_in_

array(['NumberOffriends', 'basketball', 'football', 'soccer', 'softball',
       'volleyball', 'swimming', 'cheerleading', 'baseball', 'tennis',
       'sports', 'cute', 'sex', 'sexy', 'hot', 'kissed', 'dance', 'band',
       'marching', 'music', 'rock', 'god', 'church', 'jesus', 'bible',
       'hair', 'dress', 'blonde', 'mall', 'shopping', 'clothes',
       'hollister', 'abercrombie', 'die', 'death', 'drunk', 'drugs'],
      dtype=object)

In [49]:
# Sanity check: names that appear in only one of the two sets (test columns vs. min-max scaler features).
set(test_data.columns).symmetric_difference(set(scaler_params['minmax'].feature_names_in_))

{'age', 'gender', 'gradyear'}

In [45]:
# Scale the engineered test features with the loaded scalers.
scaled_test_data = transform_scalers(fe_test_data, scaler_params)

In [20]:
scaled_test_data

Unnamed: 0,gradyear,gender,age,NumberOffriends,basketball,football,soccer,softball,volleyball,swimming,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
5761,2007,0.0,-0.302257,0.003306,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
13346,2008,1.0,-0.302257,0.039669,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
4098,2008,0.0,0.428774,0.000000,0.000000,0.0,0.136364,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
3503,2009,1.0,-1.512046,0.026446,0.045455,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.083333,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
14832,2007,0.0,0.424169,0.089256,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2656,2008,0.0,-0.969759,0.038017,0.000000,0.0,0.000000,0.0,0.0,0.083333,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
5546,2009,0.0,-1.547032,0.132231,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
5650,2006,1.0,0.776796,0.062810,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
10754,2006,0.0,1.841117,0.041322,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.142857,0.000,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Drop the gradyear column (if present) from the scaled frame.
scaled_test_data_no_gradyear = remove_gradyear(scaled_test_data)

In [13]:
# Names of the columns to keep; passed to transform_variance_threshold in the next cell.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
selected_columns = pd.read_pickle("../data/05_model_input/selected_feature_names.pkl")

In [14]:
# Keep only the selected columns, in the order stored in selected_columns.
fs_test_data = transform_variance_threshold(scaled_test_data_no_gradyear, selected_columns)

In [15]:
fs_test_data

Unnamed: 0,gender,age,NumberOffriends,basketball,football,soccer,softball,volleyball,swimming,cheerleading,...,dress,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
5761,0.0,-0.302257,0.003306,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
13346,1.0,-0.302257,0.039669,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
4098,0.0,0.428774,0.000000,0.000000,0.0,0.136364,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
3503,1.0,-1.512046,0.026446,0.045455,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.083333,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
14832,0.0,0.424169,0.089256,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2656,0.0,-0.969759,0.038017,0.000000,0.0,0.000000,0.0,0.0,0.083333,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
5546,0.0,-1.547032,0.132231,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
5650,1.0,0.776796,0.062810,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
10754,0.0,1.841117,0.041322,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.142857,0.000,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Assign clusters to the prepared test data with the saved model.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError; the model path is relative
# to the working directory, so confirm where the notebook is launched from.
predictions = predict_with_birch(fs_test_data)

FileNotFoundError: [Errno 2] No such file or directory: '../data/06_models/best_model_birch.pkl'

In [94]:
# Number of rows assigned to each cluster.
# NOTE(review): execution counts in this notebook are out of order, so this output presumably comes
# from a different run than the failed prediction cell above; verify with Restart & Run All.
predictions['Cluster'].value_counts()

Cluster
Cluster 0    2940
Cluster 1       7
Name: count, dtype: int64

In [38]:
# Load the saved model directly (path given without extension) to check that it can be found.
model = load_model("../data/06_models/best_model_birch")

Transformation Pipeline and Model Successfully Loaded


In [58]:
fs_test_data['drugs']

5761     0.0
13346    0.0
4098     0.0
3503     0.0
14832    0.0
        ... 
2656     0.0
5546     0.0
5650     0.0
10754    0.0
3261     0.0
Name: drugs, Length: 2947, dtype: float64

In [18]:
fs_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2947 entries, 5761 to 3261
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           2947 non-null   float64
 1   age              2947 non-null   float32
 2   NumberOffriends  2947 non-null   float64
 3   basketball       2947 non-null   float64
 4   football         2947 non-null   float64
 5   soccer           2947 non-null   float64
 6   softball         2947 non-null   float64
 7   volleyball       2947 non-null   float64
 8   swimming         2947 non-null   float64
 9   cheerleading     2947 non-null   float64
 10  baseball         2947 non-null   float64
 11  tennis           2947 non-null   float64
 12  sports           2947 non-null   float64
 13  cute             2947 non-null   float64
 14  sexy             2947 non-null   float64
 15  hot              2947 non-null   float64
 16  dance            2947 non-null   float64
 17  band            