In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer # type: ignore

## load data

In [6]:
def load_data(filename, na_values=None):
    """Read a CSV file into a DataFrame, announcing the load on stdout.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    na_values : list, optional
        Extra strings to interpret as missing values. When omitted, the
        empty string and a single space are used.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.
    """
    print(f"Loading data from {filename}...")

    # Fall back to the default markers only when the caller passed nothing.
    missing_markers = ["", " "] if na_values is None else na_values
    return pd.read_csv(filename, na_values=missing_markers)

## preprocess data

In [5]:
def preprocess_data(df, feature_columns, column_weights=None):
    """Clean the ``duration`` column and multi-hot encode comma-separated columns.

    NOTE: ``df`` is modified in place (and also returned): ``duration`` is
    replaced by a float column and every column in ``feature_columns`` is
    replaced by its split form (a list of tokens, or ``""`` where the value
    was missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. If it has a ``duration`` column, the first run of digits
        in each value is extracted as a float and missing values are filled
        with the column mean. A ``duration`` column that is already numeric
        is only mean-filled.
    feature_columns : iterable of str
        Columns holding comma-separated labels. Each one is split on ``","``
        and encoded with its own ``MultiLabelBinarizer``.
    column_weights : dict, optional
        Maps a column name to the factor its encoded block is multiplied by.
        Columns not listed use ``1.0``.

    Returns
    -------
    tuple
        ``(df, features_matrix, mlbs)`` where ``features_matrix`` is the
        horizontal concatenation of the weighted encodings (shape
        ``(len(df), 0)`` when ``feature_columns`` is empty) and ``mlbs`` maps
        each column name to its fitted binarizer.
    """
    if column_weights is None:
        column_weights = {}

    mlbs = {}
    feature_matrices = []

    # Handle duration column if it exists
    if "duration" in df.columns:
        duration = df["duration"]
        if pd.api.types.is_numeric_dtype(duration):
            # Already numeric (cell re-run on the same frame, or an all-empty
            # CSV column parsed as float): the .str accessor would raise here.
            duration = duration.astype(float)
        else:
            # Raw string so "\d" is a regex class rather than an invalid string
            # escape; expand=False yields a Series instead of a 1-col DataFrame.
            duration = duration.str.extract(r"(\d+)", expand=False).astype(float)
        # Assign the filled result back explicitly: fillna(inplace=True) on a
        # column selection is chained assignment and is a no-op under
        # pandas Copy-on-Write.
        df["duration"] = duration.fillna(duration.mean())
        print("Duration column processed.")

    for col in feature_columns:
        print(f"Encoding {col}...")

        # Missing entries become "" (an empty iterable), so the binarizer
        # emits an all-zero row for them instead of failing on NaN.
        df[col] = df[col].str.split(",").fillna("")

        mlb = MultiLabelBinarizer()
        encoded_data = mlb.fit_transform(df[col])

        # Apply column-specific weights if provided
        weight = column_weights.get(col, 1.0)
        feature_matrices.append(encoded_data * weight)
        mlbs[col] = mlb
        print(f"{col} encoded.")

    if feature_matrices:
        features_matrix = np.hstack(feature_matrices)
    else:
        # np.hstack([]) raises ValueError; return a zero-width matrix instead.
        features_matrix = np.empty((len(df), 0))
    print("Feature matrix created.")

    return df, features_matrix, mlbs
