In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
def cost_fun(x: pd.DataFrame, y: pd.Series, w: pd.Series, b: float) -> float:
    """
    Evaluate the halved mean squared error of a linear model.

    Predictions are formed as ``x.dot(w) + b``; the squared residuals
    against ``y`` are summed and scaled by ``1 / (2 * len(x))``.

    Args:
        x(pd.DataFrame): training data
        y(pd.Series): target values
        w(pd.Series): weights, combined with x through ``x.dot(w)``
        b(float): bias term added to every prediction

    Returns:
        cost(float): the cost of using w and b as parameters of the
            linear regression model
    """
    scale = 1 / (2 * len(x))
    residual = (x.dot(w) + b) - y
    return scale * np.sum(residual ** 2)

In [4]:
def gradient(x: pd.Series, y: pd.Series, w: pd.Series, b: float):
    """
    Compute the partial derivatives of the cost with respect to w and b.

    The residual ``x.dot(w) + b - y`` is projected back onto the features
    with ``x.T.dot(residual)`` and averaged over ``len(x)``.

    Args:
        x: training data. NOTE(review): annotated as pd.Series, but the
            ``x.T.dot(...)`` usage suggests a pd.DataFrame of features
            is intended -- confirm.
        y(pd.Series): target values
        w(pd.Series): model weights
        b(float): model bias

    Returns:
        dj_dw: derivative of the cost with respect to w
        dj_db: derivative of the cost with respect to b
    """
    # Original author's note: remember to modify in class
    n_rows = len(x)
    residual = (x.dot(w) + b) - y
    grad_w = x.transpose().dot(residual) / n_rows
    grad_b = np.sum(residual) / n_rows
    return grad_w, grad_b

In [7]:
def gradient_descent(
        x: pd.Series, y: pd.Series, w: float, b: float, alpha: float, num_iter: int, cost_fun, gradient
        ) -> tuple:
    """
    Performs gradient descent to fit w and b. Updates w and b
    by taking up to num_iter gradient steps with learning rate alpha.

    Stops early when both derivatives are numerically zero
    (``np.allclose(..., 0)``). Progress is printed roughly ten times per
    run and on the final iteration.

    Args:
        x: training data, passed through unchanged to ``gradient`` and
            ``cost_fun``
        y: target values, passed through unchanged to ``gradient`` and
            ``cost_fun``
        w: initial weight(s); a scalar or any array-like that supports
            ``w - alpha * dj_dw``. The caller's object is not modified.
        b(float): initial bias
        alpha(float): learning rate
        num_iter(int): maximum number of iterations
        cost_fun(function): called as ``cost_fun(x, y, w, b)`` to report cost
        gradient(function): called as ``gradient(x, y, w, b)``; must return
            ``(dj_dw, dj_db)``

    Returns:
        w, b: updated values of the parameters after running gradient descent
    """
    # Report about ten times per run. Clamp to 1 so that runs shorter than
    # ten iterations do not evaluate `i % 0` (ZeroDivisionError).
    log_every = max(1, num_iter // 10)
    for i in range(num_iter):
        dj_dw, dj_db = gradient(x, y, w, b)
        if np.allclose(dj_dw, 0) and np.allclose(dj_db, 0):
            print(f"Convergence at iteration {i:5}")
            break
        # Rebind rather than use `-=`: an in-place update would overwrite a
        # Series/ndarray that the caller passed in as the initial weights.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        if i % log_every == 0 or i == num_iter - 1:
            cost_i = cost_fun(x, y, w, b)
            print(f"Iteration {i:5}: Cost {cost_i}")
    return w, b

In [3]:
def zscore_normalization(x: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates z-score normalized values of each feature and instance.

    Each column is centred on its mean and divided by its standard
    deviation as computed by ``x.std(axis=0)``. A column whose standard
    deviation is exactly zero (a constant feature) is divided by 1
    instead, so it becomes all zeros rather than NaN.

    Args:
        x (pd.DataFrame): training data for normalization

    Returns:
        x_norm (pd.DataFrame): normalized training data
    """
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    # A constant feature has sigma == 0 and (x - mu) == 0, so dividing would
    # give 0/0 = NaN. Use a divisor of 1 for those features.
    if isinstance(sigma, pd.Series):
        sigma = sigma.where(sigma != 0, 1.0)
    elif np.ndim(sigma) == 0 and sigma == 0:
        sigma = 1.0
    x_norm = (x - mu) / sigma
    return x_norm

In [None]:
def preprocess_data(df):
    """
    Automatically detects categorical variables and applies one-hot encoding.
    Keeps numerical variables unchanged.

    Columns of dtype ``object`` are treated as categorical and encoded with
    ``OneHotEncoder(drop='first')``. When there are no such columns, a copy
    of the input is returned. Otherwise the result holds the numeric columns
    followed by the encoded columns; columns that are neither ``object`` nor
    numeric are not carried over.

    Args:
        df (pd.DataFrame): The DataFrame to preprocess.

    Returns:
        pd.DataFrame: A new DataFrame with one-hot encoded categorical variables
                      and original numerical variables.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    if categorical_cols.empty:
        # Return a copy so the result never aliases the caller's frame,
        # matching the encoded path, which always builds a new DataFrame.
        return df.copy()
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    # scikit-learn renamed `sparse` to `sparse_output` and later removed the
    # old keyword. Try the current name first and fall back for old releases.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop='first')
    encoded_cats = encoder.fit_transform(df[categorical_cols])

    # `get_feature_names` was likewise replaced by `get_feature_names_out`.
    if hasattr(encoder, "get_feature_names_out"):
        feature_names = encoder.get_feature_names_out(categorical_cols)
    else:
        feature_names = encoder.get_feature_names(categorical_cols)

    encoded_cats_df = pd.DataFrame(encoded_cats,
                                   columns=feature_names,
                                   index=df.index)
    df_processed = pd.concat([df[numerical_cols], encoded_cats_df], axis=1)

    return df_processed