# Encoding Categorical and Temporal Data

## Setup

In [None]:
#import libraries
import pandas as pd
import numpy as np
import janitor
from sklearn.preprocessing import OneHotEncoder

In [None]:
#load the post-imputation csv, parsing the datetime column as timestamps
imputed_path = "../data/df_after_imputation.csv"
df = pd.read_csv(imputed_path, parse_dates=["datetime"])

In [None]:
#keep an un-encoded copy of the feature columns for later interpretability
raw_features = [
    "area",
    "vict_sex",
    "vict_descent",
    "weapon_group",
    "crime_group",
    "premises_group",
    "hour",
    "day",
    "month",
]
df_raw = df.loc[:, raw_features].copy()

#write the snapshot to parquet, keeping the row index
df_raw.to_parquet("../data/df_raw.parquet", index=True)

In [None]:
#tidy df ready for encoding: drop the columns that are not used as features,
#then reorder the remaining columns
dropped_cols = ["date", "time_str", "lat", "lon", "time", "vict_age", "datetime"]
feature_order = [
    "area",
    "vict_sex",
    "vict_descent",
    "weapon_group",
    "crime_group",
    "premises_group",
    "hour",
    "day",
    "month",
    "is_child",
]
df = df.drop(columns=dropped_cols).reorder_columns(feature_order)

#cast boolean to int
df = df.astype({"is_child": int})

## One-Hot Encoding 

I will use one-hot encoding to prepare the categorical variables to be used as features in the model (Rojo-Echeburúa, 2024).

In [None]:
#get categorical variables (single list reused for selection and feature naming)
cat_cols = ["area", "vict_sex", "vict_descent", "weapon_group", "crime_group", "premises_group"]
X = df[cat_cols]

#instantiate and fit encoder
#drop="if_binary" keeps a single column for two-category variables;
#handle_unknown="error" makes categories unseen at fit time raise on transform
oh_encoder = OneHotEncoder(drop="if_binary", handle_unknown="error")
oh = oh_encoder.fit_transform(X).toarray()

#get features into df
#reuse X's index: the later column-wise concat aligns on index labels, so a
#fresh RangeIndex here would misalign rows if df's index is not 0..n-1
feature_names = oh_encoder.get_feature_names_out(cat_cols)
df_oh = pd.DataFrame(oh, columns=feature_names, index=X.index)

#code adapted from Rojo-Echeburúa (2024)

In [None]:
#normalise the encoded column names, then shorten them by collapsing the
#"_group_" infix to a single underscore
df_oh = df_oh.clean_names()
df_oh = df_oh.rename(columns=lambda name: name.replace("_group_", "_"))

#display the resulting names for a visual check
df_oh.columns

In [None]:
#split off the columns that were not one-hot encoded and join them,
#column-wise, with the encoded features
encoded_source_cols = [
    "area",
    "vict_sex",
    "vict_descent",
    "weapon_group",
    "crime_group",
    "premises_group",
]
non_cat = df.drop(columns=encoded_source_cols)
df = pd.concat([non_cat, df_oh], axis="columns")

In [None]:
#preview the first rows of the combined df
df.head(n=5)

## Cyclical Encoding

Next I will use cyclical (sine/cosine) encoding so that the model can see that the temporal variables wrap around, e.g. that hour 23 is adjacent to hour 0 (Pelletier, 2024).

In [None]:
#get sin and cos for hour of day, day of week, and month of year
#each entry maps a column to (value subtracted before scaling, cycle length)
cycles = {
    "hour": (0, 24),
    "day": (0, 7),
    "month": (1, 12),
}
for col, (start, period) in cycles.items():
    #map the value onto an angle around the unit circle
    angle = 2 * np.pi * (df[col] - start) / period
    df[f"{col}_sin"] = np.sin(angle)
    df[f"{col}_cos"] = np.cos(angle)

#code adapted from Pelletier (2024)

In [None]:
#remove the raw temporal columns now that their sin/cos pairs exist
temporal_cols = ["hour", "day", "month"]
df = df.drop(temporal_cols, axis=1)

In [None]:
#preview the first rows of the fully encoded df
df.head(n=5)

## Dataframe Export

In [None]:
#export of the encoded df is currently disabled; uncomment the line below to write it to csv
# df.to_csv("../data/df_final.csv", index=False, encoding="utf-8")