# Files and imports

In [1]:
from pandas import read_csv, DataFrame
import pandas as pd

# Location of the CSV file loaded below.
filepath = r'dataset/df_msg_data_prof.csv'
# Short tag for this dataset (a later cell uses `file_tag` in a chart title).
file_tag = "ecom_msg"


# Load the CSV into a DataFrame; the empty string is passed as an extra NA marker.
data: DataFrame = read_csv(filepath, na_values="")

# Data prep and transformation

## Remove columns

In [None]:
# Drop the identifier columns: they are not relevant for predicting the class.

data = data.drop(columns=['message_id', 'id_msg', 'campaign_id','client_id'])

In [None]:
# Drop the columns dominated by missing values (MV), i.e. those where around
# 90% or more of the values are missing.

data = data.drop(columns=["category", "blocked_at", "complained_at", "soft_bounced_at", "purchased_at", "hard_bounced_at",
                          "clicked_first_time_at", "clicked_last_time_at", "unsubscribed_at", "platform"])

## Split 'date' column

In [4]:
import pandas as pd

# Split the 'date' column to get more detailed information on the:
# - day_of_month
# - day_of_week
# - is_weekend
# - time_of_day
# - hour
# - min
# As the dataset only has data from a specific month of 2021, we don't need to extract the year or quarter.

# Parse the 'date' column into timezone-aware (UTC) datetimes.
data['date'] = pd.to_datetime(data['date'], utc=True)

# Derive calendar features from 'date' with the vectorized .dt accessor
# instead of calling a Python lambda once per row.
data['week_of_month'] = (data['date'].dt.day - 1) // 7 + 1  # days 1-7 -> 1, 8-14 -> 2, ...
# dt.weekday numbers Monday=0 ... Sunday=6, so values >= 5 are Saturday/Sunday.
# Rows whose comparison is False (including missing dates) are labelled "weekday".
data['is_weekend'] = data['date'].dt.weekday.ge(5).map({True: "weekend", False: "weekday"})
data['day_of_week'] = data['date'].dt.day_name()  # weekday name, e.g. "Monday"
data['day_of_month'] = data['date'].dt.day
data['hour'] = data['date'].dt.hour
data['min'] = data['date'].dt.minute

# Create function to get time of the day
def categorize_time_of_day(hour):
    """Return the part-of-day label for an hour of the day.

    Hours in [5, 12) are 'Morning', [12, 17) 'Afternoon', [17, 21) 'Evening';
    any other value falls through to 'Night'.
    """
    # Half-open [start, stop) ranges, checked in order.
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    return 'Night'

# Label every row with its part of the day, derived from the 'hour' column.
data['time_of_day'] = data['hour'].apply(categorize_time_of_day)

# Extract the month from the 'date' column
data['month'] = data['date'].dt.month_name()  # Get the full month name
# Alternatively, to get the month number:
# data['month'] = data['date'].dt.month

# Display the updated DataFrame
data.head()


Unnamed: 0,id,message_id,campaign_id,message_type,client_id,channel,category,platform,email_provider,stream,...,purchased_at_month,purchased_at_day,week_of_month,is_weekend,day_of_week,day_of_month,hour,min,time_of_day,month


# Processing each task:

In [None]:
from numpy import ndarray
from pandas import DataFrame, read_csv
from matplotlib.pyplot import savefig, show, figure
from dslabs_functions import plot_multibar_chart, CLASS_EVAL_METRICS, run_NB, run_KNN


def evaluate_approach(
    train: DataFrame, test: DataFrame, target: str = "class", metric: str = "accuracy"
) -> dict[str, list]:
    """Run Naive Bayes and KNN on a train/test split and collect their metrics.

    The target column is read without being removed, so ``train`` and ``test``
    are left untouched and can be passed to this function again.

    Args:
        train: Training set, including the ``target`` column.
        test: Test set, including the ``target`` column.
        target: Name of the class column.
        metric: Forwarded to ``run_NB`` and ``run_KNN`` as their ``metric`` argument.

    Returns:
        A dict mapping each name in ``CLASS_EVAL_METRICS`` to the pair
        ``[NB value, KNN value]``; empty when either run returns an empty dict.

    Raises:
        KeyError: If ``target`` is not a column of ``train`` or ``test``.
    """
    # Split features from the target without mutating the caller's frames
    # (DataFrame.pop would delete the column in place).
    trnY = train[target].values
    trnX: ndarray = train.drop(columns=[target]).values
    tstY = test[target].values
    tstX: ndarray = test.drop(columns=[target]).values

    eval_NB: dict[str, float] = run_NB(trnX, trnY, tstX, tstY, metric=metric)
    eval_KNN: dict[str, float] = run_KNN(trnX, trnY, tstX, tstY, metric=metric)

    # Named `results` rather than `eval` so the builtin is not shadowed.
    results: dict[str, list] = {}
    if eval_NB and eval_KNN:
        for met in CLASS_EVAL_METRICS:
            results[met] = [eval_NB[met], eval_KNN[met]]
    return results


# NOTE(review): these names point at a "stroke" dataset, while the file loaded in the
# first cell of this notebook is tagged "ecom_msg" — confirm the intended train/test files.
# Reassigning `file_tag` here also replaces the value set in that first cell.
target = "stroke"
file_tag = "stroke"
train: DataFrame = read_csv("data/stroke_train.csv")
test: DataFrame = read_csv("data/stroke_test.csv")

# Evaluate NB and KNN on the split and pass the results to plot_multibar_chart.
figure()
# NOTE: the name `eval` shadows the Python builtin of the same name.
eval: dict[str, list] = evaluate_approach(train, test, target=target, metric="recall")
plot_multibar_chart(
    ["NB", "KNN"], eval, title=f"{file_tag} evaluation", percentage=True
)
# Saving the figure to disk is currently disabled.
#savefig(f"./{file_tag}_eval.png")
show()

# Variable Encoding

## Ordinal Encoding


In [None]:
from pandas import read_csv, DataFrame
from dslabs_functions import get_variable_types, encode_cyclic_variables, dummify

# NOTE(review): this reloads the raw CSV, so the column drops and date features
# applied to `data` in earlier cells are not carried over — confirm this is intended.
data: DataFrame = read_csv(filepath, index_col="id", na_values="")
vars: dict[str, list] = get_variable_types(data)

# Textual booleans (either capitalisation) mapped to 0/1.
true_false: dict[str, int] = {"false": 0, "False": 0, "true": 1, "True": 1}

# Every flag column shares the same value -> code mapping.
# NOTE: a {"bulk": 0, "trigger": 1} mapping for message_type was drafted but is disabled.
encoding: dict[str, dict[str, int]] = {
    column: true_false
    for column in (
        "is_opened",
        "is_clicked",
        "is_unsubscribed",
        "is_hard_bounced",
        "is_soft_bounced",
        "is_complained",
        "is_purchased",
        "subject_with_personalization",
        "subject_with_deadline",
        "subject_with_emoji",
        "subject_with_bonuses",
        "subject_with_discount",
        "subject_with_saleout",
    )
}

# replace() returns a new DataFrame by default; `data` itself stays as loaded.
df: DataFrame = data.replace(to_replace=encoding)
df.head()

## Cyclic variables