# Files and imports

In [21]:
from pandas import read_csv, DataFrame
import pandas as pd

#get file from data pre processing notebook. This file has all the column we need to start the data profiling phase
filepath = r'dataset/df_msg_pre_proc.csv'

file_tag = 'df_msg_pre_proc'

data = pd.read_csv(filepath, na_values="")

In [22]:
%run "dslabs_functions.py"

dslabs_functions loaded


# Processing each task:

In [None]:
from numpy import ndarray
from pandas import DataFrame, read_csv
from matplotlib.pyplot import savefig, show, figure
from dslabs_functions import plot_multibar_chart, CLASS_EVAL_METRICS, run_NB, run_KNN


def evaluate_approach(
    train: DataFrame, test: DataFrame, target: str = "class", metric: str = "accuracy"
) -> dict[str, list]:
    trnY = train.pop(target).values
    trnX: ndarray = train.values
    tstY = test.pop(target).values
    tstX: ndarray = test.values
    eval: dict[str, list] = {}

    eval_NB: dict[str, float] = run_NB(trnX, trnY, tstX, tstY, metric=metric)
    eval_KNN: dict[str, float] = run_KNN(trnX, trnY, tstX, tstY, metric=metric)
    if eval_NB != {} and eval_KNN != {}:
        for met in CLASS_EVAL_METRICS:
            eval[met] = [eval_NB[met], eval_KNN[met]]
    return eval


target = "stroke"
file_tag = "stroke"
train: DataFrame = read_csv("data/stroke_train.csv")
test: DataFrame = read_csv("data/stroke_test.csv")

figure()
eval: dict[str, list] = evaluate_approach(train, test, target=target, metric="recall")
plot_multibar_chart(
    ["NB", "KNN"], eval, title=f"{file_tag} evaluation", percentage=True
)
#savefig(f"./{file_tag}_eval.png")
show()

# Variable Encoding

## Ordinal Encoding


### Binary

In [23]:
from pandas import read_csv, DataFrame
from dslabs_functions import get_variable_types, encode_cyclic_variables, dummify

data: DataFrame = read_csv(filepath)
vars: dict[str, list] = get_variable_types(data)

true_false: dict[str, int] = {"false": 0, "False": 0, "f": 0, "true": 1, "True": 1, "t": 1}
channel_msg_values: dict[str, int] = {"mobile_push": 0, "email": 1}
time_of_day_values: dict[str, int] = {"Morning": 0, "Afternoon": 1}

encoding: dict[str, dict[str, int]] = {
    "is_opened": true_false,
    "is_clicked": true_false,
    "is_unsubscribed": true_false,
    "is_hard_bounced": true_false,
    "is_soft_bounced": true_false,
    "is_complained": true_false,
    #"is_blocked": true_false,  # Added
    "is_purchased": true_false,
    "subject_with_personalization": true_false,
    "subject_with_deadline": true_false,
    "subject_with_emoji": true_false,
    "subject_with_bonuses": true_false,
    "subject_with_discount": true_false,
    "subject_with_saleout": true_false,
    "is_weekend": true_false,  # Added
    "time_of_day": true_false,  # Added
    "channel_msg": true_false  # Added
}
data: DataFrame = data.replace(encoding, inplace=False)
data.head()

Unnamed: 0,channel_msg,platform,email_provider,date,sent_at,is_opened,opened_first_time_at,opened_last_time_at,is_clicked,clicked_first_time_at,...,subject_with_discount,subject_with_saleout,week_of_month,is_weekend,day_of_week_nr,day_of_month,month,hour,min,time_of_day
0,mobile_push,,,2021-04-30,2021-04-30 07:22:39+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,April,7,22,Morning
1,mobile_push,,,2021-04-30,2021-04-30 07:22:40+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,April,7,22,Morning
2,mobile_push,,,2021-04-30,2021-04-30 07:22:41+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,April,7,22,Morning
3,mobile_push,,,2021-04-30,2021-04-30 07:22:42+00:00,1,2021-04-30 07:22:44,2021-04-30 07:22:44,0,NaT,...,False,False,5,False,4,30,April,7,22,Morning
4,mobile_push,,,2021-04-30,2021-04-30 07:22:43+00:00,1,2021-04-30 07:22:46,2021-04-30 07:22:46,0,NaT,...,False,False,5,False,4,30,April,7,22,Morning


### Symbolic

In [24]:
for v in vars["symbolic"]:
    print(v, data[v].unique())

channel_msg ['mobile_push' 'email' 'web_push']
platform [nan 'desktop' 'smartphone' 'tablet' 'phablet']
email_provider [nan 'gmail.com' 'mail.ru' 'rambler.ru' 'yandex.ru' 'list.ru' 'ro.ru'
 'inbox.ru' 'bk.ru' '59.ru' 'icloud.com' 'nm.ru' 'ya.ru' 'mail.ua'
 'ngs.ru' 'internet.ru' 'private' 'narod.ru' 'phystech.edu' 'sochi.com'
 'auchan.ru' 'land.ru' '66.ru' 'yahoo.com' 'aaanet.ru' 'omgau.org' 'e1.ru'
 'ukr.net' 'tut.by' 'yandex.by' 'lenta.ru' 'phkp.ru' 'yandex.kz'
 'myrambler.ru' 'tyt.by' 'yandex.ua' 'qip.ru' 'my.com' 'xaker.ru' 'me.com'
 '74.ru' 't-sk.ru' 'nextmail.ru' 'yandex.com' 'onego.ru' 'kamaz.ru'
 '63.ru']
campaign_type ['bulk' 'trigger' 'transactional']
channel_cmp ['mobile_push' 'multichannel' 'email']
topic ['sale out' 'abandoned category' 'subscribed' 'order cancelled'
 'abandoned search' 'abandoned cart' 'added to wish list' 'order created'
 'profile updated' 'recent purchase' 'abandoned view'
 'order ready for pickup' nan 'bonuses expired' 'event' 'order shipped']
month ['

- campaign_type ['bulk' 'transactional' 'trigger']
    - transactional e trigger, pela sua definiçao, estao mais relacionados uns com os outros. o trigger e o bulk tem de estar mais afastados, ou entao transactional e bulk tem de estar afastados. 

- platform [nan 'phablet' 'smartphone' 'desktop' 'tablet']
    - according to their size and similarities/functionalities

- email_provider [nan 'ya.ru' 'mail.ru' 'yandex.ru' 'inbox.ru' 'gmail.com' 'internet.ru'
 'list.ru' 'rambler.ru' 'bk.ru' 'icloud.com' 'ro.ru' 'spartak.ru'
 'private' 'lenta.ru' 'udm.ru' 'mail2000.ru' 'me.com' 'yahoo.com'
 'googlemail.com' 'nm.ru' 'e1.ru' 'mail.ua' 'olympus.ru' 'paso.ru'
 'ukr.net' 'yandex.ua' 'tut.by' 'my.com' 'online.ua' 'myrambler.ru'
 'yandex.by' 'autorambler.ru' 'meta.ua' 'spark-mail.ru' '74.ru'
 'vtomske.ru' 'auchan.ru' 'land.ru' 'inbox.lv' 'samaradom.ru' 'ngs.ru'
 'yandex.com' 'narod.ru' 'rarus.ru' 'nextmail.ru']
    - Based on the perceived importance, similarity, and usage in Russia. This ranking considers the popularity and common usage of these email providers in Russia, and also taking into account 
- channel_cmp ['mobile_push' 'email' 'multichannel']
    - In the context of a Russian e-commerce multichannel platform, we need to consider the effectiveness and common usage of each channel for sending campaigns and messages to clients. Here's a suggested ranking based on typical engagement and reach:

    - Multichannel: This involves using multiple channels to reach clients, which can increase engagement and effectiveness by combining the strengths of various channels.

    - Email: Email is a widely used and effective channel for detailed communication and marketing campaigns. It is commonly used in Russia for e-commerce communications.

    - Mobile Push: Mobile push notifications are effective for immediate and short notifications but may have lower engagement compared to email and multichannel approaches. However, they are still quite popular for quick updates and alerts.
- topic ['sale out' 'order created' 'profile updated' 'abandoned category'
 'added to wish list' 'order ready for pickup' 'subscribed'
 'abandoned cart' 'recent purchase' 'abandoned view' nan 'bonuses expired'
 'order cancelled' 'abandoned search' 'event']
    - based on their similarities and potential importance, and also on common e-commerce and user interaction scenarios
    - Order-related topics: These are directly related to transactions and are typically of high importance.
        - order created
        - order ready for pickup
        - order cancelled
        - recent purchase
    - Abandoned actions: These indicate potential lost sales or user interest.
        - abandoned cart
        - abandoned category
        - abandoned view
        - abandoned search
    - User engagement: These involve user interactions and updates.
        - profile updated
        - added to wish list
        - subscribed
    - Promotional and informational: These are related to promotions and notifications.
        - sale out
        - bonuses expired
        - event
- month ['April' 'May' 'June']
    - 0, 1, 2

In [26]:
campaign_type_values: dict[str, int] = {
    "bulk": 0, 
    "transactional": 1, 
    "trigger": 2
}
platform_values: dict[str, int] = {
    "smartphone": 0,
    "phablet": 1,
    "tablet": 2,
    "desktop": 3,
}
month_values: dict[str, int] = {
    "April": 0,
    "May": 1,
    "June": 2,
}
channel_values: dict[str, int] = {
    'multichannel': 0,
    'email': 1,
    'mobile_push': 2
}
topic_values: dict[str, int] = {
    'order created': 0,
    'order ready for pickup': 1,
    'order cancelled': 2,
    'recent purchase': 3,
    'abandoned cart': 4,
    'abandoned category': 5,
    'abandoned view': 6,
    'abandoned search': 7,
    'profile updated': 8,
    'added to wish list': 9,
    'subscribed': 10,
    'sale out': 11,
    'bonuses expired': 12,
    'event': 13
}
email_provider_values: dict[str, int] = {
    'yandex.ru': 0,
    'mail.ru': 1,
    'gmail.com': 2,
    'yahoo.com': 3,
    'icloud.com': 4,
    'bk.ru': 5,
    'list.ru': 6,
    'inbox.ru': 7,
    'ya.ru': 8,
    'yandex.ua': 9,
    'yandex.by': 10,
    'yandex.com': 11,
    'rambler.ru': 12,
    'myrambler.ru': 13,
    'autorambler.ru': 14,
    'mail.ua': 15,
    'tut.by': 16,
    'ukr.net': 17,
    'meta.ua': 18,
    'online.ua': 19,
    'mail2000.ru': 20,
    'nm.ru': 21,
    'e1.ru': 22,
    'olympus.ru': 23,
    'paso.ru': 24,
    'my.com': 25,
    'spark-mail.ru': 26,
    '74.ru': 27,
    'vtomske.ru': 28,
    'auchan.ru': 29,
    'land.ru': 30,
    'inbox.lv': 31,
    'samaradom.ru': 32,
    'ngs.ru': 33,
    'narod.ru': 34,
    'rarus.ru': 35,
    'nextmail.ru': 36,
    'internet.ru': 37,
    'ro.ru': 38,
    'spartak.ru': 39,
    'private': 40,
    'lenta.ru': 41,
    'udm.ru': 42,
    'me.com': 43,
    'googlemail.com': 44
}

encoding: dict[str, dict[str, int]] = {
    "message_type": campaign_type_values,
    "platform": platform_values,
    "month": month_values, 
    "channel_cmp": channel_values,
    "topic": topic_values, 
    "email_provider": email_provider_values
}

df: DataFrame = data.replace(encoding, inplace=False)
df.head()

Unnamed: 0,channel_msg,platform,email_provider,date,sent_at,is_opened,opened_first_time_at,opened_last_time_at,is_clicked,clicked_first_time_at,...,subject_with_discount,subject_with_saleout,week_of_month,is_weekend,day_of_week_nr,day_of_month,month,hour,min,time_of_day
0,mobile_push,,,2021-04-30,2021-04-30 07:22:39+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,0,7,22,Morning
1,mobile_push,,,2021-04-30,2021-04-30 07:22:40+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,0,7,22,Morning
2,mobile_push,,,2021-04-30,2021-04-30 07:22:41+00:00,0,NaT,NaT,0,NaT,...,False,False,5,False,4,30,0,7,22,Morning
3,mobile_push,,,2021-04-30,2021-04-30 07:22:42+00:00,1,2021-04-30 07:22:44,2021-04-30 07:22:44,0,NaT,...,False,False,5,False,4,30,0,7,22,Morning
4,mobile_push,,,2021-04-30,2021-04-30 07:22:43+00:00,1,2021-04-30 07:22:46,2021-04-30 07:22:46,0,NaT,...,False,False,5,False,4,30,0,7,22,Morning


## Cyclic variables