# File and libraries

In [13]:
import pandas as pd
import numpy as np


file_tag = "Google Merch Store 2024-2025"

# DSLabs functions

In [14]:
%run "scripts/dslabs_functions.py"


# data functions

In [15]:
%run "scripts/data_functions.py"


data_functions loaded


# Load and sampling

In [16]:
# test_data=True
test_data=False


# Define a function to sample 10% from each group
def sample_per_day(group, fraction=0.1):
    return group.sample(frac=fraction)


total_data = pd.read_csv(r'data\google_merch_total_sessions_2024_2025.csv')
additional_data = pd.read_csv(r'data\google_merch_total_sessions_2024_2025_additional_metrics.csv')
gsc_data = pd.read_csv(r'data\search_console_google_merch_2024_2025.csv')    

data=total_data.merge(additional_data, on='Date', how='left').merge(gsc_data, on='Date', how='left')


# parse Date as datetime
data['Date'] = pd.to_datetime(data['Date'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 427 entries, 0 to 426
Data columns (total 35 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Date                                      427 non-null    datetime64[ns]
 1   Sessions                                  427 non-null    int64         
 2   Bounce rate                               427 non-null    float64       
 3   Seven-day active users                    427 non-null    int64         
 4   28-day active users                       427 non-null    int64         
 5   New users                                 427 non-null    int64         
 6   Scrolled users                            427 non-null    int64         
 7   Engaged sessions                          427 non-null    int64         
 8   Views                                     427 non-null    int64         
 9   Event count                     

In [17]:
data.shape

(427, 35)

# column drop

### low value or high null count columns

### ecommerce specific columns

In [18]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,Date,Sessions,Bounce rate,Seven-day active users,28-day active users,New users,Scrolled users,Engaged sessions,Views,Event count,...,Sessions per user,Views per user,Total users,Promotion views,Promotion clicks,One-day active users,Organic Google Search impressions,Organic Google Search clicks,Organic Google Search average position,Organic Google Search click-through-rate
count,427,427.0,427.0,427.0,427.0,427.0,427.0,427.0,427.0,427.0,...,427.0,427.0,427.0,427.0,427.0,427.0,427.0,427.0,427.0,427.0
mean,2024-08-04 00:00:00.000000256,2096.981265,0.477794,10567.112412,38404.140515,1208.053864,642.749415,1129.370023,7518.901639,29223.332553,...,1.283672,4.436609,2599.107728,1487.814988,240.81733,1641.929742,3574.866511,31.992974,11.378257,0.01441
min,2024-01-04 00:00:00,22.0,0.165997,780.0,780.0,4.0,0.0,4.0,20.0,401.0,...,1.08939,1.294118,177.0,0.0,0.0,11.0,123.0,0.0,5.446009,0.0
25%,2024-04-19 12:00:00,1463.0,0.412334,8326.5,30502.5,876.5,264.0,662.5,4267.0,16799.0,...,1.241037,3.69878,1811.0,879.5,110.0,1150.5,230.0,3.0,9.440974,0.007907
50%,2024-08-04 00:00:00,1984.0,0.478213,10800.0,40024.0,1177.0,599.0,999.0,6563.0,25377.0,...,1.277015,4.613819,2440.0,1592.0,225.0,1580.0,329.0,5.0,11.046218,0.012987
75%,2024-11-18 12:00:00,2545.5,0.545998,12269.5,48328.0,1442.0,874.5,1346.5,9370.5,36457.5,...,1.30976,5.290969,3157.5,2144.0,337.5,1965.0,484.0,10.0,12.838869,0.018843
max,2025-03-05 00:00:00,11239.0,0.818182,20447.0,60050.0,7734.0,6455.0,8453.0,94665.0,353391.0,...,3.181818,11.672626,25911.0,10809.0,3869.0,9285.0,26067.0,255.0,46.421479,0.085657
std,,1078.436107,0.10094,3632.601576,12293.299375,627.503784,665.872685,776.451498,6176.753658,23458.915577,...,0.124969,1.336725,1676.945838,1001.073715,255.356796,845.927446,6941.202809,55.902056,3.43063,0.009272


# replace (not set) with null

we will handle these later but these are actually null values

In [19]:
data.replace('none', np.nan, inplace=True)

# DataFrame Aggregation

## timeseries df creation

In [20]:




# Categorize the season based on 'event_date' column
def categorize_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

data['season'] = data['Date'].apply(categorize_season)


# Categorize the commercial/holiday season based on 'event_date' column
def categorize_commercial_season(date):
    month = date.month
    day = date.day
    if (month == 12 and day >= 20) or (month == 1 and day <= 5):
        return 'Christmas/New Year'
    elif (month == 11 and day >= 20) or (month == 12 and day < 20):
        return 'Black Friday/Cyber Monday'
    elif (month == 10 and day >= 20) or (month == 11 and day < 20):
        return 'Halloween'
    elif (month == 4 and day >= 10) or (month == 4 and day <= 20):
        return 'Easter'
    elif (month == 2 and day >= 10) or (month == 2 and day <= 20):
        return 'Valentine\'s Day'
    else:
        return 'Regular Season'

data['commercial_season'] = data['Date'].apply(categorize_commercial_season)

# create year, quarter, month, day number of week, weekend/weekday based on event_date column

# Create new columns
# aggregated_df['year'] = aggregated_df['event_date'].dt.year
# aggregated_df['quarter'] = aggregated_df['event_date'].dt.quarter
# aggregated_df['month'] = aggregated_df['event_date'].dt.month
# aggregated_df['day'] = aggregated_df['event_date'].dt.day

# data['day_of_week'] = data['local_date'].dt.day_name()  

# aggregated_df['day_of_year'] = aggregated_df['event_date'].dt.dayofyear  # Day of the year
# aggregated_df['week_number'] = aggregated_df['event_date'].dt.isocalendar().week  # ISO week number

# Assuming 'local_date' is in datetime format, otherwise you can parse it using pd.to_datetime
def week_of_month(dt):
    first_day = dt.replace(day=1)
    # Calculate the week of the month by comparing the current date to the first day of the month
    return (dt.day + first_day.weekday()) // 7 + 1

# Apply this function to your 'local_date' column
data['week_of_month'] = data['Date'].apply(week_of_month)


data['day_of_week_nr'] = data['Date'].dt.weekday  # Monday=0, Sunday=6
data['is_weekend'] = data['day_of_week_nr'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday


data

Unnamed: 0,Date,Sessions,Bounce rate,Seven-day active users,28-day active users,New users,Scrolled users,Engaged sessions,Views,Event count,...,One-day active users,Organic Google Search impressions,Organic Google Search clicks,Organic Google Search average position,Organic Google Search click-through-rate,season,commercial_season,week_of_month,day_of_week_nr,is_weekend
0,2025-03-05,1304,0.368098,6906,29095,627,620,824,6155,23166,...,971,151,1,7.907285,0.006623,Spring,Regular Season,2,2,0
1,2025-03-04,1538,0.379714,7047,29494,791,723,954,6664,25808,...,1174,185,1,5.908108,0.005405,Spring,Regular Season,2,1,0
2,2025-03-03,1451,0.352860,7158,29497,674,710,939,6884,26862,...,1095,145,1,7.896552,0.006897,Spring,Regular Season,2,0,0
3,2025-03-02,1002,0.469062,7404,29753,692,337,532,3582,14427,...,861,135,0,9.592593,0.000000,Spring,Regular Season,2,6,1
4,2025-03-01,1298,0.554700,7458,29843,1091,325,578,3392,14491,...,1190,123,1,10.886179,0.008130,Spring,Regular Season,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,2024-01-08,981,0.387360,2749,2749,428,0,601,4745,16848,...,766,8645,133,17.415616,0.015385,Winter,Regular Season,2,0,0
423,2024-01-07,446,0.482063,2081,2081,261,0,231,2129,8848,...,358,7780,98,19.194473,0.012596,Winter,Regular Season,2,6,1
424,2024-01-06,426,0.415493,1748,1748,256,0,249,1920,6913,...,346,8169,106,18.751744,0.012976,Winter,Regular Season,1,5,1
425,2024-01-05,887,0.363021,1433,1433,427,0,565,5000,17829,...,712,8812,127,16.045847,0.014412,Winter,Christmas/New Year,1,4,0


## us holidays

In [21]:
us_holidays = {
    2024: {
        'New Year\'s Day': '2024-01-01',
        'Martin Luther King Jr. Day': '2024-01-15',
        'Presidents\' Day': '2024-02-19',
        'Memorial Day': '2024-05-27',
        'Independence Day': '2024-07-04',
        'Labor Day': '2024-09-02',
        'Columbus Day': '2024-10-14',
        'Veterans Day': '2024-11-11',
        'Thanksgiving Day': '2024-11-28',
        'Christmas Day': '2024-12-25'
    },
    2025: {
        'New Year\'s Day': '2025-01-01',
        'Martin Luther King Jr. Day': '2025-01-20',
        'Presidents\' Day': '2025-02-17',
        'Memorial Day': '2025-05-26',
        'Independence Day': '2025-07-04',
        'Labor Day': '2025-09-01',
        'Columbus Day': '2025-10-13',
        'Veterans Day': '2025-11-11',
        'Thanksgiving Day': '2025-11-27',
        'Christmas Day': '2025-12-25'
    }
}

# Flatten the holidays dictionary into a single dictionary with dates as keys
holidays = {pd.to_datetime(date): name for year in us_holidays for name, date in us_holidays[year].items()}


# Create a new column 'is_holiday' that indicates whether each date is a holiday
data['is_holiday'] = data['Date'].apply(lambda x: 1 if x in holidays else 0)

# Display the DataFrame
data

Unnamed: 0,Date,Sessions,Bounce rate,Seven-day active users,28-day active users,New users,Scrolled users,Engaged sessions,Views,Event count,...,Organic Google Search impressions,Organic Google Search clicks,Organic Google Search average position,Organic Google Search click-through-rate,season,commercial_season,week_of_month,day_of_week_nr,is_weekend,is_holiday
0,2025-03-05,1304,0.368098,6906,29095,627,620,824,6155,23166,...,151,1,7.907285,0.006623,Spring,Regular Season,2,2,0,0
1,2025-03-04,1538,0.379714,7047,29494,791,723,954,6664,25808,...,185,1,5.908108,0.005405,Spring,Regular Season,2,1,0,0
2,2025-03-03,1451,0.352860,7158,29497,674,710,939,6884,26862,...,145,1,7.896552,0.006897,Spring,Regular Season,2,0,0,0
3,2025-03-02,1002,0.469062,7404,29753,692,337,532,3582,14427,...,135,0,9.592593,0.000000,Spring,Regular Season,2,6,1,0
4,2025-03-01,1298,0.554700,7458,29843,1091,325,578,3392,14491,...,123,1,10.886179,0.008130,Spring,Regular Season,1,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,2024-01-08,981,0.387360,2749,2749,428,0,601,4745,16848,...,8645,133,17.415616,0.015385,Winter,Regular Season,2,0,0,0
423,2024-01-07,446,0.482063,2081,2081,261,0,231,2129,8848,...,7780,98,19.194473,0.012596,Winter,Regular Season,2,6,1,0
424,2024-01-06,426,0.415493,1748,1748,256,0,249,1920,6913,...,8169,106,18.751744,0.012976,Winter,Regular Season,1,5,1,0
425,2024-01-05,887,0.363021,1433,1433,427,0,565,5000,17829,...,8812,127,16.045847,0.014412,Winter,Christmas/New Year,1,4,0,0


## significant event days

In [22]:
significant_events = {
    '2024-02-11': 'Super Bowl LVIII',
    '2024-03-11': 'COVID-19 Pandemic Anniversary',
    '2024-07-15': 'Tax Day',
    '2024-11-05': 'US Presidential Election',
    '2024-11-28': 'Thanksgiving',
    '2024-12-25': 'Christmas',
    '2025-01-20': 'Inauguration Day',
    '2025-02-09': 'Super Bowl LIX',
    '2025-04-15': 'Tax Day',
    '2025-11-27': 'Thanksgiving',
    '2025-12-25': 'Christmas'
}

commercial_events = {
    '2024-11-29': 'Black Friday',
    '2024-12-02': 'Cyber Monday',
    '2025-06-23': 'Amazon Prime Day',
    '2025-06-24': 'Amazon Prime Day',
    '2025-11-28': 'Black Friday',
    '2025-12-01': 'Cyber Monday',
}



# Flatten the significant_events dictionary into a single dictionary with dates as keys
events = {pd.to_datetime(date): name for date, name in significant_events.items()}
commercial_events = {pd.to_datetime(date): name for date, name in commercial_events.items()}

# Create a new column 'is_event' that indicates whether each date is a significant event
data['is_major_event_day'] = data['Date'].apply(lambda x: 1 if x in events else 0)
data['is_commercial_event_day'] = data['Date'].apply(lambda x: 1 if x in commercial_events else 0)

data

Unnamed: 0,Date,Sessions,Bounce rate,Seven-day active users,28-day active users,New users,Scrolled users,Engaged sessions,Views,Event count,...,Organic Google Search average position,Organic Google Search click-through-rate,season,commercial_season,week_of_month,day_of_week_nr,is_weekend,is_holiday,is_major_event_day,is_commercial_event_day
0,2025-03-05,1304,0.368098,6906,29095,627,620,824,6155,23166,...,7.907285,0.006623,Spring,Regular Season,2,2,0,0,0,0
1,2025-03-04,1538,0.379714,7047,29494,791,723,954,6664,25808,...,5.908108,0.005405,Spring,Regular Season,2,1,0,0,0,0
2,2025-03-03,1451,0.352860,7158,29497,674,710,939,6884,26862,...,7.896552,0.006897,Spring,Regular Season,2,0,0,0,0,0
3,2025-03-02,1002,0.469062,7404,29753,692,337,532,3582,14427,...,9.592593,0.000000,Spring,Regular Season,2,6,1,0,0,0
4,2025-03-01,1298,0.554700,7458,29843,1091,325,578,3392,14491,...,10.886179,0.008130,Spring,Regular Season,1,5,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,2024-01-08,981,0.387360,2749,2749,428,0,601,4745,16848,...,17.415616,0.015385,Winter,Regular Season,2,0,0,0,0,0
423,2024-01-07,446,0.482063,2081,2081,261,0,231,2129,8848,...,19.194473,0.012596,Winter,Regular Season,2,6,1,0,0,0
424,2024-01-06,426,0.415493,1748,1748,256,0,249,1920,6913,...,18.751744,0.012976,Winter,Regular Season,1,5,1,0,0,0
425,2024-01-05,887,0.363021,1433,1433,427,0,565,5000,17829,...,16.045847,0.014412,Winter,Christmas/New Year,1,4,0,0,0,0


# drop irrelevant columns

# save file

In [24]:
if test_data==False:
    data.to_csv('data/df_merch_0_2024_agg_time_pre_proc.csv',index=False)

# Excel File for encoding mapping

In [25]:
import os
from openpyxl import load_workbook


# Function to append distinct combinations of selected columns into sheets in an Excel file
def append_columns_to_excel(df, columns_dict, output_file):
    """
    Append distinct combinations of selected columns into separate sheets in an existing Excel file,
    with the columns ordered by their names for easier hierarchical encoding and add an empty encoding column.

    Args:
    df (pd.DataFrame): The DataFrame containing the columns to save.
    columns_dict (dict): Dictionary where keys are sheet names, and values are lists of column names to include.
    output_file (str): The path of the Excel file to save the sheets.

    Returns:
    None
    """
    # Check if the file exists and is a valid Excel file
    if os.path.exists(output_file):
        try:
            # Try to load the existing workbook
            with pd.ExcelWriter(output_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
                writer.book = load_workbook(output_file)
                
                # Loop over each sheet name and corresponding list of columns
                for sheet_name, columns in columns_dict.items():
                    # Check if all the specified columns exist in the DataFrame
                    missing_columns = [col for col in columns if col not in df.columns]
                    if missing_columns:
                        print(f"Warning: The following columns are not found in the DataFrame for sheet '{sheet_name}': {missing_columns}")
                        continue

                    # Get distinct combinations of the selected columns
                    distinct_values = df[columns].drop_duplicates().dropna(how='all')

                    # Convert columns to strings temporarily for sorting to avoid float-string comparison errors
                    distinct_values = distinct_values.astype(str)

                    # Sort distinct values by the specified columns for hierarchical grouping
                    distinct_values.sort_values(by=columns, inplace=True)

                    # Add an empty encoding column for each column in the DataFrame
                    for col in columns:
                        distinct_values[f'{col}_enc'] = pd.NA

                    # Write distinct values to a new sheet named after the sheet_name
                    distinct_values.to_excel(writer, sheet_name=sheet_name, index=False)
                    
        except Exception as e:
            print(f"Error: {e}")
            print("The file might be corrupt or invalid. Creating a new file.")
            # Create a new file if loading fails
            with pd.ExcelWriter(output_file, engine='openpyxl', mode='w') as writer:
                for sheet_name, columns in columns_dict.items():
                    missing_columns = [col for col in columns if col not in df.columns]
                    if missing_columns:
                        print(f"Warning: The following columns are not found in the DataFrame for sheet '{sheet_name}': {missing_columns}")
                        continue

                    # Get distinct combinations of the selected columns
                    distinct_values = df[columns].drop_duplicates().dropna(how='all')

                    # Convert columns to strings temporarily for sorting
                    distinct_values = distinct_values.astype(str)

                    # Sort distinct values by the specified columns for hierarchical grouping
                    distinct_values.sort_values(by=columns, inplace=True)

                    # Add an empty encoding column for each column in the DataFrame
                    for col in columns:
                        distinct_values[f'{col}_enc'] = pd.NA

                    distinct_values.to_excel(writer, sheet_name=sheet_name, index=False)

    else:
        # If the file does not exist, create a new one
        with pd.ExcelWriter(output_file, engine='openpyxl', mode='w') as writer:
            for sheet_name, columns in columns_dict.items():
                missing_columns = [col for col in columns if col not in df.columns]
                if missing_columns:
                    print(f"Warning: The following columns are not found in the DataFrame for sheet '{sheet_name}': {missing_columns}")
                    continue

                # Get distinct combinations of the selected columns
                distinct_values = df[columns].drop_duplicates().dropna(how='all')

                # Convert columns to strings temporarily for sorting
                distinct_values = distinct_values.astype(str)

                # Sort distinct values by the specified columns for hierarchical grouping
                distinct_values.sort_values(by=columns, inplace=True)

                # Add an empty encoding column for each column in the DataFrame
                for col in columns:
                    distinct_values[f'{col}_enc'] = pd.NA

                distinct_values.to_excel(writer, sheet_name=sheet_name, index=False)
                

# Define the groups of columns for hierarchical encoding, grouped by sheet name
columns_to_save = {
    
    'device_category': ['device_category'],   
    'device_web_info_browser': ['device_web_info_browser'],   

}

# Save the distinct values combinations of each column group into corresponding sheets
# append_columns_to_excel(data_final, columns_to_save, f'data/df_merch_values_time_pre_encoding.xlsx')

# print("Excel file has been updated successfully.")