# File and libraries

In [118]:
import pandas as pd
import numpy as np


filepath=r'google_merch_store_raw_merge.csv'



file_tag = "ga4_merch_store"

# DSLabs functions

In [119]:
%run "dslabs_functions.py"


# data functions

In [120]:
%run "data_functions.py"


data_functions lodaded


# Load and sampling

In [121]:
# test_data=True
test_data=False


# Define a function to sample 10% from each group
def sample_per_day(group, fraction=0.1):
    return group.sample(frac=fraction)


if test_data==True:

    data=pd.read_csv(filepath)

    data['event_timestamp'] = pd.to_datetime(data['event_timestamp'], unit='us', utc=True)
    data['event_date'] = pd.to_datetime(data['event_date'], infer_datetime_format=True)
 


    # Apply the sampling to each group (grouped by event_date) 1%
    data = data.groupby('event_date').apply(lambda x: sample_per_day(x, 0.01)).reset_index(drop=True)

   

else:
    data=pd.read_csv(filepath)

    data['event_timestamp'] = pd.to_datetime(data['event_timestamp'], unit='us', utc=True)
    data['event_date'] = pd.to_datetime(data['event_date'], infer_datetime_format=True)

    # 10% sample
    data = data.groupby('event_date').apply(lambda x: sample_per_day(x, 0.1)).reset_index(drop=True)
    

    

data.info()

KeyboardInterrupt: 

In [73]:
data.shape

(776600, 48)

# class target column creation


- we want to classify if that hit is from a returning or new user.
- Due to web analytics tracking particularities like cookie acceptance we prefer to consider returning users as users that are on their 3rd or larger session number
- In this case, new user (ga_session_number <=2) will be 0 and returning user will be more than 2 (ga_session_number > 2)
- session number column shall be removed afterwards as well

In [74]:

data['returning_user'] = data['ga_session_number'].apply(lambda x: 0 if x <= 2 else 1)


data=data.drop(['ga_session_number'],axis=1) # now we do not need it anymore. remove it



In [75]:
target = "returning_user"

values = data[target].value_counts(normalize=True) 
print(values)

returning_user
0    0.751367
1    0.248633
Name: proportion, dtype: float64


# unbalance dataset

let us umbalance to have 10% as returning users

In [76]:
# Separate the majority (0) and minority (1) classes
df_majority = data[data['returning_user'] == 0]
df_minority = data[data['returning_user'] == 1]

# Calculate the number of minority rows needed to make a 90/10 split
# Let total_rows be the total number of rows after undersampling
total_rows = len(df_majority) / 0.9  # 90% majority, 10% minority
desired_minority_count = int(total_rows * 0.1)  # 10% of the total should be minority

# Downsample the minority class to the desired number of rows
df_minority_downsampled = df_minority.sample(n=desired_minority_count, random_state=42)

# Combine the majority class with the downsampled minority class
df_imbalanced = pd.concat([df_majority, df_minority_downsampled])

# Shuffle the combined dataset
data = df_imbalanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution to verify the 90/10 split
print(data['returning_user'].value_counts(normalize=True) * 100)

returning_user
0    90.000093
1     9.999907
Name: proportion, dtype: float64


# date column creation

## event_date

In [77]:
data['event_date'] = pd.to_datetime(data['event_date'], infer_datetime_format=True)


# create year, quarter, month, day number of week, weekend/weekday based on event_date column

# Create new columns
data['year'] = data['event_date'].dt.year
data['quarter'] = data['event_date'].dt.quarter
data['month'] = data['event_date'].dt.month
data['day'] = data['event_date'].dt.day
data['day_of_week'] = data['event_date'].dt.day_name()  
data['day_of_year'] = data['event_date'].dt.dayofyear  # Day of the year
data['week_number'] = data['event_date'].dt.isocalendar().week  # ISO week number

data['day_of_week_nr'] = data['event_date'].dt.weekday  # Monday=0, Sunday=6
data['is_weekend'] = data['day_of_week_nr'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday
data.drop(['day_of_week_nr'],axis=1) # already symbolic. not needed


Unnamed: 0,event_date,session_id,user_pseudo_id,event_name,event_timestamp,page_location,page_title,device_category,device_mobile_brand_name,device_mobile_model_name,...,promotion_name,returning_user,year,quarter,month,day,day_of_week,day_of_year,week_number,is_weekend
0,2020-12-01,7226772885,1.332747e+07,scroll,2020-12-01 04:25:19.308342+00:00,https://shop.googlemerchandisestore.com/Google...,Google Cambridge Campus Zip Hoodie,desktop,Google,ChromeBook,...,,0,2020,4,12,1,Tuesday,336,49,0
1,2020-11-24,9817631621,4.457414e+07,begin_checkout,2020-11-24 14:01:04.292979+00:00,https://shop.googlemerchandisestore.com/yourin...,Checkout Your Information,desktop,Google,Chrome,...,Not available in demo dataset,0,2020,4,11,24,Tuesday,329,48,0
2,2020-12-09,1585908966,3.457441e+06,user_engagement,2020-12-09 19:08:39.458830+00:00,https://shop.googlemerchandisestore.com/asearc...,Store search results,desktop,Google,Chrome,...,,0,2020,4,12,9,Wednesday,344,50,0
3,2020-11-13,3736620099,8.576520e+06,view_item,2020-11-13 12:22:16.328216+00:00,https://shop.googlemerchandisestore.com/Google...,Google Youth FC Zip Hoodie,desktop,Apple,Safari,...,,0,2020,4,11,13,Friday,318,46,0
4,2021-01-12,8812586936,1.421856e+06,user_engagement,2021-01-12 04:49:40.502631+00:00,https://shop.googlemerchandisestore.com/Google...,Bags | Lifestyle | Google Merchandise Store,desktop,Google,Chrome,...,,1,2021,1,1,12,Tuesday,12,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648341,2020-12-15,7719167180,4.570995e+06,view_item,2020-12-15 19:06:48.896082+00:00,https://shop.googlemerchandisestore.com/Google...,Google | Shop by Brand | Google Merchandise Store,desktop,Mozilla,Firefox,...,(not set),1,2020,4,12,15,Tuesday,350,51,0
648342,2020-12-08,2709704858,7.194803e+07,page_view,2020-12-08 02:21:08.720584+00:00,https://shop.googlemerchandisestore.com/Google...,Kids | Apparel | Google Merchandise Store,tablet,Apple,iPad,...,,0,2020,4,12,8,Tuesday,343,50,0
648343,2020-12-20,8963565810,8.507348e+06,first_visit,2020-12-20 00:01:31.791498+00:00,https://shop.googlemerchandisestore.com/Google...,Drinkware | Lifestyle | Google Merchandise Store,tablet,Apple,iPad,...,,0,2020,4,12,20,Sunday,355,51,1
648344,2020-11-24,9510772309,5.273588e+06,view_promotion,2020-11-24 05:41:39.104987+00:00,https://shop.googlemerchandisestore.com/,Home,mobile,Samsung,<Other>,...,Reach New Heights,0,2020,4,11,24,Tuesday,329,48,0


# event_timestamp

In [78]:
# Convert 'event_timestamp' from microseconds to datetime
data['event_timestamp'] = pd.to_datetime(data['event_timestamp'], unit='us')

# Create new columns
data['hour_of_day'] = data['event_timestamp'].dt.hour  # Extract hour (0-23)
data['minute'] = data['event_timestamp'].dt.minute  # Extract minute
data['hour_minute_fraction'] = data['hour_of_day'] + data['minute'] / 60  # Hour + fraction of minute

# Categorize the time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data['time_of_day'] = data['hour_of_day'].apply(categorize_time_of_day)

# column drop

### low value or high null count columns

In [79]:
data=data.drop(['debug_mode','device_is_limited_ad_tracking','device_mobile_marketing_name','geo_metro','traffic_source_name','page_referrer','entrances'],axis=1)

### ecommerce specific columns

In [80]:
data=data.drop([
 'ecommerce_total_item_quantity', 
 'ecommerce_purchase_revenue',            
 'ecommerce_shipping_value',              
 'ecommerce_tax_value',                   
 'ecommerce_unique_items',               
 'ecommerce_transaction_id',              
 'item_id',                               
 'item_name',                             
 'item_brand',                            
 'item_variant',                          
 'item_category',                         
 'price',                                 
 'quantity',                              
 'item_revenue',                          
 'item_list_index',                       
 'promotion_name'],axis=1)

In [81]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,event_date,session_id,user_pseudo_id,event_name,event_timestamp,page_location,page_title,device_category,device_mobile_brand_name,device_mobile_model_name,...,day,day_of_week,day_of_year,week_number,day_of_week_nr,is_weekend,hour_of_day,minute,hour_minute_fraction,time_of_day
count,648346,648346.0,648346.0,648346,648346,648346,646475,648346,648346,648346,...,648346.0,648346,648346.0,648346.0,648346.0,648346.0,648346.0,648346.0,648346.0,648346
unique,,,,17,,1346,484,3,8,10,...,,7,,,,,,,,4
top,,,,view_item,,https://shop.googlemerchandisestore.com/,Home,desktop,Apple,Chrome,...,,Tuesday,,,,,,,,Night
freq,,,,233934,,60300,106139,378495,276294,180121,...,,111199,,,,,,,,218076
mean,2020-12-14 09:41:16.977416192,4992448000.0,254327600.0,,2020-12-14 21:40:32.009578752+00:00,,,,,,...,15.227601,,247.781661,36.813421,2.741565,0.215431,11.488577,29.436151,11.97918,
min,2020-11-01 00:00:00,6412.0,1000442.0,,2020-11-01 00:02:04.892425+00:00,,,,,,...,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,2020-11-26 00:00:00,2515745000.0,5821700.0,,2020-11-26 03:59:46.370438656+00:00,,,,,,...,8.0,,28.0,4.0,1.0,0.0,5.0,14.0,5.933333,
50%,2020-12-11 00:00:00,4988220000.0,22811340.0,,2020-12-11 15:06:22.419322880+00:00,,,,,,...,15.0,,329.0,48.0,3.0,0.0,11.0,29.0,11.966667,
75%,2021-01-05 00:00:00,7486272000.0,61646400.0,,2021-01-05 13:11:45.500064512+00:00,,,,,,...,22.0,,345.0,50.0,4.0,0.0,18.0,44.0,18.05,
max,2021-01-31 00:00:00,9999997000.0,9994023000.0,,2021-01-31 23:59:55.412363+00:00,,,,,,...,31.0,,366.0,53.0,6.0,1.0,23.0,59.0,23.983333,


# row drop (from values)

## specific event_names



In [82]:
data['event_name'].value_counts()

event_name
view_item              233934
page_view              116997
user_engagement         90393
add_to_cart             52498
scroll                  41536
session_start           32115
first_visit             25726
select_item             25656
view_promotion          16682
begin_checkout           6020
view_search_results      2302
add_shipping_info        1449
purchase                 1078
add_payment_info          962
select_promotion          889
click                      84
view_item_list             25
Name: count, dtype: int64

### event_name insights
We want to classify a user by its interactions with the website so we want to exclude some actions that may also be biased by incorrect ga4 tracking namely:
- session_start
- first_visit
- click (low event count)
- view_item_list (may not be triggered by user interaction)

In [83]:
# List of values to drop
events_to_drop = ['session_start', 'first_visit','click','view_item_list']

# drop events from list
data = data[~data['event_name'].isin(events_to_drop)]


data['event_name'].value_counts()

event_name
view_item              233934
page_view              116997
user_engagement         90393
add_to_cart             52498
scroll                  41536
select_item             25656
view_promotion          16682
begin_checkout           6020
view_search_results      2302
add_shipping_info        1449
purchase                 1078
add_payment_info          962
select_promotion          889
Name: count, dtype: int64

# replace (not set) with null

we will handle these later but these are actually null values

In [84]:
data.replace('(not set)', np.nan, inplace=True)

# Engagement time msec
- https://support.google.com/analytics/answer/11109416?hl=en
- is it null?

In [85]:
data['engagement_time_msec'] = data['engagement_time_msec'].fillna(0)

# geo columns

In [86]:
geo_country_list = data['geo_country'].tolist()
distinct_geo_countries = list(set(geo_country_list))

print(distinct_geo_countries)

['Argentina', 'Trinidad & Tobago', 'Poland', 'Nepal', 'Morocco', 'India', 'Dominican Republic', 'Uruguay', 'South Korea', 'Paraguay', 'Brazil', 'Sweden', 'Belarus', 'Oman', 'Jamaica', 'Bulgaria', 'Chile', 'Romania', 'Armenia', 'Georgia', 'Israel', 'Ukraine', 'Japan', 'Saudi Arabia', 'Hong Kong', 'China', 'South Africa', 'Egypt', 'France', 'Albania', 'Iceland', 'Pakistan', 'Luxembourg', 'Turkey', 'Austria', 'Jordan', 'Cambodia', 'Mexico', 'Denmark', 'Indonesia', 'Netherlands', 'Costa Rica', 'Sri Lanka', 'Ghana', 'Palestine', 'Mongolia', 'Ecuador', 'Finland', 'Taiwan', 'Slovenia', 'Lebanon', 'Hungary', 'El Salvador', 'United Arab Emirates', 'Honduras', 'Colombia', 'Lithuania', 'Bosnia & Herzegovina', 'Kazakhstan', 'Singapore', 'Czechia', 'Greece', 'Bangladesh', 'Guatemala', 'Qatar', 'Nigeria', 'Kenya', 'Malta', 'Peru', 'Bahamas', 'Azerbaijan', 'Germany', 'Croatia', 'United Kingdom', 'Iraq', 'Slovakia', 'Venezuela', 'Malaysia', nan, 'Panama', 'Bahrain', 'Cyprus', 'Latvia', 'Philippines', 

## move subcontinent

In [87]:
# Move 'geo_sub_continent' from index 16 after continent

# Pop 'geo_sub_continent' column
geo_sub_continent = data.pop('geo_sub_continent')  # Remove the column

# Insert 'geo_sub_continent' at index 16
data.insert(16, 'geo_sub_continent', geo_sub_continent)


# Display the reordered DataFrame
print("\nReordered DataFrame:")
data.info()


Reordered DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 590396 entries, 0 to 648345
Data columns (total 38 columns):
 #   Column                           Non-Null Count   Dtype              
---  ------                           --------------   -----              
 0   event_date                       590396 non-null  datetime64[ns]     
 1   session_id                       590396 non-null  int64              
 2   user_pseudo_id                   590396 non-null  float64            
 3   event_name                       590396 non-null  object             
 4   event_timestamp                  590396 non-null  datetime64[ns, UTC]
 5   page_location                    590396 non-null  object             
 6   page_title                       589108 non-null  object             
 7   device_category                  590396 non-null  object             
 8   device_mobile_brand_name         590396 non-null  object             
 9   device_mobile_model_name         590396 no

## geo_region and geo_city

In [88]:
null_countries_df=data.groupby(['geo_country']).agg(
    null_count=('geo_region', lambda x: x.isna().sum())
).sort_values('null_count',ascending=False).reset_index()


countries_with_nulls = null_countries_df[null_countries_df['null_count'] > 0]['geo_country'].tolist()

print(countries_with_nulls)
# will retrieve the most populated cities of these countries to use as fill method

['Singapore', 'United States', 'Hong Kong', 'Peru', 'Russia', 'China', 'Japan', 'Turkey', 'Mexico', 'Bangladesh', 'Ireland', 'Romania', 'Indonesia', 'Colombia', 'Vietnam', 'Norway', 'Switzerland', 'Ukraine', 'Thailand', 'Portugal', 'Dominican Republic', 'Iraq', 'Brazil', 'Spain', 'Malaysia', 'Sweden', 'Egypt', 'Czechia', 'India', 'Hungary', 'Slovakia', 'Chile', 'Italy', 'Finland', 'Serbia', 'Austria', 'Uruguay', 'Greece', 'New Zealand', 'Germany', 'Philippines', 'Poland', 'South Korea', 'Palestine', 'Morocco', 'United Arab Emirates', 'Slovenia', 'Argentina', 'Kuwait', 'Algeria', 'United Kingdom', 'Armenia', 'Netherlands', 'North Macedonia', 'Bulgaria', 'Croatia', 'Nigeria', 'Cyprus', 'Bahrain', 'Costa Rica', 'Albania', 'Venezuela', 'Australia', 'Pakistan', 'Azerbaijan', 'South Africa', 'Puerto Rico', 'Panama', 'Bolivia', 'Jamaica', 'Saudi Arabia', 'Paraguay', 'Sri Lanka', 'Lithuania', 'Luxembourg', 'Kazakhstan', 'Canada', 'Ecuador', 'Macao', 'Malta', 'Trinidad & Tobago', 'Kosovo', 'Hon

In [89]:
most_populated_cities = {
    'Singapore': 'Singapore',
    'Hong Kong': 'Hong Kong',
    'Peru': 'Lima',
    'Dominican Republic': 'Santo Domingo',
    'Palestine': 'Gaza',
    'Iraq': 'Baghdad',
    'Serbia': 'Belgrade',
    'Uruguay': 'Montevideo',
    'North Macedonia': 'Skopje',
    'Cyprus': 'Nicosia',
    'Bahrain': 'Manama',
    'Armenia': 'Yerevan',
    'Panama': 'Panama City',
    'Azerbaijan': 'Baku',
    'Venezuela': 'Caracas',
    'Albania': 'Tirana',
    'Luxembourg': 'Luxembourg',
    'Jamaica': 'Kingston',
    'Macao': 'Macau',
    'Bolivia': 'Santa Cruz de la Sierra',
    'Trinidad & Tobago': 'Chaguanas',
    'Honduras': 'Tegucigalpa',
    'Paraguay': 'Asuncion',
    'Morocco': 'Casablanca',
    'Kosovo': 'Pristina',
    'Slovakia': 'Bratislava',
    'Malta': 'Birkirkara',
    'New Zealand': 'Auckland',
    'Bulgaria': 'Sofia',
    'Bangladesh': 'Dhaka',
    'Nigeria': 'Lagos'
}



data['geo_region'] = data['geo_region'].fillna(data['geo_country'].map(most_populated_cities))
data['geo_city'] = data['geo_city'].fillna(data['geo_country'].map(most_populated_cities))

In [90]:
null_regions_df=data.groupby(['geo_region']).agg(
    null_count=('geo_region', lambda x: x.isna().sum())
).sort_values('null_count',ascending=False).reset_index()

null_regions_df


Unnamed: 0,geo_region,null_count
0,Abruzzo,0
1,Porto District,0
2,Podkarpackie Voivodeship,0
3,Piedmont,0
4,Pichincha,0
...,...,...
390,Grand Est,0
391,Giza Governorate,0
392,Georgia,0
393,Geneva,0


In [91]:
null_region_cities_df=data.groupby(['geo_region']).agg(
    null_count=('geo_city', lambda x: x.isna().sum())
).sort_values('null_count',ascending=False).reset_index()


region_cities_with_nulls = null_region_cities_df[null_region_cities_df['null_count'] > 0]['geo_region'].tolist()


region_cities_with_nulls

['California',
 'Florida',
 'England',
 'Texas',
 'New Jersey',
 'New York',
 'Pennsylvania',
 'Michigan',
 'Ohio',
 'Taipei City',
 'Ontario',
 'Illinois',
 'Georgia',
 'Massachusetts',
 'North Carolina',
 'Virginia',
 'Maryland',
 'Quebec',
 'Connecticut',
 'Ile-de-France',
 'Wisconsin',
 'Missouri',
 'Washington',
 'Indiana',
 'Colorado',
 'South Carolina',
 'Minnesota',
 'British Columbia',
 'Utah',
 'Tennessee',
 'Alberta',
 'Alabama',
 'New Taipei City',
 'North Rhine-Westphalia',
 'Flanders',
 'Louisiana',
 'Oregon',
 'Bavaria',
 'Kentucky',
 'Baden-Wurttemberg',
 'Catalonia',
 'Tokyo',
 'Iowa',
 'State of Sao Paulo',
 'Maharashtra',
 'Mississippi',
 'Lombardy',
 'Taiwan Province',
 'Arizona',
 'Arkansas',
 'Oklahoma',
 'Auvergne-Rhone-Alpes',
 'Kansas',
 'Gyeonggi-do',
 'North Holland',
 'Emilia-Romagna',
 'South Holland',
 'Tamil Nadu',
 'Kaohsiung City',
 'Delaware',
 'Taichung City',
 'Andalusia',
 'Veneto',
 'West Virginia',
 'Metro Manila',
 'West Bengal',
 'State of Mexic

In [92]:
most_populated_cities_by_region = {
    'Taipei City': 'Taipei',
    'New Taipei City': 'New Taipei',
    'Taichung City': 'Taichung',
    'Kaohsiung City': 'Kaohsiung',
    'West Virginia': 'Charleston',
    'Delaware': 'Wilmington',
    'State of Mexico': 'Ecatepec',
    'Buenos Aires Province': 'La Plata',
    'Montana': 'Billings',
    'Wallonia': 'Liège',
    'Alaska': 'Anchorage',
    'Grand Est': 'Strasbourg',
    'Henan': 'Zhengzhou',
    'Hawalli Governorate': 'Hawalli',
    'Gelderland': 'Arnhem',
    'Algiers Province': 'Algiers',
    'North Dakota': 'Fargo',
    'Vermont': 'Burlington',
    'Idaho': 'Boise',
    'Canary Islands': 'Las Palmas',
    'Castile-La Mancha': 'Toledo',
    'Lublin Voivodeship': 'Lublin',
    'Calabarzon': 'Cavite City',
    'Saxony': 'Dresden',
    'Castile and Leon': 'Valladolid',
    'State of Bahia': 'Salvador',
    'Limburg': 'Hasselt',
    'Tainan City': 'Tainan',
    'Moscow Oblast': 'Khimki',
    'Chiba': 'Chiba',
    'Prince Edward Island': 'Charlottetown',
    'Overijssel': 'Zwolle',
    'Haifa District': 'Haifa',
    'Liguria': 'Genoa',
    'Northern Ireland': 'Belfast',
    'Rhineland-Palatinate': 'Mainz',
    'Sverdlovsk Oblast': 'Yekaterinburg',
    'Region of Southern Denmark': 'Odense',
    'Decentralized Administration of Peloponnese, Western Greece and the Ionian': 'Patras',
    'Amman Governorate': 'Amman',
    'Silesian Voivodeship': 'Katowice',
    'Asturias': 'Oviedo',
    'Galicia': 'Santiago de Compostela',
    'Hokkaido': 'Sapporo',
    'Quintana Roo': 'Cancún',
    'Centre-Val de Loire': 'Orléans',
    'Tamaulipas': 'Reynosa',
    'State of Santa Catarina': 'Florianópolis',
    'Saitama': 'Saitama',
    'Sicily': 'Palermo',
    'Gyeongsangnam-do': 'Changwon',
    'St. Gallen': 'St. Gallen',
    'Drenthe': 'Assen',
    'State of Ceara': 'Fortaleza',
    'Marche': 'Ancona',
    'West Pomeranian Voivodeship': 'Szczecin',
    'Friuli-Venezia Giulia': 'Trieste',
    'Decentralized Administration of Attica': 'Athens',
    'Viken': 'Drammen',
    'Lower Austria': 'St. Pölten',
    'State of Pernambuco': 'Recife',
    'Veracruz': 'Veracruz',
    'Flanders': 'Antwerp',
    'Kuyavian-Pomeranian Voivodeship': 'Bydgoszcz',
    'Krasnodar Krai': 'Krasnodar',
    'Central Bohemian Region': 'Prague',
    'State of Espirito Santo': 'Vitória',
    'Abruzzo': 'Pescara',
    'Region Zealand': 'Roskilde',
    'Maryland': 'Baltimore',
    'South District': 'Ashdod',
    'Normandy': 'Rouen',
    'Iowa': 'Des Moines',
    'Jerusalem District': 'Jerusalem',
    'Wisconsin': 'Milwaukee',
    'Gyeongsangbuk-do': 'Gyeongju',
    'Puebla': 'Puebla',
    'Friesland': 'Leeuwarden',
    'Bourgogne-Franche-Comte': 'Dijon',
    'Chungcheongbuk-do': 'Cheongju',
    'Sonora': 'Hermosillo',
    'Penang': 'George Town',
    'Federation of Bosnia and Herzegovina': 'Sarajevo',
    'Trentino-South Tyrol': 'Bolzano',
    'Cordoba': 'Córdoba',
    'Hyogo': 'Kobe',
    'Brandenburg': 'Potsdam',
    'Baden-Wurttemberg': 'Stuttgart',
    'Setubal': 'Setúbal',
    'Guanajuato': 'Guanajuato',
    'Kocaeli': 'Izmit',
    'Groningen': 'Groningen',
    'Muscat Governorate': 'Muscat',
    'Giza Governorate': 'Giza',
    'Arkansas': 'Little Rock',
    'Community of Madrid': 'Madrid',
    'Skane County': 'Malmö',
    'South Moravian Region': 'Brno',
    'Podkarpackie Voivodeship': 'Rzeszów',
    'Minnesota': 'Minneapolis',
    'Utrecht': 'Utrecht',
    'North Rhine-Westphalia': 'Cologne',
    'Indiana': 'Indianapolis',
    'Louisiana': 'New Orleans',
    'Odisha': 'Bhubaneswar',
    'Newfoundland and Labrador': 'St. John\'s',
    'Canton of Bern': 'Bern',
    'Bihar': 'Patna',
    'Utah': 'Salt Lake City',
    'Lower Saxony': 'Hanover',
    'Guayas': 'Guayaquil',
    'Gangwon-do': 'Gangneung',
    'Brittany': 'Rennes',
    'Mississippi': 'Jackson',
    'Pays de la Loire': 'Nantes',
    'Nuevo Leon': 'Monterrey',
    'New Hampshire': 'Concord',
    'South Dakota': 'Sioux Falls',
    'Chhattisgarh': 'Raipur',
    'Uttarakhand': 'Dehradun'
}



# device columns

- for many cases we assumed devices, brands and os versions of 2021 as top devices

## device mobile brand

In [93]:

# Fill 'device_mobile_brand_name' with 'PC' where the conditions are met
data.loc[(data['device_operating_system'] == 'Windows') & (data['device_category'] == 'desktop'), 'device_mobile_brand_name'] = 'PC'
data.loc[(data['device_operating_system'] == 'Web') & (data['device_category'] == 'desktop') & ((data['device_mobile_model_name'].isin(['Chrome','Edge','Firefox']))), 'device_mobile_brand_name'] = 'PC'
data.loc[(data['device_mobile_brand_name'] == 'Microsoft') & (data['device_category'] == 'desktop'), 'device_mobile_brand_name'] = 'PC'


data['device_mobile_brand_name'].value_counts()

device_mobile_brand_name
Apple        251668
PC           186163
Google        45268
<Other>       42816
Samsung       42645
Xiaomi        13381
Huawei         7906
Mozilla         456
Microsoft        93
Name: count, dtype: int64

## device_mobile_model_name 

In [94]:


data.loc[(data['device_mobile_brand_name'] == 'Samsung'), 'device_mobile_model_name'] = 'Galaxy S21'

data.loc[(data['device_mobile_brand_name'] == 'Xiaomi'), 'device_mobile_model_name'] = 'Mi 11'

data.loc[(data['device_mobile_brand_name'] == 'Huawei'), 'device_mobile_model_name'] = 'P50'

data.loc[(data['device_mobile_brand_name'] == 'Apple')& (data['device_category'] == 'desktop'), 'device_mobile_model_name'] = 'Macintosh'

data.loc[(data['device_mobile_brand_name'] == 'PC')& (data['device_mobile_model_name'] == 'Chrome'), 'device_mobile_model_name'] = 'PC'


data['device_mobile_model_name'].value_counts()

device_mobile_model_name
PC            162647
iPhone        121901
Macintosh     120700
<Other>        56325
Galaxy S21     42645
ChromeBook     32068
Mi 11          13381
Edge           12525
iPad            7976
P50             7906
Firefox         7649
Chrome          1785
Pixel 3         1441
Pixel 4 XL      1436
Safari            11
Name: count, dtype: int64

## device_operating_system

In [95]:

# Chrome OS - ensure 'desktop' is correctly spelled
data.loc[(data['device_mobile_model_name'] == 'ChromeBook') & (data['device_category'] == 'desktop'), 'device_operating_system'] = 'ChromeOS'

# iOS - for iPhone and iPad
data.loc[data['device_mobile_model_name'].isin(['iPhone', 'iPad']) | (data['device_mobile_brand_name'] == 'Apple') | ((data['device_mobile_model_name'] == 'Apple') & (data['device_category'].isin(['mobile','tablet']))), 'device_operating_system'] = 'iOS'

# Android - for specified brands
android_brands = ['Xiaomi', 'Huawei', 'Samsung']
data.loc[data['device_mobile_brand_name'].isin(android_brands), 'device_operating_system'] = 'Android'
data.loc[(data['device_mobile_brand_name'] == 'Google') & (data['device_category'].isin(['mobile','tablet'])), 'device_operating_system'] = 'Android'

# macOS
data.loc[(data['device_mobile_brand_name'] == 'Apple') & (data['device_category'] == 'desktop'), 'device_operating_system'] = 'MacOS'

# Windows
data.loc[(data['device_operating_system'] == 'Web') & (data['device_category'] == 'desktop') & ((data['device_mobile_brand_name'] == 'PC')), 'device_operating_system'] = 'Windows'
data.loc[(data['device_operating_system'] == 'Web') & (data['device_category'] == 'desktop') & ((data['device_mobile_brand_name'] == 'Mozilla')), 'device_operating_system'] = 'Windows'
data.loc[(data['device_category'] == 'desktop') & ((data['device_mobile_brand_name'] == 'Microsoft')), 'device_operating_system'] = 'Windows'


data.groupby(['device_category','device_operating_system', 'device_mobile_brand_name']).agg(
    unique_event_count=('event_timestamp', 'nunique')
).sort_values('unique_event_count',ascending=False).reset_index()


Unnamed: 0,device_category,device_operating_system,device_mobile_brand_name,unique_event_count
0,desktop,Windows,PC,142995
1,mobile,iOS,Apple,94912
2,desktop,MacOS,Apple,93153
3,mobile,Android,Samsung,30925
4,desktop,ChromeOS,Google,24965
5,mobile,Web,<Other>,16514
6,mobile,Android,<Other>,12379
7,mobile,Android,Xiaomi,10267
8,mobile,Android,Google,8915
9,tablet,iOS,Apple,6226


## device_operating_system_version

In [96]:
# replace all string characters and keep float values
data['device_operating_system_version'] = data['device_operating_system_version'].str.extract(r'(\d+\.\d+|\d+)')


In [97]:
# for chrome os consider same browser version
# https://chromereleases.googleblog.com/2021/
data.loc[(data['device_operating_system'] == 'ChromeOS') & (data['device_operating_system_version'].isnull()), 'device_operating_system_version'] = data['device_web_info_browser_version']


data['device_operating_system_version'].value_counts()


device_operating_system_version
10         215573
10.15       84564
14.3        26294
14.2        25611
86.0        15204
87.0        13138
9            8838
11.1         8258
7            4944
<Other>      3726
Name: count, dtype: int64

In [98]:
data['device_operating_system_version'].value_counts()

device_operating_system_version
10         215573
10.15       84564
14.3        26294
14.2        25611
86.0        15204
87.0        13138
9            8838
11.1         8258
7            4944
<Other>      3726
Name: count, dtype: int64

## device_language

In [99]:
data.groupby(['geo_continent','geo_country','device_language']).agg(
    unique_event_count=('event_timestamp', 'nunique')
).sort_values('unique_event_count',ascending=False).reset_index()

Unnamed: 0,geo_continent,geo_country,device_language,unique_event_count
0,Americas,United States,en-us,76211
1,Asia,India,en-us,15970
2,Americas,Canada,en-us,13315
3,Americas,United States,en-gb,12523
4,Americas,United States,en,6110
...,...,...,...,...
793,Europe,Serbia,de,1
794,Asia,Myanmar (Burma),en-ca,1
795,Asia,Nepal,en-ca,1
796,Asia,Oman,fr,1


In [100]:
data.groupby(['geo_continent','geo_country']).agg(
    null_device_language_count=('device_language', lambda x: x.isna().sum())
).sort_values('null_device_language_count',ascending=False).reset_index()

Unnamed: 0,geo_continent,geo_country,null_device_language_count
0,Americas,United States,119393
1,Asia,India,23109
2,Americas,Canada,19898
3,Europe,United Kingdom,7893
4,Europe,France,5249
...,...,...,...
103,Europe,Kosovo,20
104,Europe,Bosnia & Herzegovina,19
105,Americas,Trinidad & Tobago,14
106,Americas,Honduras,14


In [101]:
data['device_language'].value_counts()

device_language
en-us    220675
en-gb     35068
en        16571
zh        16259
en-ca     12511
fr         8197
es-es      7736
de         4223
ko         3943
Name: count, dtype: int64

## device_web_info_browser

In [102]:
data.loc[(data['device_web_info_browser'] == 'Android Webview'), 'device_web_info_browser'] = "Chrome"


data['device_web_info_browser'].value_counts()

device_web_info_browser
Chrome     411008
Safari     140520
<Other>     16046
Edge        12718
Firefox     10104
Name: count, dtype: int64

## device_web_info_browser_version

In [103]:
data['device_web_info_browser_version'].value_counts()

device_web_info_browser_version
87.0       217211
86.0       125888
<Other>    105165
14.0       103390
604         14572
13.1        13847
84.0         3368
83.0         2730
13.0         2410
82.0         1770
14.1           32
87              8
86              3
87.7            1
82.1            1
Name: count, dtype: int64

# Traffic Columns

## session counts

In [104]:
data.groupby(['traffic_source_medium','traffic_source_source']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

Unnamed: 0,traffic_source_medium,traffic_source_source,unique_session_count
0,organic,google,45289
1,(none),(direct),33128
2,<Other>,<Other>,21381
3,referral,<Other>,13650
4,referral,shop.googlemerchandisestore.com,10933
5,(data deleted),(data deleted),7751
6,cpc,google,6363
7,organic,<Other>,4059
8,(data deleted),<Other>,129
9,referral,(data deleted),2


## null counts

In [105]:
data.groupby(['traffic_source_medium']).agg(
    null_count=('traffic_source_source', lambda x: x.isna().sum())
).sort_values('null_count',ascending=False).reset_index()

Unnamed: 0,traffic_source_medium,null_count
0,(data deleted),0
1,(none),0
2,<Other>,0
3,cpc,0
4,organic,0
5,referral,0


In [106]:
data.groupby(['traffic_source_source']).agg(
    null_count=('traffic_source_medium', lambda x: x.isna().sum())
).sort_values('null_count',ascending=False).reset_index()

# no null count on

Unnamed: 0,traffic_source_source,null_count
0,(data deleted),0
1,(direct),0
2,<Other>,0
3,google,0
4,shop.googlemerchandisestore.com,0


## remove parenthesis ()

In [107]:
data['traffic_source_source'] = data['traffic_source_source'].str.replace(r'\(|\)', '', regex=True).str.strip()
data['traffic_source_medium'] = data['traffic_source_medium'].str.replace(r'\(|\)', '', regex=True).str.strip()


data.groupby(['traffic_source_medium','traffic_source_source']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

Unnamed: 0,traffic_source_medium,traffic_source_source,unique_session_count
0,organic,google,45289
1,none,direct,33128
2,<Other>,<Other>,21381
3,referral,<Other>,13650
4,referral,shop.googlemerchandisestore.com,10933
5,data deleted,data deleted,7751
6,cpc,google,6363
7,organic,<Other>,4059
8,data deleted,<Other>,129
9,referral,data deleted,2


## Replace values

- bad referral naming from shop.googlemerchandisestore.com means badly tracked and we should consider direct traffic instead
- medium =none is referring to direct traffic and we will use the same name to not confuse with null values
- data deleted is most likely paid campaign by google to avoid confidential data exposure so we will replace with cpc / google as well


In [108]:

# data.loc[(data['traffic_source_source'] == 'shop.googlemerchandisestore.com') & (data['traffic_source_medium'] == 'referral'), 'traffic_source_source'] = 'direct'
# data.loc[(data['traffic_source_source'] == 'shop.googlemerchandisestore.com') & (data['traffic_source_medium'] == 'referral'), 'traffic_source_medium'] = 'direct'
# source is merchandisestore.com then direct (bad parameter)
data.loc[(data['traffic_source_source'] == 'shop.googlemerchandisestore.com') & (data['traffic_source_medium'] == 'referral'), ['traffic_source_medium', 'traffic_source_source']] = ['direct','direct']


# referral traffic
data.loc[(data['traffic_source_medium'] == 'referral'), ['traffic_source_medium', 'traffic_source_source']] = ['referral','referral_link']

# google organic
data.loc[(data['traffic_source_source'].isnull()) & (data['traffic_source_medium']== 'organic'),  ['traffic_source_medium', 'traffic_source_source']] = ['organic','google']


# when we have direct traffic it is direct traffic
data.loc[(data['traffic_source_source'] == 'direct'), 'traffic_source_medium'] = 'direct'

# data deleted is paid campaign cpc by google
# data.loc[(data['traffic_source_source'] == 'data deleted') | (data['traffic_source_medium'] == 'data deleted'), 'traffic_source_medium'] = 'cpc'
# data.loc[(data['traffic_source_source'] == 'data deleted') | (data['traffic_source_medium'] == 'data deleted'), 'traffic_source_source'] = 'google'
data.loc[(data['traffic_source_source'] == 'data deleted') | (data['traffic_source_medium'] == 'data deleted'), ['traffic_source_medium', 'traffic_source_source']] = ['cpc', 'google']


# full null values are direct
data.loc[(data['traffic_source_source'].isnull()) & (data['traffic_source_medium'].isnull()),  ['traffic_source_medium', 'traffic_source_source']] = ['direct','direct']
# data.loc[(data['traffic_source_source'].isnull()) & (data['traffic_source_medium'].isnull()), 'traffic_source_medium'] = 'direct'



# data = fill_nulls_based_on_top_value_multiple_columns(data, 'traffic_source_source', ['device_category','geo_country'])
# data = fill_nulls_based_on_top_value_multiple_columns(data, 'traffic_source_medium', ['device_category','geo_country','traffic_source_source','traffic_source_medium'])


data.groupby(['traffic_source_source','traffic_source_medium']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

Unnamed: 0,traffic_source_source,traffic_source_medium,unique_session_count
0,google,organic,45289
1,direct,direct,43997
2,<Other>,<Other>,21381
3,google,cpc,14232
4,referral_link,referral,13652
5,<Other>,organic,4059
6,<Other>,cpc,1


# Page columns

In [109]:
df_page_ref=data.groupby(['page_location']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

In [110]:
# remove domain name for better readability
data['page_location'] = data['page_location'].str.replace(r'shop.googlemerchandisestore.com/store.html', 'shop.googlemerchandisestore.com/').str.strip()
data['page_location'] = data['page_location'].str.replace(r'+', ' ').str.strip()
data['page_location'] = data['page_location'].str.replace(r'https://', '').str.strip()
data['page_location'] = data['page_location'].str.replace(r'http://', '').str.strip()
data['page_location'] = data['page_location'].str.replace(r'www.', '').str.strip()


df_pages_agg=data.groupby(['page_location','page_title']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

df_pages_agg

Unnamed: 0,page_location,page_title,unique_session_count
0,shop.googlemerchandisestore.com/,Home,48414
1,googlemerchandisestore.com/,Google Online Store,19464
2,shop.googlemerchandisestore.com/Google Redesig...,Apparel | Google Merchandise Store,15413
3,shop.googlemerchandisestore.com/Google Redesig...,Men's / Unisex | Apparel | Google Merchandise ...,13810
4,shop.googlemerchandisestore.com/Google Redesig...,Sale | Google Merchandise Store,10822
...,...,...,...
1203,shop.googlemerchandisestore.com/Google Redesig...,Page Unavailable,1
1204,shop.googlemerchandisestore.com/Google Redesig...,Page Unavailable,1
1205,shop.googlemerchandisestore.com/google redesig...,Page Unavailable,1
1206,shop.googlemerchandisestore.com/google redesig...,Page Unavailable,1


## split categories in page paths

In [111]:
# Step 1: Split 'page_location' into 4 parts (max)
split_columns = data['page_location'].str.split('/', n=4, expand=True)

# Step 2: Assign the first three parts to new columns (ignore the first empty part if there is a leading '/')
data['domain'] = split_columns[0] # url_domain 
data['page_path_level_1'] = split_columns[1].replace('', pd.NA)
data['page_path_level_2'] = split_columns[2].replace('', pd.NA)
data['page_path_level_3'] = split_columns[3].replace('', pd.NA)




df_pages_total=data[['page_title','page_location','page_path_level_1','page_path_level_2','page_path_level_3']]


## fill length page path with page title

In [112]:
data['page_path_level_1'] = data.apply(
    lambda row: row['page_title'] if pd.isna(row['page_path_level_2']) else row['page_path_level_1'], 
    axis=1
)

df_pages_total=data[['page_title','page_location','page_path_level_1','page_path_level_2','page_path_level_3']]


## fill the other page path levels with previous page path column

This will allow for hierarchical encoding without sacrificing columns or rows

In [113]:
# Fill 'page_path_level_2' by concatenating 'page_path_1' and 'page_path_2' (if 'page_path_2' is null)
data['page_path_level_2'] = data.apply(
    lambda row: row['page_path_level_1'] if pd.isna(row['page_path_level_2']) 
    else f"{row['page_path_level_1']}/{row['page_path_level_2']}", axis=1
)

# Fill 'page_path_level_3' by concatenating 'page_path_level_2' and 'page_path_3' (if 'page_path_3' is null)
data['page_path_level_3'] = data.apply(
    lambda row: row['page_path_level_2'] if pd.isna(row['page_path_level_3']) 
    else f"{row['page_path_level_2']}/{row['page_path_level_3']}", axis=1
)


# df_pages_total = data[['page_path_level_1', 'page_path_level_2', 'page_path_level_3']]


                      page_path_1    page_path_2  \
0                 Google Redesign        Apparel   
1       Checkout Your Information           None   
2            Store search results           None   
3                 Google Redesign        Apparel   
4                 Google Redesign      Lifestyle   
...                           ...            ...   
648340            Google Redesign  Shop by Brand   
648341            Google Redesign  Shop by Brand   
648342            Google Redesign        Apparel   
648344                       Home           None   
648345            Google Redesign   eco friendly   

                                  page_path_3              page_path_level_2  \
0       Google Cambridge Campus Zip Hoodie XL        Google Redesign/Apparel   
1                                        None      Checkout Your Information   
2                                        None           Store search results   
3                  Google Youth FC Zip Hoodie        Go

In [114]:

df_pages_agg=data.groupby(['page_title','domain','page_location','page_path_1','page_path_2','page_path_3', 'page_path_level_2', 'page_path_level_3']).agg(
    unique_session_count=('session_id', 'nunique')
).sort_values('unique_session_count',ascending=False).reset_index()

df_pages_agg

Unnamed: 0,page_title,domain,page_location,page_path_1,page_path_2,page_path_3,page_path_level_2,page_path_level_3,unique_session_count
0,Men's / Unisex | Apparel | Google Merchandise ...,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/Google Redesig...,Google Redesign,Apparel,Mens,Google Redesign/Apparel,Google Redesign/Apparel/Mens,13810
1,YouTube | Shop by Brand | Google Merchandise S...,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/Google Redesig...,Google Redesign,Shop by Brand,YouTube,Google Redesign/Shop by Brand,Google Redesign/Shop by Brand/YouTube,9631
2,Hats | Apparel | Google Merchandise Store,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/Google Redesig...,Google Redesign,Apparel,Hats,Google Redesign/Apparel,Google Redesign/Apparel/Hats,7060
3,Drinkware | Lifestyle | Google Merchandise Store,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/Google Redesig...,Google Redesign,Lifestyle,Drinkware,Google Redesign/Lifestyle,Google Redesign/Lifestyle/Drinkware,6867
4,Bags | Lifestyle | Google Merchandise Store,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/Google Redesig...,Google Redesign,Lifestyle,Bags,Google Redesign/Lifestyle,Google Redesign/Lifestyle/Bags,6384
...,...,...,...,...,...,...,...,...,...
906,Page Unavailable,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/google redesig...,google redesign,apparel,google heather green speckled tee,google redesign/apparel,google redesign/apparel/google heather green s...,1
907,Page Unavailable,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/google redesig...,google redesign,apparel,google grey tee,google redesign/apparel,google redesign/apparel/google grey tee,1
908,Page Unavailable,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/google redesig...,google redesign,apparel,google google striped tank,google redesign/apparel,google redesign/apparel/google google striped ...,1
909,Page Unavailable,shop.googlemerchandisestore.com,shop.googlemerchandisestore.com/google redesig...,google redesign,apparel,google fold over beanie grey,google redesign/apparel,google redesign/apparel/google fold over beani...,1


# Final df without relevant columns

In [115]:
data_final=data.drop(['page_title','page_location','session_id','user_pseudo_id'],axis=1) 

In [116]:
data.shape

(590396, 44)

In [117]:
data_final.to_csv('df_merch_pre_proc.csv',index=False)

# df_merch_pre_proc.csv
# df_merch_profile.csv
# df_merch_data_prep.csv