In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Amazon Customer Behavior Survey 2.csv")
df.head()

Unnamed: 0,Timestamp,age,Gender,Purchase_Frequency,Purchase_Categories,Personalized_Recommendation_Frequency,Browsing_Frequency,Product_Search_Method,Search_Result_Exploration,Customer_Reviews_Importance,...,Saveforlater_Frequency,Review_Left,Review_Reliability,Review_Helpfulness,Personalized_Recommendation_Frequency.1,Recommendation_Helpfulness,Rating_Accuracy,Shopping_Satisfaction,Service_Appreciation,Improvement_Areas
0,2023/06/04 1:28:19 PM GMT+5:30,23,Female,Few times a month,Beauty and Personal Care,Yes,Few times a week,Keyword,Multiple pages,1,...,Sometimes,Yes,Occasionally,Yes,2,Yes,1,1,Competitive prices,Reducing packaging waste
1,2023/06/04 2:30:44 PM GMT+5:30,23,Female,Once a month,Clothing and Fashion,Yes,Few times a month,Keyword,Multiple pages,1,...,Rarely,No,Heavily,Yes,2,Sometimes,3,2,Wide product selection,Reducing packaging waste
2,2023/06/04 5:04:56 PM GMT+5:30,24,Prefer not to say,Few times a month,Groceries and Gourmet Food;Clothing and Fashion,No,Few times a month,Keyword,Multiple pages,2,...,Rarely,No,Occasionally,No,4,No,3,3,Competitive prices,Product quality and accuracy
3,2023/06/04 5:13:00 PM GMT+5:30,24,Female,Once a month,Beauty and Personal Care;Clothing and Fashion;...,Sometimes,Few times a month,Keyword,First page,5,...,Sometimes,Yes,Heavily,Yes,3,Sometimes,3,4,Competitive prices,Product quality and accuracy
4,2023/06/04 5:28:06 PM GMT+5:30,22,Female,Less than once a month,Beauty and Personal Care;Clothing and Fashion,Yes,Few times a month,Filter,Multiple pages,1,...,Rarely,No,Heavily,Yes,4,Yes,2,2,Competitive prices,Product quality and accuracy


In [4]:
demographic_df = df[
    [
        "age",
        "Gender",
        "Purchase_Categories",
        "Cart_Completion_Frequency",
        "Cart_Abandonment_Factors",
    ]
]

demographic_df.head()

Unnamed: 0,age,Gender,Purchase_Categories,Cart_Completion_Frequency,Cart_Abandonment_Factors
0,23,Female,Beauty and Personal Care,Sometimes,Found a better price elsewhere
1,23,Female,Clothing and Fashion,Often,High shipping costs
2,24,Prefer not to say,Groceries and Gourmet Food;Clothing and Fashion,Sometimes,Found a better price elsewhere
3,24,Female,Beauty and Personal Care;Clothing and Fashion;...,Sometimes,Found a better price elsewhere
4,22,Female,Beauty and Personal Care;Clothing and Fashion,Sometimes,High shipping costs


In [6]:
# this takes people who listed multiple purchase categories and
#   creates a row for them for each of their categories
# idk if this is the right way or one hot encoding

demo_df_exploded = demographic_df.assign(
    Purchase_Categories=demographic_df["Purchase_Categories"].str.split(";")
).explode("Purchase_Categories")
demo_df_exploded.head()

Unnamed: 0,age,Gender,Purchase_Categories,Cart_Completion_Frequency,Cart_Abandonment_Factors
0,23,Female,Beauty and Personal Care,Sometimes,Found a better price elsewhere
1,23,Female,Clothing and Fashion,Often,High shipping costs
2,24,Prefer not to say,Groceries and Gourmet Food,Sometimes,Found a better price elsewhere
2,24,Prefer not to say,Clothing and Fashion,Sometimes,Found a better price elsewhere
3,24,Female,Beauty and Personal Care,Sometimes,Found a better price elsewhere


In [17]:
demo_df_exploded["Cart_Completion_Frequency"].value_counts()

Cart_Completion_Frequency
Sometimes    549
Often        347
Rarely       120
Always       103
Never         39
Name: count, dtype: int64

In [18]:
# Convert cart_completion_frequency to ordinal scale (low values indicate higher abandonment)
cart_completion_map = {"Always": 5, "Often": 4, "Sometimes": 3, "Rarely": 2, "Never": 1}
demo_df_exploded["Cart_Abandonment_Score"] = df["Cart_Completion_Frequency"].map(
    cart_completion_map
)

demo_df_exploded.head()

Unnamed: 0,age,Gender,Purchase_Categories,Cart_Completion_Frequency,Cart_Abandonment_Factors,Cart_Abandonment_Score
0,23,Female,Beauty and Personal Care,Sometimes,Found a better price elsewhere,3
1,23,Female,Clothing and Fashion,Often,High shipping costs,4
2,24,Prefer not to say,Groceries and Gourmet Food,Sometimes,Found a better price elsewhere,3
2,24,Prefer not to say,Clothing and Fashion,Sometimes,Found a better price elsewhere,3
3,24,Female,Beauty and Personal Care,Sometimes,Found a better price elsewhere,3


In [22]:
# Define age groups
bins = [18, 25, 35, 45, 55, 65]
labels = ["18-24", "25-34", "35-44", "45-54", "55-64"]
demo_df_exploded["Age_Group"] = pd.cut(
    demo_df_exploded["age"], bins=bins, labels=labels, right=False
)
demo_df_exploded["Age_Group"].value_counts()

Age_Group
25-34    385
18-24    382
35-44    197
45-54    130
55-64     34
Name: count, dtype: int64

In [26]:
# Aggregate cart abandonment by age group, gender, and product category
grouped = (
    demo_df_exploded.groupby(["Age_Group", "Gender", "Purchase_Categories"])[
        "Cart_Abandonment_Score"
    ]
    .mean()
    .reset_index()
)

grouped

  demo_df_exploded.groupby(["Age_Group", "Gender", "Purchase_Categories"])[


Unnamed: 0,Age_Group,Gender,Purchase_Categories,Cart_Abandonment_Score
0,18-24,Female,Beauty and Personal Care,3.180000
1,18-24,Female,Clothing and Fashion,3.250000
2,18-24,Female,Groceries and Gourmet Food,3.461538
3,18-24,Female,Home and Kitchen,3.170732
4,18-24,Female,others,3.000000
...,...,...,...,...
95,55-64,Prefer not to say,Beauty and Personal Care,3.000000
96,55-64,Prefer not to say,Clothing and Fashion,3.000000
97,55-64,Prefer not to say,Groceries and Gourmet Food,
98,55-64,Prefer not to say,Home and Kitchen,3.000000


In [35]:
grouped_highrate = grouped[grouped["Cart_Abandonment_Score"] <= 2]
# a score of 1 or 1 are those who say they rarely or never complete their cart
grouped_highrate

Unnamed: 0,Age_Group,Gender,Purchase_Categories,Cart_Abandonment_Score
11,18-24,Others,Clothing and Fashion,2.0
13,18-24,Others,Home and Kitchen,2.0
32,25-34,Others,Groceries and Gourmet Food,1.666667
54,35-44,Others,others,1.0
78,45-54,Prefer not to say,Home and Kitchen,2.0
79,45-54,Prefer not to say,others,1.0
