In [None]:
import pandas as pd

# Load the dataset and print a quick structural overview of it.
df = pd.read_csv('../data/shopping_behavior_updated.csv')
print("\n A look into first five rows from dataset: ")
print(df.head())

print("\n how big is our dataset? ")
print(df.shape)

print("\n concise summary of the dataframe: ")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
df.info()

print("\n statistical summary of numerical columns: ")
print(df.describe())

In [None]:
# check for null values and see if they exist 
import pandas as pd

# Count the missing entries in every column; all zeros means nothing is missing.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# for theme 1: customer demographics analysis

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("what is the age group of my clients ")
# Bucket customers into labelled age bands and plot how many fall in each band.
# pd.cut closes bins on the right by default, which would file an 18-year-old
# under '<18' and a 25-year-old under '18-24'. right=False makes every bin
# [lower, upper) so each label matches the ages it actually contains.
# The last edge is infinity so that '65+' is genuinely open-ended.
age_bin_edges = [0, 18, 25, 35, 45, 55, 65, float('inf')]
age_bin_labels = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels, right=False)
plt.figure(figsize=(10,6))
sns.set_style('darkgrid')
sns.histplot(data=df['AgeGroup'], bins=7, color='blue')
plt.title('Age Group Distribution of Customers')
plt.xlabel('Age Groups')
plt.ylabel('Number of Customers')
plt.show()

print("\n are males or females more prone to buy from my store ")
# One bar per gender, bar height = number of rows with that gender.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Gender', palette='colorblind', ax=ax)
ax.set_title('Gender Distribution of Customers')
plt.show()

print("\n where are my top clients from: ")
print("Top 10 customer locations by count:")
# value_counts() sorts most-frequent first; reuse it for the table and bar order.
location_counts = df['Location'].value_counts()
print(location_counts.head(10))

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    x='Location',
    palette='viridis',
    order=location_counts.index,
    ax=ax,
)
ax.set_title('Location Distribution of Customers')
ax.set_xlabel('Location')
ax.set_ylabel('Number of Customers')
# Slant the tick labels so the many location names stay legible.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()




In [None]:
# Theme 2: Purchase Behavior 


import pandas as pd
import matplotlib.pyplot as plt     
import seaborn as sns

print("theme 2: purchase behavior analysis ")
print("\n what is the total or biggest amount spent in purchases")
# Keep dataset-wide aggregates as plain variables. Assigning a scalar to a
# DataFrame column repeats it on every row and pollutes later summaries of df.
total_spent = df['Purchase Amount (USD)'].sum()
print(f"Total amount spent by all customers: ${total_spent:.2f}")

print("\n how frequently my clients make purchases")
# value_counts() is indexed by the frequency labels, not by row number, so
# storing it as a df column would align on mismatched indexes and yield NaN.
# Hold it as its own Series and reuse it for the table and the bar order.
frequency_counts = df['Frequency of Purchases'].value_counts()
print(frequency_counts)
plt.figure(figsize=(10,6))
sns.set_style('whitegrid')
sns.countplot(data=df, x='Frequency of Purchases', order=frequency_counts.index, palette='pastel')
plt.title('Purchase Frequency Distribution')
plt.xlabel('Frequency of Purchases')
plt.ylabel('Number of Customers')
plt.show()

print("\n what is the average purchase amount")
avg_purchase_amount = df['Purchase Amount (USD)'].mean()
print(f"Average purchase amount: ${avg_purchase_amount:.2f}")

print("\n are males or females more prone to spend more")

# Show the group sizes first, then the purchase amount per gender
# (barplot aggregates the y values within each x category).
gender_counts = df['Gender'].value_counts()
print(gender_counts)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=df,
    x='Gender',
    y='Purchase Amount (USD)',
    hue='Gender',
    palette='Set2',
    ax=ax,
)
ax.set_title('Spending amounts by Gender ')
ax.set_xlabel('Gender')
ax.set_ylabel('Purchase Amount (USD)')
plt.show()

In [None]:
# Theme 3: Product & Category Insights 


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("Theme 3: Product & Category Insights")
print("\n what is the most bought item, from which category: ")
# The ten most frequent items, most-bought first; reused as the bar order below.
top_items = df['Item Purchased'].value_counts().head(10)
print(top_items)

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Item Purchased', order=top_items.index, palette='magma', ax=ax)
ax.set_title('Top 10 Most Bought Items')
ax.set_xlabel('Item Purchased')
ax.set_ylabel('Number of Purchases')
plt.xticks(rotation=45, ha='right')
plt.show()

# Same count view, one level up: purchases per category, most popular first.
category_order = df['Category'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Category', order=category_order, palette='coolwarm', ax=ax)
ax.set_title('Most Bought Items by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Purchases')
plt.show()

print("\n what are the most bought sizes, colours for future marketing")
print("Top 5 most bought colors:")
# Count once per attribute; the sorted index doubles as the bar order.
color_counts = df['Color'].value_counts()
print(color_counts.head(5))
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Color', order=color_counts.index, palette='Set3', ax=ax)
ax.set_title('Most Bought Colors')
ax.set_xlabel('Color')
ax.set_ylabel('Number of Purchases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Top 5 most bought sizes:")
size_counts = df['Size'].value_counts()
print(size_counts.head(5))
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Size', order=size_counts.index, palette='Set1', ax=ax)
ax.set_title('Most Bought Sizes')
ax.set_xlabel('Size')
ax.set_ylabel('Number of Purchases')
plt.show()

print("\n total amount per category ")
# Sum the purchase amounts within each category. The result is indexed by
# category name, so it must stay a standalone Series: assigning it to a df
# column would align on mismatched indexes and produce an all-NaN column.
category_totals = (
    df.groupby('Category')['Purchase Amount (USD)']
    .sum()
    .sort_values(ascending=False)
)
print(category_totals)
plt.figure(figsize=(12,6))
# Plot the precomputed totals directly: one value per bar, so no estimator or
# error-bar arguments are needed and the chart matches the printed table.
sns.barplot(x=category_totals.index, y=category_totals.values, palette='Blues_d')
plt.title('Total Amount Spent per Category')
plt.xlabel('Category')
plt.ylabel('Total Purchase Amount (USD)')
plt.show()


In [None]:
# Theme 4: Seasonal & Feedback Analysis 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("Theme 4: Seasonal & Feedback Analysis")
print("\n what season has the most purchases or purchases spike and dip")
# Purchases per season, busiest first; the same ordering drives the bar chart.
season_counts = df['Season'].value_counts()
print(season_counts.head(5))
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Season', order=season_counts.index, palette='Set3', ax=ax)
ax.set_title('Most Bought Seasons')
ax.set_xlabel('Season')
ax.set_ylabel('Number of Purchases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("\n what are my top reviews")
print("Top 5 Review Ratings:")
# The five rating values that occur most often.
rating_counts = df['Review Rating'].value_counts()
print(rating_counts.head(5))

print("\n what are the preferred payment/shipping methods")
print("Preferred Payment Methods:")
payment_counts = df['Payment Method'].value_counts()
print(payment_counts)
