In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load the data and plot the Income / TotalPurchases distributions ---
# The updated CSV file is expected in the 'data' folder one level up.
df = pd.read_csv('../data/ifood_df_updated.csv')

# Global seaborn look: white grid background plus the project colors,
# registered as the default palette for every later seaborn plot.
sns.set_style("whitegrid")
custom_palette = ["#067A46", "#242424", "#FFFFFF", "#90B33A"]
sns.set_palette(custom_palette)

# One row, two panels side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (column, panel title, x-axis label) for each panel, left to right.
histogram_specs = [
    ('Income', 'Distribution of Customer Income', 'Income ($)'),
    ('TotalPurchases', 'Distribution of Total Purchases', 'Total Purchases'),
]

for panel, (column, panel_title, x_label) in zip(axes, histogram_specs):
    # 30-bin histogram with a KDE overlay, drawn in the first palette color.
    sns.histplot(df[column], bins=30, kde=True, ax=panel, color=custom_palette[0])
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Customers')

# Tighten the spacing, then write the figure into the 'reports' folder.
fig.tight_layout()
fig.savefig('../reports/eda_histograms.png')

print("Histograms have been generated and saved to 'reports/eda_histograms.png'.")

In [None]:
# --- Correlation heatmap of the remaining customer features ---
# Columns left out of the correlation: the one-hot encoded marital_* and
# education_* indicators plus other columns treated here as
# non-numerical or not useful for this view.
excluded_columns = [
    'Z_CostContact', 'Z_Revenue', 'Customer_Days', 'Complain',
    'marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow',
    'education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD',
    'Marital_Status', 'Education'
]
df_corr = df.drop(columns=excluded_columns)

# numeric_only=True keeps the computation working if a non-numeric column
# survives the drop list above (pandas >= 2.0 raises on such columns by default).
correlation_matrix = df_corr.corr(numeric_only=True)

# Pin the color scale to the full correlation range [-1, 1]. Without this the
# diverging 'coolwarm' map is stretched over the observed min/max, so its
# neutral color would not correspond to zero correlation.
plt.figure(figsize=(18, 15))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')

# Save the plot in the 'reports' folder
plt.savefig('../reports/correlation_heatmap.png')

print("Correlation heatmap has been generated and saved to 'reports/correlation_heatmap.png'.")

In [None]:
# --- Web visits vs. web purchases ---
# Keep only the rows whose Income is below 100000 (drops the high-income
# outlier for a cleaner visualization). Note: this rebinds `df` itself.
df = df.loc[df['Income'] < 100000]

# Visualization style
sns.set_style("whitegrid")

# Scatter plot with a fitted regression line; points are semi-transparent.
web_fig, web_ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='NumWebVisitsMonth',
    y='NumWebPurchases',
    scatter_kws={'alpha': 0.6},
    ax=web_ax,
)
web_ax.set_title('Relationship Between Web Visits and Web Purchases')
web_ax.set_xlabel('Number of Web Visits per Month')
web_ax.set_ylabel('Number of Web Purchases')
web_fig.savefig('../reports/web_visits_vs_purchases.png')

print("Scatter plot has been generated and saved to 'reports/web_visits_vs_purchases.png'.")

In [None]:
# --- Deals used vs. total spending ---
# Keep only the rows whose Income is below 100000 (drops the high-income
# outlier for a cleaner visualization). Note: this rebinds `df` itself.
income_below_cap = df['Income'] < 100000
df = df[income_below_cap]

# Visualization style
sns.set_style("whitegrid")

# Scatter plot with a fitted regression line; points are semi-transparent.
deals_fig, deals_ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='NumDealsPurchases',
    y='MntTotal',
    scatter_kws={'alpha': 0.6},
    ax=deals_ax,
)
deals_ax.set(
    title='Relationship Between Deals Used and Total Spending',
    xlabel='Number of Deals Purchases',
    ylabel='Total Amount Spent ($)',
)
deals_fig.savefig('../reports/deals_vs_spending.png')

print("Scatter plot has been generated and saved to 'reports/deals_vs_spending.png'.")

In [None]:
# --- Total spending by education level ---
# Keep only the rows whose Income is below 100000 (drops the high-income
# outlier for a cleaner visualization). Note: this rebinds `df` itself.
df = df.loc[df['Income'] < 100000]

# Project colors, registered as the seaborn default palette.
custom_palette = ["#067A46", "#242424", "#FFFFFF", "#90B33A"]
sns.set_palette(custom_palette)

# Visualization style
sns.set_style("whitegrid")

# One box per education level showing the MntTotal distribution. This plot
# passes palette='viridis' explicitly, so it does not use the default palette.
edu_fig, edu_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=df,
    x='Education',
    y='MntTotal',
    hue='Education',
    palette='viridis',
    ax=edu_ax,
)
edu_ax.set_title('Total Spending Distribution by Education Level')
edu_ax.set_xlabel('Education Level')
edu_ax.set_ylabel('Total Amount Spent ($)')
edu_fig.savefig('../reports/education_vs_spending.png')

print("Box plot has been generated and saved to 'reports/education_vs_spending.png'.")

In [None]:
# --- Total spending by number of children ---
# Keep only the rows whose Income is below 100000 (drops the high-income
# outlier for a cleaner visualization). Note: this rebinds `df` itself.
income_below_cap = df['Income'] < 100000
df = df[income_below_cap]

# One box per value of 'Kids' showing the MntTotal distribution.
# `custom_palette` is not defined in this cell; it must already exist from
# an earlier cell.
kids_fig, kids_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='Kids',
    y='MntTotal',
    hue='Kids',
    palette=custom_palette,
    ax=kids_ax,
)
kids_ax.set(
    title='Total Spending Distribution by Number of Children',
    xlabel='Number of Children in Household',
    ylabel='Total Amount Spent ($)',
)
kids_fig.savefig('../reports/kids_vs_spending.png')

print("Box plot has been generated and saved to 'reports/kids_vs_spending.png'.")

In [None]:
# --- Total purchases per channel ---
# Keep only the rows whose Income is below 100000 (drops the high-income
# outlier for a cleaner visualization). Note: this rebinds `df` itself.
df = df.loc[df['Income'] < 100000]

# Channel label -> purchase-count column, in the order the bars are drawn.
channel_columns = {
    'Web': 'NumWebPurchases',
    'Catalog': 'NumCatalogPurchases',
    'Store': 'NumStorePurchases',
}

# One row per channel holding the column total over the filtered frame.
channel_purchases = pd.DataFrame({
    'Channel': list(channel_columns),
    'Purchases': [df[column].sum() for column in channel_columns.values()],
})

# Bar chart of the per-channel totals.
channel_fig, channel_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=channel_purchases, x='Channel', y='Purchases', ax=channel_ax)
channel_ax.set_title('Total Purchases by Channel')
channel_ax.set_xlabel('Purchase Channel')
channel_ax.set_ylabel('Total Number of Purchases')
channel_fig.savefig('../reports/purchases_by_channel.png')

print("Bar chart has been generated and saved to 'reports/purchases_by_channel.png'.")