In [None]:
import pandas as pd

# Read the raw dataset into `df`; point `raw_csv_path` at your own CSV if it differs.
raw_csv_path = "../data/ifood_df.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell output
df.head()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple
print(f"Rows, Columns: {df.shape}")

# Every column label, as a plain Python list
print(f"Columns: {list(df.columns)}")

# Per-column dtype / non-null report (info() prints it directly)
df.info()

In [None]:
# Summary statistics of df, shown as the cell output
summary_stats = df.describe()
summary_stats

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Visual check of missing values: pass the whole frame to missingno's matrix plot.
import missingno as msno
msno.matrix(df)


In [None]:
# Understand customer demographics and purchase behaviors:
# save one histogram (with KDE overlay) per key numeric column.
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set(style="whitegrid")

# Select key columns
num_cols = ["Income", "Age", "Recency", "MntTotal", "NumWebVisitsMonth"]

# Make sure the output folder exists; savefig raises FileNotFoundError otherwise.
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)

# Plot distributions
for col in num_cols:
    # Explicit figure/axes objects so each plot is drawn on, saved from,
    # and closed as exactly the figure created in this iteration.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    fig.savefig(reports_dir / f"{col.lower()}_distribution.png", dpi=300, bbox_inches="tight")
    plt.close(fig)  # prevents too many figures from showing / piling up in memory


In [None]:
# Outlier detection: min / quartiles / max for each key numeric column,
# arranged with one row per column so extreme values are easy to spot.
five_number_stats = ["min", "25%", "50%", "75%", "max"]
outlier_summary = df[num_cols].describe().loc[five_number_stats].T

# Persist the table to the reports folder
outlier_summary.to_excel("../reports/outlier_summary.xlsx")

# Preview the table as the cell output
outlier_summary

In [None]:
# Descriptive statistics for the numerical columns, transposed so that
# each column of df becomes one row of the printed table.
print("Descriptive Statistics for Numerical Data:")
numeric_summary = df.describe().transpose()
print(numeric_summary)

In [None]:
# Keep only rows whose Age is at most 100 (drops the implausible outlier)
plausible_age = df['Age'].le(100)
df = df[plausible_age]

# Re-check the maximum Age to confirm the outlier is gone
print("Max Age after cleaning:", df['Age'].max())

In [None]:
# Rebuild 'MntRegularProds' from its component spend columns, because the
# existing column contains a negative value.
component_cols = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
]

# Add the components left to right, one column at a time
regular_total = df[component_cols[0]]
for component in component_cols[1:]:
    regular_total = regular_total + df[component]

# Replace the old column with the recomputed one
df = df.drop(columns='MntRegularProds')
df['MntRegularProds'] = regular_total

# The minimum should now be non-negative
print("Min value of new MntRegularProds:", df['MntRegularProds'].min())

In [None]:
# Step 4: Feature Engineering & Data Preparation


def _collapse_one_hot(frame, columns, prefix):
    """Collapse a group of 0/1 indicator columns into a single label column.

    Parameters
    ----------
    frame : DataFrame holding the indicator columns.
    columns : list of indicator column names belonging to one category group.
    prefix : text removed from the winning column name to form the label.

    Returns
    -------
    Series of labels aligned to ``frame``. Rows in which no indicator equals 1
    are left missing instead of silently receiving the first column's label,
    which is what a bare ``idxmax`` would return for an all-zero row.
    """
    flags = frame[columns]
    labels = flags.idxmax(axis=1).str.replace(prefix, '', regex=False)
    return labels.where(flags.eq(1).any(axis=1))


# 1. Convert 'Customer_Days' into whole years.
# 'Age' already exists in df (it is plotted and filtered in earlier cells), so the
# derived value gets its own column instead of overwriting the real ages.
# NOTE(review): 'Customer_Days' presumably counts days since enrollment - confirm
# against the data dictionary.
df['Customer_Years'] = (df['Customer_Days'] / 365).round().astype(int)

# 2. Combine one-hot encoded 'marital' status columns into a single column
# This simplifies the data and makes it easier to work with.
marital_cols = ['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow']
df['Marital_Status'] = _collapse_one_hot(df, marital_cols, 'marital_')

# 3. Combine one-hot encoded 'education' columns into a single column
education_cols = ['education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD']
df['Education'] = _collapse_one_hot(df, education_cols, 'education_')

# 4. Create a single 'Kids' column from 'Kidhome' and 'Teenhome'
# This gives a quick overview of the total number of children in a household.
df['Kids'] = df['Kidhome'] + df['Teenhome']

# 5. Create 'TotalPurchases' from different purchase types
# Summing up all purchase columns gives us a key performance indicator (KPI)
# for customer engagement.
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
df['TotalPurchases'] = df[purchase_cols].sum(axis=1)

# 6. Create 'TotalCampaigns' from all campaign responses
# This is another useful KPI to measure a customer's overall responsiveness to marketing efforts.
cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
df['TotalCampaigns'] = df[cmp_cols].sum(axis=1)

# The comments in the original notebook note that 'MntTotal' and 'AcceptedCmpOverall'
# already ship with the dataset; it is good practice to verify they are correctly calculated.

# Display the first few rows with the new columns to verify the changes
print(df[['Age', 'Customer_Years', 'Marital_Status', 'Education', 'Kids', 'TotalPurchases', 'TotalCampaigns']].head())

# Save the updated DataFrame
# It is best practice to save the prepared data to a new file so you don't
# accidentally modify your original raw dataset.
updated_csv_path = '../data/ifood_df_updated.csv'
df.to_csv(updated_csv_path, index=False)
# Report the path that was actually written (the old message named a different one).
print(f"Data has been updated and saved to '{updated_csv_path}'.")

In [None]:
# -----------------------------------------------------------
# Step 5: Exploratory Data Analysis (EDA) - Part 1: Summary Statistics
# -----------------------------------------------------------

# Load the updated dataset you saved in the previous step
df = pd.read_csv('../data/ifood_df_updated.csv')

# Display basic information about the DataFrame, including data types
# This helps confirm that your new columns were created correctly
print("DataFrame Info:")
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("\n------------------------------------------------\n")

# Generate descriptive statistics for numerical columns
# This gives a quick overview of the central tendency and spread of the data
print("Descriptive Statistics:")
print(df.describe())