In [1]:
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Add utility path
# The helpers live in <cwd>/../../src/utils, resolved from the kernel's current
# working directory, so this lookup depends on where the notebook is started.
utils_dir = str((Path().resolve().parent.parent / "src" / "utils").resolve())
if utils_dir not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(utils_dir)
from load_data import load_csv, save_csv

# -------------------------------
# 📥 Load Raw Data
# -------------------------------
df = load_csv("sessions.csv")
# A notebook only renders the *last* expression of a cell; these previews sit
# mid-cell, so they are passed to display() (the builtin the IPython kernel
# provides) instead of being evaluated and silently thrown away.
display(df.head())

# -------------------------------
# 🔍 Initial Exploration
# -------------------------------
df.info()  # prints its own report, no display() needed
display(df.isna().sum())            # missing-value count per column
display(df.describe(include='all'))  # summary statistics for every column

# -------------------------------
# 🧹 Data Cleaning
# -------------------------------

# Parse the session timestamp; values that cannot be parsed become NaT
df['session_start'] = pd.to_datetime(df['session_start'], errors='coerce')

# Normalise the categorical text fields: trim whitespace, then Title Case
for label_col in ('device', 'source'):
    df[label_col] = df[label_col].str.strip().str.title()

# Keep only rows that have both essential fields populated
required_cols = ['customer_id', 'session_start']
df = df.dropna(subset=required_cols)

# -------------------------------
# 🧠 Feature Engineering
# -------------------------------

# Day of week & hour, derived from the session timestamp
df['session_day'] = df['session_start'].dt.day_name()
df['session_hour'] = df['session_start'].dt.hour

# Flag long sessions (>15 min)
# NOTE(review): with a plain float column a missing duration compares as
# False, i.e. it is flagged "not long" — confirm that is intended.
df['long_session'] = df['session_duration_min'] > 15

# Load cleaned customers and attach region/gender to every session.
customers_df = load_csv("customers_clean.csv")
# validate='many_to_one' makes pandas raise a MergeError if customer_id is
# duplicated on the customer side; without it a duplicate would silently
# multiply session rows in the merged (and later saved) frame.
df = df.merge(
    customers_df[['customer_id', 'region', 'gender']],
    on='customer_id',
    how='left',
    validate='many_to_one',
)



In [None]:
# -------------------------------
# 📊 Exploratory Data Analysis
# -------------------------------

# Distributions of the numeric session metrics: (column, bins, title)
for metric, n_bins, heading in (
    ('session_duration_min', 30, "Session Duration (Minutes)"),
    ('pages_viewed', 20, "Pages Viewed Per Session"),
):
    sns.histplot(df[metric], bins=n_bins)
    plt.title(heading)
    plt.show()

# Session counts per category: device usage, then traffic source
for category, heading in (
    ('device', "Devices Used"),
    ('source', "Traffic Sources"),
):
    sns.countplot(y=category, data=df)
    plt.title(heading)
    plt.show()

# Engagement by weekday, boxes laid out Monday through Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.boxplot(x='session_day', y='session_duration_min', data=df, order=weekday_order)
plt.title("Session Duration by Day of Week")
plt.xticks(rotation=45)
plt.show()

# -------------------------------
# 💾 Save Cleaned File
# -------------------------------
# Persist the cleaned, feature-enriched sessions frame via the project helper.
clean_filename = "sessions_clean.csv"
save_csv(df, clean_filename)
