In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint, uniform # For RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # Added Gradient Boosting
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit  # chronological CV folds for hyperparameter tuning
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
print("# --- 1. Initializing modules and loading data from dataset ---")

# Single place to change when the raw transactions file lives elsewhere.
# The path is resolved relative to the kernel's current working directory.
DATA_PATH = 'coffee_sales.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Re-raise the same exception type with a message that says what to do;
    # every later cell depends on `df`, so there is nothing sensible to fall
    # back to here.
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}' in the current working directory. "
        "Place the CSV next to this notebook or point DATA_PATH at its location."
    ) from exc

print("Original DataFrame Head:")
print(df.head())
print("\n")

# --- 1. Initializing modules and loading data from dataset ---


FileNotFoundError: [Errno 2] No such file or directory: 'coffee_sales.csv'

In [None]:
print("# --- 2. Feature Engineering (Enhanced) ---")
# Convert both timestamp columns to pandas datetimes so the `.dt` accessor
# can be used on them in the following cells.
for ts_col in ('date', 'datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])


In [None]:
# Derive time-of-day and calendar features from the parsed timestamp columns.
order_time = df['datetime'].dt
order_date = df['date'].dt

df['hour'] = order_time.hour
df['day_of_week'] = order_date.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['month'] = order_date.month
df['day_of_year'] = order_date.dayofyear


In [None]:
# Integer-encode the two categorical columns. Each column keeps its own fitted
# encoder so the code -> label mapping stays available via `classes_`.
le_coffee = LabelEncoder()
le_cash = LabelEncoder()

for encoder, raw_col, encoded_col in (
    (le_coffee, 'coffee_name', 'coffee_type_encoded'),
    (le_cash, 'cash_type', 'cash_type_encoded'),
):
    df[encoded_col] = encoder.fit_transform(df[raw_col])


In [None]:
def _first_mode(values):
    """Return the most frequent value of a Series.

    Ties are resolved in favour of the smallest value (``Series.mode`` returns
    the modes sorted). Returns NaN when the Series has no non-null values, so
    one degenerate day cannot abort the whole aggregation.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Collapse the transaction-level frame into one row per calendar date.
daily_data = df.groupby('date').agg(
    daily_revenue=('money', 'sum'),
    daily_orders=('coffee_type_encoded', 'count'),
    most_popular_coffee=('coffee_type_encoded', _first_mode),
    avg_hour=('hour', 'mean'),
    # Busiest hour of the day = the hour with the most orders. The previous
    # 'max' aggregation returned the latest hour with a sale instead.
    peak_hour=('hour', _first_mode),
    day_of_week=('day_of_week', 'first'),
    month=('month', 'first'),
    day_of_year=('day_of_year', 'first'),
    # NOTE(review): this is the mean of the label-encoded payment type, i.e.
    # the share of whichever class LabelEncoder coded as 1 (classes are coded
    # in sorted order). If the classes are 'card'/'cash' this is the *cash*
    # share, not the card share -- confirm against `le_cash.classes_`.
    card_usage_ratio=('cash_type_encoded', 'mean')
).reset_index()

print("Average daily revenue: ", daily_data['daily_revenue'].mean())


In [None]:
# Lags are positional, so the frame must be in chronological order first.
daily_data = daily_data.sort_values(by='date').reset_index(drop=True)

# NOTE(review): `shift(k)` moves by k *rows*, which equals k calendar days only
# if every date is present in `daily_data` -- confirm there are no gaps.
# NOTE(review): this cell reassigns `daily_data` after dropping rows, so running
# it a second time without re-running the aggregation cell trims further rows.
lag_sources = ('daily_revenue', 'daily_orders')

# Previous-day (lag 1) and same-weekday-last-week (lag 7) values.
for lag in (1, 7):
    for source in lag_sources:
        daily_data[f'{source}_lag{lag}'] = daily_data[source].shift(lag)

# Mean of up to the three preceding rows; the trailing shift(1) keeps the
# current row's own value out of its feature.
for source in lag_sources:
    trailing_mean = daily_data[source].rolling(window=3, min_periods=1).mean()
    daily_data[f'{source}_rolling_mean3'] = trailing_mean.shift(1)

# The first rows have no history for the lags above; discard them.
daily_data = daily_data.dropna().reset_index(drop=True)

print("Aggregated Daily Data with New Features Head:")
print(daily_data.head())
print("\n")

In [None]:
def print_metrics(y_true, y_pred, model_name):
    """Print R², MAE and RMSE for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.
    model_name : str
        Label printed as the heading of the metrics block.

    Returns
    -------
    None
        The metrics are only printed, each rounded to three decimals.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE, putting it back in the target's units.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} Metrics:")
    # The label previously rendered as mojibake ("RÂ²") from a mis-decoded "²".
    print(f"R²: {r2:.3f}")
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}\n")

In [None]:
print("# --- 3. Data Preparation for Modeling ---")

# Inputs for the revenue model. The same day's order count is deliberately
# included as a predictor here.
# NOTE(review): several inputs (daily_orders, avg_hour, peak_hour,
# most_popular_coffee, card_usage_ratio) are aggregates of the same day being
# predicted -- confirm they are available at prediction time.
features_revenue = [
    'daily_orders',
    'most_popular_coffee',
    'avg_hour',
    'peak_hour',
    'day_of_week',
    'month',
    'day_of_year',
    'card_usage_ratio',
    'daily_revenue_lag1',
    'daily_orders_lag1',
    'daily_revenue_lag7',
    'daily_orders_lag7',
    'daily_revenue_rolling_mean3',
    'daily_orders_rolling_mean3',
]
y_revenue = daily_data['daily_revenue']
X_revenue = daily_data[features_revenue]

# Inputs for the demand model. `daily_orders` is the target, so it is left out,
# and only the order-count history (not the revenue history) is used.
features_demand = [
    'most_popular_coffee',
    'avg_hour',
    'peak_hour',
    'day_of_week',
    'month',
    'day_of_year',
    'card_usage_ratio',
    'daily_orders_lag1',
    'daily_orders_lag7',
    'daily_orders_rolling_mean3',
]
y_demand = daily_data['daily_orders']
X_demand = daily_data[features_demand]

# Chronological hold-out: the first 80% of rows train, the last 20% test.
# No shuffling, so the test period lies strictly after the training period.
split_index = int(len(daily_data) * 0.8)

X_revenue_train = X_revenue.iloc[:split_index]
X_revenue_test = X_revenue.iloc[split_index:]
y_revenue_train = y_revenue.iloc[:split_index]
y_revenue_test = y_revenue.iloc[split_index:]

X_demand_train = X_demand.iloc[:split_index]
X_demand_test = X_demand.iloc[split_index:]
y_demand_train = y_demand.iloc[:split_index]
y_demand_test = y_demand.iloc[split_index:]

print(f"Train set size: {len(X_revenue_train)} samples")
print(f"Test set size: {len(X_revenue_test)} samples")

# Standardise features. Each scaler learns its statistics from the training
# rows only and is then applied unchanged to the test rows.
scaler_revenue = StandardScaler().fit(X_revenue_train)
X_revenue_train_scaled = scaler_revenue.transform(X_revenue_train)
X_revenue_test_scaled = scaler_revenue.transform(X_revenue_test)

scaler_demand = StandardScaler().fit(X_demand_train)
X_demand_train_scaled = scaler_demand.transform(X_demand_train)
X_demand_test_scaled = scaler_demand.transform(X_demand_test)
print("\nFeatures scaled.\n")

In [None]:
# Section banner: the cells below fit and score each model on the hold-out split.
print("# --- 4. Model Training and Evaluation (Improved) ---")

In [None]:
print("--- Linear Regression ---")
# Ordinary least-squares baselines, one per target, scored on the hold-out rows.
lr_revenue = LinearRegression().fit(X_revenue_train_scaled, y_revenue_train)
lr_revenue_pred = lr_revenue.predict(X_revenue_test_scaled)
print_metrics(y_revenue_test, lr_revenue_pred, "Revenue (Linear Regression)")

lr_demand = LinearRegression().fit(X_demand_train_scaled, y_demand_train)
lr_demand_pred = lr_demand.predict(X_demand_test_scaled)
print_metrics(y_demand_test, lr_demand_pred, "Demand (Linear Regression)")

print("--- Random Forest Regressor (with Hyperparameter Tuning) ---")

# Search space shared by both random-forest tuning cells below.
# scipy's `randint(low, high)` draws integers from [low, high), so the upper
# bounds listed here are exclusive.
param_dist = dict(
    n_estimators=randint(100, 500),
    max_features=['sqrt', 'log2', None],  # 'auto' is deprecated; 'sqrt' replaces it
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

In [None]:
print("Tuning Random Forest for Revenue Prediction...")
rf_revenue = RandomForestRegressor(random_state=42)

# The training rows are in chronological order and include lagged features.
# An integer `cv` would use contiguous K-fold blocks, so most folds would be
# validated on rows that precede part of their training data. TimeSeriesSplit
# instead validates each fold only on rows that come after its training rows,
# matching how the final hold-out test is structured.
cv_revenue = TimeSeriesSplit(n_splits=5)

rand_search_revenue = RandomizedSearchCV(
    estimator=rf_revenue,
    param_distributions=param_dist,
    n_iter=50, # Number of parameter settings that are sampled
    cv=cv_revenue, # 5 forward-chaining (expanding-window) folds
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1, # Use all available cores
    verbose=0
)
rand_search_revenue.fit(X_revenue_train_scaled, y_revenue_train)
# `best_estimator_` is already refit on the full training set with the
# best-scoring parameters found by the search.
best_rf_revenue = rand_search_revenue.best_estimator_
print(f"Best parameters for Revenue RF: {rand_search_revenue.best_params_}")

rf_revenue_pred = best_rf_revenue.predict(X_revenue_test_scaled)
print_metrics(y_revenue_test, rf_revenue_pred, "Revenue (Tuned Random Forest)")


In [None]:
print("Tuning Random Forest for Demand Prediction...")
rf_demand = RandomForestRegressor(random_state=42)

# The training rows are in chronological order and include lagged features.
# An integer `cv` would use contiguous K-fold blocks, so most folds would be
# validated on rows that precede part of their training data. TimeSeriesSplit
# instead validates each fold only on rows that come after its training rows,
# matching how the final hold-out test is structured.
cv_demand = TimeSeriesSplit(n_splits=5)

rand_search_demand = RandomizedSearchCV(
    estimator=rf_demand,
    param_distributions=param_dist,
    n_iter=50, # Number of parameter settings that are sampled
    cv=cv_demand, # 5 forward-chaining (expanding-window) folds
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=0
)
rand_search_demand.fit(X_demand_train_scaled, y_demand_train)
# `best_estimator_` is already refit on the full training set with the
# best-scoring parameters found by the search.
best_rf_demand = rand_search_demand.best_estimator_
print(f"Best parameters for Demand RF: {rand_search_demand.best_params_}")

rf_demand_pred = best_rf_demand.predict(X_demand_test_scaled)
print_metrics(y_demand_test, rf_demand_pred, "Demand (Tuned Random Forest)")

In [None]:
print("--- Optional: Gradient Boosting Regressor ---")
# Untuned gradient-boosting models fitted on the same scaled features, as a
# further comparison point. Both targets use the same fixed settings.
gbr_settings = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Revenue target
gbr_revenue = GradientBoostingRegressor(**gbr_settings).fit(X_revenue_train_scaled, y_revenue_train)
gbr_revenue_pred = gbr_revenue.predict(X_revenue_test_scaled)
print_metrics(y_revenue_test, gbr_revenue_pred, "Revenue (Gradient Boosting Regressor)")

# Demand target
gbr_demand = GradientBoostingRegressor(**gbr_settings).fit(X_demand_train_scaled, y_demand_train)
gbr_demand_pred = gbr_demand.predict(X_demand_test_scaled)
print_metrics(y_demand_test, gbr_demand_pred, "Demand (Gradient Boosting Regressor)")

In [None]:
print("# --- 5. Visualization of Predictions ---")

# Dates of the hold-out rows; both panels share them as the x-axis.
test_dates = daily_data['date'][split_index:]

# One entry per panel: (actual values, tuned-RF predictions, target noun, y-axis label).
panels = [
    (y_revenue_test, rf_revenue_pred, 'Revenue', 'Revenue'),
    (y_demand_test, rf_demand_pred, 'Demand', 'Orders'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, (actual, predicted, target, y_label) in zip(axes, panels):
    ax.plot(test_dates, actual, label=f'Actual {target}')
    ax.plot(test_dates, predicted, label=f'RF Predicted {target}', alpha=0.7)
    ax.set_title(f'Daily {target}: Actual vs. RF Predicted')
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.tick_params(axis='x', labelrotation=45)  # slanted date labels to limit overlap
    ax.grid(True)

fig.tight_layout()
plt.show()