In [22]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [13]:
# Load the daily sales CSV and convert its 'date' column to datetime in one
# chained expression, so `df` is only ever bound to the fully prepared frame.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index - confirm whether it is needed.
df = pd.read_csv('../data/daily_sales.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [14]:
# Preview the first five rows to sanity-check the columns and parsed dates.
df.head(n=5)

Unnamed: 0,date,total_sales,year,month,day,dayofweek,is_weekend,sales_yesterday,sales_last_week,sales_avg_7d,sales_avg_30d
0,2013-01-01,2511.618999,2013,1,1,1,0,,,,
1,2013-01-02,496092.417944,2013,1,2,2,0,2511.618999,,,
2,2013-01-03,361461.231124,2013,1,3,3,0,496092.417944,,,
3,2013-01-04,354459.677093,2013,1,4,4,0,361461.231124,,,
4,2013-01-05,477350.121229,2013,1,5,5,1,354459.677093,,,


In [15]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(1684, 11)

In [16]:
# Columns fed to the models as predictors; 'total_sales' is the target and is
# deliberately absent from this list.
feature_cols = [
    # Columns named after calendar parts of the date.
    'year',
    'month',
    'day',
    'dayofweek',
    'is_weekend',
    # Columns named after earlier / averaged sales values (presumably lag and
    # rolling-window features - confirm against the feature-building step).
    'sales_yesterday',
    'sales_last_week',
    'sales_avg_7d',
    'sales_avg_30d',
]

In [18]:
# Drop every row that has a missing value in any column (the defaults of
# dropna, spelled out here), leaving `df` itself untouched.
df_clean = df.dropna(axis=0, how='any')
print('Shape of the cleaned data:', df_clean.shape)

Shape of the cleaned data: (1655, 11)


In [19]:
# Keep only rows where the date, every model input and the target are present.
# Checking just these columns means a gap in a column the model never reads
# cannot silently shrink the training data.
model_cols = ['date', *feature_cols, 'total_sales']
df_clean = df.dropna(subset=model_cols)
print(f"Clean data shape: {df_clean.shape}")

# Chronological train/test split: rows dated on or after
# (latest date - TEST_WINDOW_DAYS) form the hold-out set, everything earlier
# is used for training. The boundary date itself belongs to the test set.
TEST_WINDOW_DAYS = 30
split_date = df_clean['date'].max() - pd.DateOffset(days=TEST_WINDOW_DAYS)
train_data = df_clean[df_clean['date'] < split_date]
test_data = df_clean[df_clean['date'] >= split_date]

# Fail here with a clear message rather than later inside model.fit().
assert len(train_data) > 0, "Train split is empty - check the date range of the data"
assert len(test_data) > 0, "Test split is empty - check the date range of the data"
assert len(train_data) + len(test_data) == len(df_clean), "Split lost or duplicated rows"

X_train = train_data[feature_cols]
y_train = train_data['total_sales']
X_test = test_data[feature_cols]
y_test = test_data['total_sales']

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

Clean data shape: (1655, 11)
Train: 1624, Test: 31


In [20]:
# Candidate regressors, keyed by the label used in the printed report.
models = {
    'Linear': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42),
}

# Hold-out mean absolute error per candidate, filled in by the loop below.
results = {}
for name, model in models.items():
    # fit() hands back the fitted estimator, so training and prediction chain;
    # predictions are floored at zero before they are scored.
    pred = np.maximum(model.fit(X_train, y_train).predict(X_test), 0)

    mae = mean_absolute_error(y_test, pred)
    results[name] = mae

    print(f"{name} MAE: ${mae:,.0f}")

Linear MAE: $64,902
Random Forest MAE: $51,007


In [21]:
# The candidate with the lowest hold-out MAE wins; on a tie the entry that was
# inserted into `results` first is kept.
best_model_name = min(results.items(), key=lambda item: item[1])[0]
best_model = models[best_model_name]

In [23]:
# Persist the selected estimator. The target directory is created first so the
# cell also works on a fresh checkout where '../models' does not exist yet
# (joblib.dump does not create missing directories).
model_path = Path('../models/best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

print(f"Best model: {best_model_name}")
print("Model saved!")

Best model: Random Forest
Model saved!
