In [11]:
# train_model.py
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def _prepare_features(df):
    """Split a raw inventory frame into a feature matrix and target series.

    The frame must contain the columns 'Date', 'Store ID', 'Product ID'
    and 'Units Sold'; a missing column raises ``KeyError``.

    Returns:
        A ``(X, y)`` tuple where ``X`` holds every remaining column after
        one-hot encoding and ``y`` is the 'Units Sold' column.
    """
    # Calendar parts replace the raw timestamp. ``assign`` appends them in
    # this order, which fixes the column order the saved feature list records.
    dates = pd.to_datetime(df['Date'], dayfirst=True)
    df = df.assign(
        Day=dates.dt.day,
        Month=dates.dt.month,
        Year=dates.dt.year,
    )

    # Drop unused or high cardinality fields
    df = df.drop(['Date', 'Store ID', 'Product ID'], axis=1)

    # One-hot encode the remaining categorical columns; drop_first removes
    # one level per category to avoid a redundant indicator column.
    df = pd.get_dummies(df, drop_first=True)

    X = df.drop(['Units Sold'], axis=1)
    y = df['Units Sold']
    return X, y


def train_model(data_path='retail_store_inventory.csv',
                model_path='xgb_demand_model.pkl',
                features_path='model_features.pkl'):
    """Train an XGBoost regressor for 'Units Sold' and persist it.

    Reads the CSV at ``data_path``, builds features, fits the model on an
    80/20 train/test split (``random_state=42``), prints the test-set MSE,
    and writes two joblib artifacts: the fitted model and the training
    feature columns (so that inference code can align its inputs).

    Args:
        data_path: CSV file with the raw inventory records.
        model_path: Destination for the serialized fitted model.
        features_path: Destination for the serialized feature column index.

    Returns:
        A ``(model, mse)`` tuple: the fitted ``XGBRegressor`` and the mean
        squared error measured on the held-out 20% split.
    """
    df = pd.read_csv(data_path)
    X, y = _prepare_features(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model trained. MSE: {mse:.2f}")

    joblib.dump(model, model_path)
    # Persist the exact training columns (names and order) next to the model.
    joblib.dump(X.columns, features_path)

    return model, mse

# Script entry point: train with the default arguments when this file is
# executed directly; importing the module does not trigger training.
if __name__ == "__main__":
    train_model()


Model trained. MSE: 70.75
