# In [3]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Read the training data from the CSV file next to this script/notebook.
df = pd.read_csv("./Train-Set.csv")

# Split the frame into the regression target ("OutletSales") and the
# remaining columns, which serve as candidate features.
y = df["OutletSales"]
X = df.drop(columns="OutletSales")

# Columns treated as categorical labels (ordinal-encoded by the preprocessor).
# The order of each list fixes the column order the model sees.
categorical_cols = [
    # product-related names
    'ProductID', 'FatContent', 'ProductType',
    # outlet-related names
    'OutletID', 'OutletSize', 'LocationType', 'OutletType',
]

# Columns treated as numeric and handed to the model unchanged.
numerical_cols = ['Weight', 'ProductVisibility', 'MRP', 'EstablishmentYear']

# Preprocessing.
# Categorical columns are mapped to integer codes; a category that was not
# present when the encoder was fitted is encoded as -1 instead of raising.
categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Output column order: encoded categoricals first, then the numeric columns,
# which are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_cols),
        ('num', 'passthrough', numerical_cols),
    ]
)

# Pipeline: preprocessing followed by a gradient-boosted regressor, so the
# same transformations are applied at fit time and at predict time.
regressor = XGBRegressor(n_estimators=300, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', regressor),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion, then predict the held-out rows.
preds = pipeline.fit(X_train, y_train).predict(X_test)

# Report the coefficient of determination on the held-out rows.
r2 = r2_score(y_test, preds)
print("R2 Score:", r2)

# Save the FULL fitted pipeline (preprocessing + model) as a single object,
# so whoever loads it can call .predict() on raw feature rows.
# The context manager closes (and therefore flushes) the file deterministically,
# even if pickling raises part-way through; the previous bare open() call left
# the handle to be closed by garbage collection.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
with open("ml_model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

# Output:
# R2 Score: 0.45835916272663957
