In [20]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

In [21]:
# Load training data
train = pd.read_csv("train.csv", parse_dates=["date"])
test = pd.read_csv("test.csv", parse_dates=["date"])

In [22]:
# Step 1: Feature Engineering
train["year"] = train["date"].dt.year
train["month"] = train["date"].dt.month
train["day"] = train["date"].dt.day
train["weekday"] = train["date"].dt.weekday

In [23]:
test["year"] = test["date"].dt.year
test["month"] = test["date"].dt.month
test["day"] = test["date"].dt.day
test["weekday"] = test["date"].dt.weekday

In [24]:
# Encoding categorical variables
encoder_store = LabelEncoder()
encoder_family = LabelEncoder()

In [25]:
train["store_nbr"] = encoder_store.fit_transform(train["store_nbr"])
train["family"] = encoder_family.fit_transform(train["family"])

# Handle unseen categories in test data
test["store_nbr"] = test["store_nbr"].map(lambda x: encoder_store.transform([x])[0] if x in encoder_store.classes_ else -1)
test["family"] = test["family"].map(lambda x: encoder_family.transform([x])[0] if x in encoder_family.classes_ else -1)

In [26]:
# Step 2: Define Features and Target
features = ["store_nbr", "family", "onpromotion", "year", "month", "day", "weekday"]
target = "sales"

In [27]:
X = train[features]
y = np.log1p(train[target])  # Log-transform target for RMSLE

In [28]:
# Step 3: Split the Data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from sklearn.metrics import mean_squared_log_error

# Define parameters
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42
}

In [30]:
# Log-transform target for RMSLE
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Convert datasets into DMatrix format (optimized for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train_log)
dval = xgb.DMatrix(X_val, label=y_val_log)

In [31]:
# Custom RMSLE metric
def rmsle_eval(preds, dtrain):
    labels = dtrain.get_label()
    return "RMSLE", np.sqrt(mean_squared_log_error(np.expm1(labels), np.expm1(preds)))

# Train the model
model = xgb.train(
    params, 
    dtrain, 
    num_boost_round=1000, 
    evals=[(dval, "validation")], 
    feval=rmsle_eval,  # Custom RMSLE evaluation metric
    early_stopping_rounds=50,  # Stops if no improvement
    verbose_eval=True
)



[0]	validation-rmse:0.80261	validation-RMSLE:0.80261
[1]	validation-rmse:0.78185	validation-RMSLE:0.78185
[2]	validation-rmse:0.75893	validation-RMSLE:0.75893
[3]	validation-rmse:0.74098	validation-RMSLE:0.74098
[4]	validation-rmse:0.73064	validation-RMSLE:0.73064
[5]	validation-rmse:0.72099	validation-RMSLE:0.72099
[6]	validation-rmse:0.71236	validation-RMSLE:0.71237
[7]	validation-rmse:0.70722	validation-RMSLE:0.70722
[8]	validation-rmse:0.69939	validation-RMSLE:0.69939
[9]	validation-rmse:0.69347	validation-RMSLE:0.69347
[10]	validation-rmse:0.68808	validation-RMSLE:0.68808
[11]	validation-rmse:0.67206	validation-RMSLE:0.67206
[12]	validation-rmse:0.65898	validation-RMSLE:0.65898
[13]	validation-rmse:0.64637	validation-RMSLE:0.64637
[14]	validation-rmse:0.63488	validation-RMSLE:0.63488
[15]	validation-rmse:0.62443	validation-RMSLE:0.62443
[16]	validation-rmse:0.61901	validation-RMSLE:0.61901
[17]	validation-rmse:0.61403	validation-RMSLE:0.61403
[18]	validation-rmse:0.60228	validatio

In [32]:
X_test = test.drop(columns=["id"])  # Remove ID column if present

In [33]:
# Ensure 'date' is a datetime type (if not already)
X_test["date"] = pd.to_datetime(X_test["date"])

# Extract useful date features
X_test["year"] = X_test["date"].dt.year
X_test["month"] = X_test["date"].dt.month
X_test["day"] = X_test["date"].dt.day
X_test["weekday"] = X_test["date"].dt.weekday

In [34]:
# Drop the original date column
X_test = X_test.drop(columns=["date"])

# Convert test set into DMatrix
dtest = xgb.DMatrix(X_test)


In [35]:
# Predict using the trained model
y_pred_log = model.predict(dtest)
# Ensure all sales values are non-negative and round to 1 decimal place
y_pred = np.round(np.maximum(0, np.expm1(y_pred_log)), 1)

print("✅ Fixed! Predictions are ready.")

✅ Fixed! Predictions are ready.


In [36]:
# Save predictions
submission = pd.DataFrame({"id": test["id"], "sales": y_pred})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file saved as 'submission.csv'!")

✅ Submission file saved as 'submission.csv'!
