In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
# Pull the training set from the project's GitHub mirror, report its shape, and preview it
train_url = 'https://raw.githubusercontent.com/zex3/mlKaggle/main/train.csv'
train_df = pd.read_csv(train_url)
print("Train shape", train_df.shape)
train_df

Train shape (913000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [None]:
# Pull the test set from the same GitHub mirror, report its shape, and preview it
test_url = 'https://raw.githubusercontent.com/zex3/mlKaggle/main/test.csv'
test_df = pd.read_csv(test_url)
print("Test shape", test_df.shape)
test_df

Test shape (45000, 4)


Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1
...,...,...,...,...
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50


In [None]:
# Parse the "date" column of both frames into datetimes so the .dt accessor
# (used when selecting the validation year) is available
for frame in (train_df, test_df):
    frame["date"] = pd.to_datetime(frame["date"])

In [None]:
# Hold out calendar year 2017 of the training data as the validation set
is_2017 = train_df["date"].dt.year == 2017
validation_df = train_df.loc[is_2017]
validation_df

Unnamed: 0,date,store,item,sales
1461,2017-01-01,1,1,19
1462,2017-01-02,1,1,15
1463,2017-01-03,1,1,10
1464,2017-01-04,1,1,16
1465,2017-01-05,1,1,14
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [None]:
# Read the validation-period predictions of the four base models
# (file names suggest Prophet, LightGBM, GAM and XGBoost)
val_preds_1, val_preds_2, val_preds_3, val_preds_4 = (
    pd.read_csv(path)
    for path in (
        'val_pred_prophet.csv',
        'val_pred_lightGBM.csv',
        'val_pred_gam.csv',
        'val_pred_xgb.csv',
    )
)

# Print each frame's shape next to the validation frame's so the row counts can be compared
for label, frame in (
    ("Val Preds 1 shape", val_preds_1),
    ("Val Preds 2 shape", val_preds_2),
    ("Val Preds 3 shape", val_preds_3),
    ("Val Preds 4 shape", val_preds_4),
    ("Validation df shape", validation_df),
):
    print(label, frame.shape)

Val Preds 1 shape (182500, 2)
Val Preds 2 shape (182500, 2)
Val Preds 3 shape (182500, 4)
Val Preds 4 shape (182500, 5)
Validation df shape (182500, 4)


In [None]:
# Read the test-period predictions (submission files) of the same four base models
test_preds_1, test_preds_2, test_preds_3, test_preds_4 = (
    pd.read_csv(path)
    for path in (
        'Prophet_submission.csv',
        'LightGBM_submission.csv',
        'gam_submission.csv',
        'xgb_submission.csv',
    )
)

# Print each frame's shape next to the test frame's so the row counts can be compared
for label, frame in (
    ("Test Preds 1 shape", test_preds_1),
    ("Test Preds 2 shape", test_preds_2),
    ("Test Preds 3 shape", test_preds_3),
    ("Test Preds 4 shape", test_preds_4),
    ("Test df shape", test_df),
):
    print(label, frame.shape)

Test Preds 1 shape (45000, 2)
Test Preds 2 shape (45000, 2)
Test Preds 3 shape (45000, 2)
Test Preds 4 shape (45000, 2)
Test df shape (45000, 4)


In [None]:
# Build the meta model's training features: one column per base model, holding
# that model's 'sales' values for the validation rows.
val_pred_frames = {
    'model1_preds': val_preds_1,
    'model2_preds': val_preds_2,
    'model3_preds': val_preds_3,
    'model4_preds': val_preds_4,
}

# pd.DataFrame aligns the Series on their index and pads gaps with NaN, so a
# prediction file with the wrong number of rows would be stacked silently.
# Fail fast instead of training on misaligned or NaN-padded features.
expected_rows = len(validation_df)
for column, frame in val_pred_frames.items():
    assert len(frame) == expected_rows, (
        f"{column}: expected {expected_rows} validation rows, got {len(frame)}"
    )

# NOTE(review): val_preds_3 and val_preds_4 carry extra columns (4 and 5 vs. 2);
# confirm their 'sales' column is the model's prediction and not the actual target.
X_train = pd.DataFrame(
    {column: frame['sales'] for column, frame in val_pred_frames.items()}
)

In [None]:
# Meta-model target: the observed 2017 sales, in validation_df's row order.
# NOTE(review): y_train keeps train_df's original row labels, while X_train is built
# from freshly read CSVs; the two are presumably paired by position when fitting —
# confirm the prediction files follow validation_df's row order.
y_train = validation_df["sales"]

In [None]:
# Train the LightGBM meta model (the stacking layer) on the base models'
# validation predictions, with the observed validation sales as target.
# The seed is pinned so the fit stays reproducible across Restart & Run All,
# including if row/feature subsampling options are enabled later.
meta_model = lgb.LGBMRegressor(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    random_state=42,
)
meta_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 823
[LightGBM] [Info] Number of data points in the train set: 182500, number of used features: 4
[LightGBM] [Info] Start training from score 58.815014


In [None]:
# Build the meta model's test features, using the same column names and order
# as the training features so the fitted model sees a matching layout.
test_pred_frames = {
    'model1_preds': test_preds_1,
    'model2_preds': test_preds_2,
    'model3_preds': test_preds_3,
    'model4_preds': test_preds_4,
}

# The columns are stacked row-by-row, and the ids written to the final CSV come
# from test_preds_1. Nothing downstream would notice a short or re-ordered
# submission file (pd.DataFrame just NaN-pads), so verify the layout up front.
expected_rows = len(test_df)
reference_ids = test_preds_1['id'].to_numpy()
for column, frame in test_pred_frames.items():
    assert len(frame) == expected_rows, (
        f"{column}: expected {expected_rows} test rows, got {len(frame)}"
    )
    # Only files that actually carry an 'id' column can be checked for row order.
    if 'id' in frame.columns:
        assert (frame['id'].to_numpy() == reference_ids).all(), (
            f"{column}: 'id' order differs from test_preds_1"
        )

X_test = pd.DataFrame(
    {column: frame['sales'] for column, frame in test_pred_frames.items()}
)

In [None]:
# Run the fitted meta model on the stacked test features. X_test exposes the same
# four columns (model1_preds … model4_preds) that the model was fitted on.
test_sales_predictions = meta_model.predict(X_test)

In [None]:
# Write the stacked predictions to disk as (id, sales) rows; the ids are taken
# from the first base model's submission file
test_set_with_predictions = test_preds_1[['id']].assign(sales=test_sales_predictions)
test_set_with_predictions.to_csv('combined_predictions.csv', index=False)