In [4]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/final-train-nodes/final_train_nodes.csv
/kaggle/input/int20h-2023-hackathon/nodes.csv
/kaggle/input/int20h-2023-hackathon/nodes_test.csv
/kaggle/input/int20h-2023-hackathon/orders.csv
/kaggle/input/int20h-2023-hackathon/test.csv
/kaggle/input/int20h-2023-hackathon/final_test.csv
/kaggle/input/uklon-hackaton-nodes/nodes.csv
/kaggle/input/uklon-hackaton-orders/orders.csv


## Download final train nodes info after preprocessing (saved in DataPreparation.ipynb) 

In [2]:
final_train_nodes = pd.read_csv('/kaggle/input/final-train-nodes/final_train_nodes.csv')
final_train_nodes = final_train_nodes.drop("Unnamed: 0", axis=1)
final_train_nodes

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,unknown_region
0,1.403211e+18,290773715,4768528694,5.147066,47.017761,5.060632,5.135280,0,1,0,0
1,5.763551e+18,10980432,3719876029,5.621990,51.687198,2.352199,20.000000,0,0,0,1
2,5.443825e+18,274917390,5218361665,3.120920,0.000000,0.509588,1.291575,0,0,1,0
3,4.695904e+18,290800926,482648118,4.697003,39.967319,4.906856,4.930175,0,1,0,0
4,8.978881e+17,27126445,1987168307,1.764181,16.777725,7.376104,7.546448,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
5995,1.281328e+18,27126477,27126488,5.552757,47.259739,6.903768,7.222168,0,0,0,1
5996,4.331594e+18,10980427,4773378423,5.506385,39.559718,2.635005,3.153057,0,0,1,0
5997,7.030316e+18,290800917,8952394133,1.720923,13.580452,5.653728,1558.781118,0,0,0,1
5998,2.069922e+18,10980464,4768348532,0.947986,11.814679,1.746690,1.384377,0,1,0,0


## Installing meteostat and osmapi libraries

In [6]:
!pip install meteostat
!pip install osmapi

Collecting meteostat
  Downloading meteostat-1.6.5-py3-none-any.whl (31 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.6.5
[0mCollecting osmapi
  Downloading osmapi-3.1.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m387.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: osmapi
Successfully installed osmapi-3.1.0
[0m

## Preprocessing of orders information 

We use meteostat library to find most common natural phenomena in that day

In [7]:
orders_dfrm = pd.read_csv('/kaggle/input/uklon-hackaton-orders/orders.csv')
nodes_dfrm = pd.read_csv('/kaggle/input/uklon-hackaton-nodes/nodes.csv')

orders_dfrm['running_time'] = pd.to_datetime(orders_dfrm['running_time'])
orders_dfrm['completed_time'] = pd.to_datetime(orders_dfrm['completed_time'])
orders_dfrm["Id"] = np.abs(orders_dfrm["Id"])
nodes_dfrm["Id"] = np.abs(nodes_dfrm["Id"])

hours = orders_dfrm["running_time"].apply(lambda x: x.hour)
orders_dfrm["hours"] = hours.values
bins = [0, 5, 10, 13, 16, 20, 23]
names = ['night', 'morning', 'afternoon_1', 'afternoon_2', 'evening_1', 'evening_2']

orders_dfrm['time_of_day'] = pd.cut(orders_dfrm['hours'], bins, labels=names) 

from sklearn.impute import KNNImputer

imputer = KNNImputer()
imputed = imputer.fit_transform(nodes_dfrm)
df_nodes_new = pd.DataFrame(imputed, columns=nodes_dfrm.columns)

from meteostat import Point,Hourly, Stations
import osmapi as osm
from datetime import timedelta

api = osm.OsmApi()
node = api.NodeGet(nodes_dfrm['node_start'][0])

stations = Stations()
stations = stations.nearby(node['lat'], node['lon'])
station = stations.fetch(1)

w = Hourly(station, orders_dfrm["running_time"].min() - timedelta(hours=0,minutes=35), orders_dfrm["running_time"].max())

w = w.fetch()

w["coco"].unique()
weather_types = {
    3:"Cloudy",
    4:"Overcast",
    5:"Fog",
    21:"Snow Shower",
    22: "Heavy Snow Shower"
}

w.index = pd.to_datetime(w.index)
w["hour"] = w.index.hour
df_orders_new = orders_dfrm.merge(w[["hour", "coco"]], left_on = "hours", right_on="hour")
df_orders_new.drop(columns=["hour"], inplace=True)
df_orders_new["coco"] = df_orders_new["coco"].replace(weather_types)
df_orders_w = pd.get_dummies(df_orders_new, prefix='', prefix_sep='')
df_orders_w

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time,hours,night,morning,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,7013180891535596072,2022-01-24 11:04:12,2022-01-24 11:14:07,4.061,595.0,11,0,0,1,0,0,0,0,0,0,0,1
1,9124568004071597524,2022-01-24 11:23:59,2022-01-24 11:35:01,4.903,662.0,11,0,0,1,0,0,0,0,0,0,0,1
2,3080014891063950854,2022-01-24 11:29:00,2022-01-24 11:44:49,8.420,949.0,11,0,0,1,0,0,0,0,0,0,0,1
3,6156766971585401913,2022-01-24 11:25:56,2022-01-24 11:40:54,3.511,898.0,11,0,0,1,0,0,0,0,0,0,0,1
4,1551185077857631394,2022-01-24 11:19:35,2022-01-24 11:34:01,3.188,866.0,11,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,7492326860735367343,2022-01-24 02:25:07,2022-01-24 02:28:59,1.463,232.0,2,1,0,0,0,0,0,1,0,0,0,0
5996,7655848563748033000,2022-01-24 02:44:43,2022-01-24 03:01:04,8.962,981.0,2,1,0,0,0,0,0,1,0,0,0,0
5997,210961324812816296,2022-01-24 02:18:08,2022-01-24 02:29:56,5.924,708.0,2,1,0,0,0,0,0,1,0,0,0,0
5998,5154489657190406661,2022-01-24 02:17:17,2022-01-24 02:21:32,2.800,255.0,2,1,0,0,0,0,0,1,0,0,0,0


Next we can merge orders information with new features created by nodes

In [9]:
final_train_nodes = final_train_nodes.sort_values('Id')
df_orders_w = df_orders_w.sort_values('Id')

In [10]:
processed_orders_train = pd.concat([final_train_nodes, df_orders_w], axis=1)
processed_orders_train 

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,...,morning,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,1.403211e+18,290773715,4768528694,5.147066,47.017761,5.060632,5.135280,0,1,0,...,0,1,0,0,0,0,0,0,0,1
1,5.763551e+18,10980432,3719876029,5.621990,51.687198,2.352199,20.000000,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,5.443825e+18,274917390,5218361665,3.120920,0.000000,0.509588,1.291575,0,0,1,...,0,1,0,0,0,0,0,0,0,1
3,4.695904e+18,290800926,482648118,4.697003,39.967319,4.906856,4.930175,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,8.978881e+17,27126445,1987168307,1.764181,16.777725,7.376104,7.546448,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,1.281328e+18,27126477,27126488,5.552757,47.259739,6.903768,7.222168,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5996,4.331594e+18,10980427,4773378423,5.506385,39.559718,2.635005,3.153057,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5997,7.030316e+18,290800917,8952394133,1.720923,13.580452,5.653728,1558.781118,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5998,2.069922e+18,10980464,4768348532,0.947986,11.814679,1.746690,1.384377,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
final_dataset = processed_orders_train.drop("completed_time", axis=1)
cols = list(final_dataset.columns.values)
cols.remove('delta_time')
final_dataset = final_dataset[cols + ['delta_time']]

In [15]:
final_dataset = final_dataset.drop(["Id", "running_time"], axis=1)
final_dataset

Unnamed: 0,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,unknown_region,...,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower,delta_time
0,290773715,4768528694,5.147066,47.017761,5.060632,5.135280,0,1,0,0,...,1,0,0,0,0,0,0,0,1,595.0
1,10980432,3719876029,5.621990,51.687198,2.352199,20.000000,0,0,0,1,...,1,0,0,0,0,0,0,0,1,662.0
2,274917390,5218361665,3.120920,0.000000,0.509588,1.291575,0,0,1,0,...,1,0,0,0,0,0,0,0,1,949.0
3,290800926,482648118,4.697003,39.967319,4.906856,4.930175,0,1,0,0,...,1,0,0,0,0,0,0,0,1,898.0
4,27126445,1987168307,1.764181,16.777725,7.376104,7.546448,0,0,0,1,...,1,0,0,0,0,0,0,0,1,866.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,27126477,27126488,5.552757,47.259739,6.903768,7.222168,0,0,0,1,...,0,0,0,0,1,0,0,0,0,232.0
5996,10980427,4773378423,5.506385,39.559718,2.635005,3.153057,0,0,1,0,...,0,0,0,0,1,0,0,0,0,981.0
5997,290800917,8952394133,1.720923,13.580452,5.653728,1558.781118,0,0,0,1,...,0,0,0,0,1,0,0,0,0,708.0
5998,10980464,4768348532,0.947986,11.814679,1.746690,1.384377,0,1,0,0,...,0,0,0,0,1,0,0,0,0,255.0


# Building a Model

We decided to use XGBoost algorithm for solving this regression task. 

In [48]:
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE


X, y = final_dataset.iloc[:, :-1], final_dataset.iloc[:, -1]
  
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.33, random_state = 42)

xgb_r = xg.XGBRegressor()
xgb_r.fit(train_X, train_y, eval_set=[(train_X, train_y), (test_X, test_y)], verbose=False)

y_pred = xgb_r.predict(test_X)

rmse = np.sqrt(MSE(test_y, y_pred))

In [17]:
rmse

133.5434946829071

# Model Tuning

We used Optuna, based on Bayesian optimization algorithm

In [70]:
from sklearn.metrics import mean_squared_error


def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xg.XGBRegressor(**param)
    model.fit(train_X, train_y)
    y_pred = model.predict(test_X)
    return mean_squared_error(test_y, y_pred)

In [71]:
import optuna
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=200)

[32m[I 2023-03-05 06:06:46,530][0m A new study created in memory with name: regression[0m
[32m[I 2023-03-05 06:06:48,660][0m Trial 0 finished with value: 31702.440768853576 and parameters: {'max_depth': 3, 'learning_rate': 0.648765345386179, 'n_estimators': 611, 'min_child_weight': 8, 'gamma': 0.8830894652086617, 'subsample': 0.6262561646080176, 'colsample_bytree': 0.46603611565682274, 'reg_alpha': 0.14720884800595926, 'reg_lambda': 0.5182934668794748, 'random_state': 563}. Best is trial 0 with value: 31702.440768853576.[0m
[32m[I 2023-03-05 06:06:51,685][0m Trial 1 finished with value: 49217.410026023375 and parameters: {'max_depth': 7, 'learning_rate': 0.8133974393858994, 'n_estimators': 859, 'min_child_weight': 5, 'gamma': 0.6275751670033568, 'subsample': 0.6107206992652945, 'colsample_bytree': 0.16942302234958315, 'reg_alpha': 0.87302464622238, 'reg_lambda': 0.40338815801671857, 'random_state': 434}. Best is trial 0 with value: 31702.440768853576.[0m
[32m[I 2023-03-05 06:

In [72]:
print('Best parameters', study.best_params)

Best parameters {'max_depth': 3, 'learning_rate': 0.14576085826177032, 'n_estimators': 51, 'min_child_weight': 1, 'gamma': 0.13161353552006624, 'subsample': 0.8104977384588856, 'colsample_bytree': 0.75905597859778, 'reg_alpha': 0.3335284890374997, 'reg_lambda': 0.36050868351747817, 'random_state': 354}


In [74]:
from sklearn.metrics import mean_squared_error

best_pars = study.best_params
best_pars["eval_metric"] = 'rmse'

model = xg.XGBRegressor(**best_pars)
model.fit(train_X, train_y)
y_pred = model.predict(test_X)

print('RMSE: ', np.sqrt(mean_squared_error(test_y, y_pred)))

RMSE:  133.2184673860847


Metrics was improved a little bit

# Data Preparation for Submission

Download final test nodes info after preprocessing (saved in DataPreparation.ipynb)

In [79]:
final_test_nodes = pd.read_csv("/kaggle/input/final-test-nodes/final_test_nodes.csv")
final_test_nodes = final_test_nodes.drop("Unnamed: 0", axis=1)
final_test_nodes["Id"] = final_test_nodes["Id"].apply(lambda x: int(x))
final_test_nodes

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,unknown_region
0,6198,8952394129,6878011682,4.706362,30.176927,1562.863912,34.259433,0,0,0,1
1,6417,290008230,1262365786,6.238546,40.495348,1.812387,3.502169,0,0,1,0
2,7054,1984088902,317189357,3.905904,29.912151,2.393025,2.145064,0,0,0,1
3,9628,290941343,290897544,5.949408,43.329206,7.745447,7.818495,1,0,0,0
4,10283,290941344,7878405269,7.014933,50.521088,7.816996,7.794678,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,525706,290800924,3127870582,2.895784,22.095807,6.554184,6.530242,0,1,0,0
996,526604,4775833861,4909437297,3.483059,27.500757,4.801377,4.728286,0,0,0,1
997,527213,4807102920,4732308198,3.440952,27.840056,1.530748,2.141107,0,0,0,1
998,527520,5966955830,4491475140,0.702519,5.932918,2.701672,2.764008,0,0,0,1


Repeat steps from train data preparation for test

In [80]:
import numpy as np
import pandas as pd

orders_dfrm = pd.read_csv('/kaggle/input/int20h-2023-hackathon/final_test.csv')
nodes_dfrm = pd.read_csv('/kaggle/input/int20h-2023-hackathon/nodes_test.csv')
orders_dfrm['running_time'] = pd.to_datetime(orders_dfrm['running_time'])

orders_dfrm["Id"] = np.abs(orders_dfrm["Id"])
nodes_dfrm["Id"] = np.abs(nodes_dfrm["Id"])

hours = orders_dfrm["running_time"].apply(lambda x: x.hour)
orders_dfrm["hours"] = hours.values
bins = [0, 5, 10, 13, 16, 20, 23]
names = ['night', 'morning', 'afternoon_1', 'afternoon_2', 'evening_1', 'evening_2']

orders_dfrm['time_of_day'] = pd.cut(orders_dfrm['hours'], bins, labels=names) 

from sklearn.impute import KNNImputer

imputer = KNNImputer()
imputed = imputer.fit_transform(nodes_dfrm)
df_nodes_new = pd.DataFrame(imputed, columns=nodes_dfrm.columns)
df_nodes_new.head()

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,6198.0,8952394000.0,8952394000.0,138.79571,37.0
1,6198.0,2059504000.0,4548172000.0,95.273001,33.0
2,6198.0,2059505000.0,2059504000.0,137.647881,35.0
3,6198.0,1570777000.0,1977019000.0,4.383708,30.0
4,6198.0,1977019000.0,1977019000.0,24.195593,31.0


In [82]:
from meteostat import Point,Hourly, Stations
import osmapi as osm
from datetime import timedelta

api = osm.OsmApi()
node = api.NodeGet(nodes_dfrm['node_start'][5])

stations = Stations()
stations = stations.nearby(node['lat'], node['lon'])
station = stations.fetch(1)

w = Hourly(station, orders_dfrm["running_time"].min() - timedelta(hours=0,minutes=35), orders_dfrm["running_time"].max())

w = w.fetch()

w["coco"].unique()
weather_types = {
    3:"Cloudy",
    4:"Overcast",
    5:"Fog",
    21:"Snow Shower",
    22: "Heavy Snow Shower"
}

w.index = pd.to_datetime(w.index)
w["hour"] = w.index.hour
df_orders_new = orders_dfrm.merge(w[["hour", "coco"]], left_on = "hours", right_on="hour", how="outer")

df_orders_new["hour"] = df_orders_new["hour"].fillna(3)
df_orders_new["coco"] = df_orders_new["coco"].fillna(3)

df_orders_new.drop(columns=["hour"], inplace=True)
df_orders_new["coco"] = df_orders_new["coco"].replace(weather_types)
df_orders_w = pd.get_dummies(df_orders_new, prefix='', prefix_sep='')
df_orders_w

Unnamed: 0,Id,running_time,route_distance_km,hours,night,morning,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,6198,2022-01-24 03:38:30,4.744,3,1,0,0,0,0,0,1,0,0,0,0
1,6417,2022-01-24 03:45:51,6.279,3,1,0,0,0,0,0,1,0,0,0,0
2,7054,2022-01-24 03:52:14,3.934,3,1,0,0,0,0,0,1,0,0,0,0
3,9628,2022-01-24 04:03:21,5.959,4,1,0,0,0,0,0,0,0,0,0,1
4,10283,2022-01-24 04:01:12,7.028,4,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,525706,2022-01-24 18:46:17,2.897,18,0,0,0,0,1,0,0,0,0,0,1
996,526604,2022-01-24 18:46:44,3.482,18,0,0,0,0,1,0,0,0,0,0,1
997,527213,2022-01-24 18:47:25,3.486,18,0,0,0,0,1,0,0,0,0,0,1
998,527520,2022-01-24 18:52:01,0.703,18,0,0,0,0,1,0,0,0,0,0,1


In [84]:
final_test_nodes = final_test_nodes.sort_values('Id')
df_orders_w = df_orders_w.sort_values('Id')

In [86]:
processed_test_orders = pd.concat([final_test_nodes, df_orders_w], axis=1)
processed_test_orders

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,...,morning,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,6198,8952394129,6878011682,4.706362,30.176927,1562.863912,34.259433,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,6417,290008230,1262365786,6.238546,40.495348,1.812387,3.502169,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,7054,1984088902,317189357,3.905904,29.912151,2.393025,2.145064,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,9628,290941343,290897544,5.949408,43.329206,7.745447,7.818495,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,10283,290941344,7878405269,7.014933,50.521088,7.816996,7.794678,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,525706,290800924,3127870582,2.895784,22.095807,6.554184,6.530242,0,1,0,...,0,0,0,1,0,0,0,0,0,1
996,526604,4775833861,4909437297,3.483059,27.500757,4.801377,4.728286,0,0,0,...,0,0,0,1,0,0,0,0,0,1
997,527213,4807102920,4732308198,3.440952,27.840056,1.530748,2.141107,0,0,0,...,0,0,0,1,0,0,0,0,0,1
998,527520,5966955830,4491475140,0.702519,5.932918,2.701672,2.764008,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [87]:
final_dataset = processed_test_orders.drop(["Id", "running_time"], axis=1)
final_dataset

Unnamed: 0,node_start,node_finish,avg_distance,avg_time,centr_distance_st,centr_distance_fin,Kyivsky Raion,Malynovsky Raion,Prymorsk Raion,unknown_region,...,morning,afternoon_1,afternoon_2,evening_1,evening_2,Cloudy,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,8952394129,6878011682,4.706362,30.176927,1562.863912,34.259433,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,290008230,1262365786,6.238546,40.495348,1.812387,3.502169,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,1984088902,317189357,3.905904,29.912151,2.393025,2.145064,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,290941343,290897544,5.949408,43.329206,7.745447,7.818495,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,290941344,7878405269,7.014933,50.521088,7.816996,7.794678,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,290800924,3127870582,2.895784,22.095807,6.554184,6.530242,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
996,4775833861,4909437297,3.483059,27.500757,4.801377,4.728286,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
997,4807102920,4732308198,3.440952,27.840056,1.530748,2.141107,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
998,5966955830,4491475140,0.702519,5.932918,2.701672,2.764008,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1


Save unique values of id

In [89]:
user_ids = list(processed_test_orders["Id"].values)

Model prediction for test data

In [90]:
test_pred = model.predict(final_dataset)

In [91]:
user_ids = [int(i[0]) for i in user_ids]
test_pred = [round(i) for i in test_pred]

In [92]:
sub_dict = {}

sub_dict["Id"] = user_ids
sub_dict["Predicted"] = test_pred

submission = pd.DataFrame.from_dict(sub_dict)
submission

Unnamed: 0,Id,Predicted
0,6198,766
1,6417,680
2,7054,501
3,9628,697
4,10283,805
...,...,...
995,525706,521
996,526604,753
997,527213,747
998,527520,463


In [93]:
submission.to_csv('submission.csv', index=False)