### Preprocessing data

Import necessary libaries

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

***Reading data***

In [4]:
center_info = pd.read_csv('../Data/fulfilment_center_info.csv')
meal_info = pd.read_csv('../Data/meal_info.csv')
train = pd.read_csv('../Data/train.csv')

Merge data

In [5]:
data = train.merge(center_info, on=['center_id'], how='inner').merge(meal_info, on=['meal_id'], how='inner').sort_values(by=['week']).reset_index(drop=True)
print(data.shape)
data.head()

(456548, 15)


Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1466964,1,55,1993,136.83,135.83,0,0,270,647,56,TYPE_C,2.0,Beverages,Thai
2,1346989,1,55,2539,134.86,135.86,0,0,189,647,56,TYPE_C,2.0,Beverages,Thai
3,1338232,1,55,2139,339.5,437.53,0,0,54,647,56,TYPE_C,2.0,Beverages,Indian
4,1448490,1,55,2631,243.5,242.5,0,0,40,647,56,TYPE_C,2.0,Beverages,Indian


Merging data from 3 different file to get a full data file for training

In [None]:
shuffled = data.sample(frac=1)
shuffled.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
73085,1487827,26,34,1445,533.53,670.33,0,0,15,615,34,TYPE_B,4.2,Seafood,Continental
356552,1095449,115,161,1770,484.03,486.03,0,0,15,658,34,TYPE_B,3.9,Biryani,Indian
10107,1209260,4,14,1803,147.47,183.36,0,0,500,654,56,TYPE_C,2.7,Extras,Thai
91623,1262977,32,75,1754,241.56,270.66,1,1,2092,651,77,TYPE_B,4.7,Sandwich,Italian
16290,1478007,6,53,1878,243.53,280.33,0,1,108,590,56,TYPE_A,3.8,Starters,Thai


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
 9   city_code              456548 non-null  int64  
 10  region_code            456548 non-null  int64  
 11  center_type            456548 non-null  object 
 12  op_area                456548 non-null  float64
 13  category               456548 non-null  object 
 14  cuisine                456548 non-nu

**Splitting data for training**

In [8]:
data.columns

Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders', 'city_code',
       'region_code', 'center_type', 'op_area', 'category', 'cuisine'],
      dtype='object')

Choosing features and target for the model

In [None]:
x = data[['week', 'center_id', 'meal_id', 'checkout_price', 'base_price','emailer_for_promotion', 'homepage_featured', 'city_code','region_code', 'center_type', 'op_area', 'category', 'cuisine']]
y = data['num_orders']

Splitting data in to train, val and test with the ratio of (80:10:10)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

Preprocessing data

In [11]:
x.info()
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   week                   456548 non-null  int64  
 1   center_id              456548 non-null  int64  
 2   meal_id                456548 non-null  int64  
 3   checkout_price         456548 non-null  float64
 4   base_price             456548 non-null  float64
 5   emailer_for_promotion  456548 non-null  int64  
 6   homepage_featured      456548 non-null  int64  
 7   city_code              456548 non-null  int64  
 8   region_code            456548 non-null  int64  
 9   center_type            456548 non-null  object 
 10  op_area                456548 non-null  float64
 11  category               456548 non-null  object 
 12  cuisine                456548 non-null  object 
dtypes: float64(3), int64(7), object(3)
memory usage: 45.3+ MB
<class 'pandas.core.series.Seri

Features "center_type", "category" and "cuisine" has 'object' data-type, so we have to processing it before training the model

Feature engineering

In [12]:
categorical_features = ["center_type", "category", "cuisine"]
numeric_features = [
    "week", "center_id", "meal_id", "checkout_price", "base_price",
    "emailer_for_promotion", "homepage_featured", "city_code", "region_code", "op_area"
]


In [22]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)


### Choosing model

*Mean*

In [None]:
avarage = y_test.mean()
mean_absolute_error(y_val, [avarage]*len(y_val))

225.9348659020881

*Linear Regression*

In [23]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [20]:
linear_model = model.fit(x_train, y_train)
print("MEA: ", mean_absolute_error(y_val, linear_model.predict(x_val)))
print("R2 score: ", r2_score(y_val, linear_model.predict(x_val)))

MEA:  161.4454321197711
R2 score:  0.41478694883586364


*Random Forest Regression*

In [24]:
model_r = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])  

In [21]:
rfr_model = model_r.fit(x_train, y_train)
print("MEA score: ",mean_absolute_error(y_val, rfr_model.predict(x_val)))
print("R2 score: ", r2_score(y_val, rfr_model.predict(x_val)))

MEA score:  68.84930360858613
R2 score:  0.8595616932513241


*XGBoost*

In [33]:
model_x = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
])

In [None]:
xgb_model = model_x.fit(x_train, y_train)
print("MEA score: ", mean_absolute_error(y_val, xgb_model.predict(x_val)))
print("R2 score: ", r2_score(y_val, rfr_model.predict(x_val)))

MEA score:  116.58675384521484
R2 score:  0.8595616932513241


As we can see the result of all the models is :
- Linear Regression (**MEA score**:  161.4454321197711,   **R2 score**:  0.41478694883586364)
- Random Forest Regression (**MEA score**:  68.84930360858613,  **R2 score**:  0.8595616932513241)
- XGBoost (**MEA score**:  116.58675384521484,  **R2 score**:  0.8595616932513241)

With the lowest MEA score and R2 score we will choose Random Forest Regression Model for our project

In [None]:
def Predict_legend_from_file(file_path):
    df = pd.read_csv(file_path)

    required_cols = ['week', 'center_id', 'meal_id', 'checkout_price', 'base_price','emailer_for_promotion', 'homepage_featured', 'city_code','region_code', 'center_type', 'op_area', 'category', 'cuisine']
    if not all(col in df.columns for col in required_cols):
        return "Error: Input file is missing one or more required columns."

    features = df[required_cols]
    predictions = xgb_model.predict(features)

    df["num_orders"] = predictions
    df.to_csv("predictions_output.csv", index=False)

    return df[['week', 'center_id', 'meal_id', 'checkout_price', 'base_price','emailer_for_promotion', 'homepage_featured', 'city_code','region_code', 'center_type', 'op_area', 'category', 'cuisine','num_orders']]