<a href="https://colab.research.google.com/github/haddybhaiya/save_dishes/blob/main/saveDishes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preprocessing Pipeline**

In [None]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to check the column names and raw values.
df.head()

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt,51.549053
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,53.008323
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,48.621527
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,44.156984


In [None]:
# (rows, columns) of the training frame.
df.shape

(911, 12)

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                911 non-null    int64  
 1   date              911 non-null    object 
 2   meals_served      911 non-null    int64  
 3   kitchen_staff     911 non-null    int64  
 4   temperature_C     911 non-null    float64
 5   humidity_percent  911 non-null    float64
 6   day_of_week       911 non-null    int64  
 7   special_event     911 non-null    int64  
 8   past_waste_kg     911 non-null    float64
 9   staff_experience  747 non-null    object 
 10  waste_category    911 non-null    object 
 11  food_waste_kg     911 non-null    float64
dtypes: float64(4), int64(5), object(3)
memory usage: 85.5+ KB


In [None]:
# Rows per date, most frequent first; shows whether any date appears more than once.
df['date'].value_counts()

Unnamed: 0_level_0,count
date,Unnamed: 1_level_1
2022-02-02,2
2022-01-11,2
2022-01-27,2
2022-02-14,2
2022-01-06,2
...,...
2024-04-02,1
2023-09-04,1
2024-01-17,1
2024-03-20,1


In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
ID,0
date,0
meals_served,0
kitchen_staff,0
temperature_C,0
humidity_percent,0
day_of_week,0
special_event,0
past_waste_kg,0
staff_experience,164


In [None]:
# Inspect the ID column; the displayed values are not contiguous (e.g. 1 is followed by 4).
df['ID']

Unnamed: 0,ID
0,0
1,1
2,4
3,5
4,6
...,...
906,1044
907,1045
908,1046
909,1048


In [None]:
# Parse the date strings into datetimes so the `.dt` accessor can be used on the column.
df['date'] = pd.to_datetime(df['date'])


In [None]:
# Calendar features derived from the parsed date column.
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
# Weekend flag (1/0): rows whose day_of_week is 5 or greater are flagged.
# Presumably 5 and 6 are Saturday/Sunday with Monday = 0 -- TODO confirm against
# the data dictionary. A vectorised comparison replaces the per-row lambda.
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

In [None]:
# The raw date is no longer needed once month/day have been extracted.
# Reassign rather than use inplace=True: it keeps the data lineage explicit and
# avoids mutating a frame that earlier cells have already displayed.
df = df.drop(columns=['date'])

In [None]:
# Confirm the new month/day/is_weekend columns are present and `date` is gone.
df.head()

Unnamed: 0,ID,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg,month,day,is_weekend
0,0,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465,12,19,0
1,1,244,15,10.317872,64.430475,1,0,42.311779,,MeAt,51.549053,11,21,0
2,4,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,53.008323,2,1,0
3,5,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,48.621527,3,19,1
4,6,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,44.156984,7,18,0


In [None]:
# Frequency of each staff_experience label; the counts expose inconsistent casing
# (e.g. 'Intermediate' and 'intermediate' are counted separately).
df['staff_experience'].value_counts()

Unnamed: 0_level_0,count
staff_experience,Unnamed: 1_level_1
Beginner,191
Intermediate,186
EXPERT,186
intermediate,184


In [None]:
# Frequency of each waste_category label; the counts expose inconsistent casing
# (e.g. 'MEAT' and 'MeAt' are counted separately).
df['waste_category'].value_counts()

Unnamed: 0_level_0,count
waste_category,Unnamed: 1_level_1
MEAT,210
dairy,180
Vegetables,176
GRAINS,176
MeAt,169


In [None]:
# Treat a missing experience level as its own explicit category instead of NaN.
experience_filled = df['staff_experience'].fillna('Unknown')
df['staff_experience'] = experience_filled


In [None]:
# Normalise the free-text labels (lower-case, then trim surrounding whitespace)
# so that spellings such as 'MEAT' and 'MeAt' collapse into a single level.
for text_col in ('waste_category', 'staff_experience'):
    df[text_col] = df[text_col].str.lower().str.strip()



In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each column, which then acts as the implicit baseline.
categorical_columns = ['staff_experience', 'waste_category']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [None]:
# Check the encoded frame: the categorical columns are replaced by boolean indicator columns.
df.head()

Unnamed: 0,ID,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,food_waste_kg,month,day,is_weekend,staff_experience_expert,staff_experience_intermediate,staff_experience_unknown,waste_category_grains,waste_category_meat,waste_category_vegetables
0,0,196,13,27.887273,45.362854,0,0,7.740587,28.946465,12,19,0,False,True,False,False,False,False
1,1,244,15,10.317872,64.430475,1,0,42.311779,51.549053,11,21,0,False,False,True,False,True,False
2,4,148,16,27.7143,69.046113,1,0,41.184305,53.008323,2,1,0,False,False,False,False,True,False
3,5,157,19,19.173902,46.292823,6,0,41.543492,48.621527,3,19,1,False,False,False,False,True,False
4,6,297,10,26.375233,79.741064,0,0,26.525097,44.156984,7,18,0,False,True,False,False,True,False


# **Model training**

In [None]:
# Separate the regression target from the features. ID is excluded from the
# feature matrix along with the target itself.
target_column = 'food_waste_kg'
y = df[target_column]
X = df.drop(columns=[target_column, 'ID'])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: linear regression with default settings.
m1  = LinearRegression()

In [None]:
# Fit the linear baseline on the training split.
m1.fit(X_train, y_train)

In [None]:
# R^2 of the linear baseline on the hold-out split.
r2_score(y_test, m1.predict(X_test))


0.7445298009075155

In [None]:
#import mean absolute
from sklearn.metrics import mean_absolute_error

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with default hyper-parameters. The fixed random_state
# makes the forest's internal randomness repeatable, so the score below is the
# same on every Restart & Run All; an unseeded forest reports a slightly
# different MAE on each run.
m2 = RandomForestRegressor(random_state=42)
m2.fit(X_train, y_train)
# Mean absolute error on the hold-out split.
mean_absolute_error(y_test, m2.predict(X_test))

4.617213087669551

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with default hyper-parameters. random_state is
# pinned so that repeated runs of the notebook report the same score.
m3 = GradientBoostingRegressor(random_state=42)
m3.fit(X_train, y_train)
# Mean absolute error on the hold-out split.
mean_absolute_error(y_test, m3.predict(X_test))


4.986206602595093

In [None]:
# MAE of the linear baseline on the hold-out split, for comparison with the two ensembles.
mean_absolute_error(y_test, m1.predict(X_test))


5.6189289162329725

In [None]:
# R^2 of the gradient boosting model on the hold-out split.
r2_score(y_test, m3.predict(X_test))


0.8355795321860804

In [None]:
# R^2 of the random forest on the hold-out split.
r2_score(y_test, m2.predict(X_test))

0.9231798793993848

# **Testing on test data**

In [None]:
# Load the second CSV and preview it. The preview shows the same feature columns
# as train.csv but no food_waste_kg target column.
# NOTE(review): the previewed rows match the first rows of train.csv -- confirm
# that test.csv really is a distinct data set.
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT


In [None]:
# Parse the date strings into datetimes, mirroring the step applied to the training frame.
df_test['date'] = pd.to_datetime(df_test['date'])


In [None]:
# Calendar features derived from the parsed date column (same as for the training frame).
df_test['month'] = df_test['date'].dt.month
df_test['day'] = df_test['date'].dt.day
# Weekend flag (1/0): rows whose day_of_week is 5 or greater are flagged.
# Presumably 5 and 6 are Saturday/Sunday with Monday = 0 -- TODO confirm against
# the data dictionary. A vectorised comparison replaces the per-row lambda.
df_test['is_weekend'] = (df_test['day_of_week'] >= 5).astype('int64')

In [None]:
# The raw date is no longer needed once month/day have been extracted.
# Reassign rather than use inplace=True so the data lineage stays explicit.
df_test = df_test.drop(columns=['date'])

In [None]:
# Treat a missing experience level as its own explicit category instead of NaN.
test_experience_filled = df_test['staff_experience'].fillna('Unknown')
df_test['staff_experience'] = test_experience_filled

In [None]:
# Normalise the free-text labels (lower-case, then trim surrounding whitespace)
# so that differently-cased spellings collapse into a single level.
for text_col in ('waste_category', 'staff_experience'):
    df_test[text_col] = df_test[text_col].str.lower().str.strip()

In [None]:
# One-hot encode the test frame so that its columns line up with the features the
# models were fitted on.
#
# Encoding train and test independently with drop_first=True is fragile: the
# dropped baseline is whichever level sorts first *within that frame*, so a level
# that is absent from test.csv would shift the baseline and silently change the
# meaning of the remaining indicator columns. Instead, encode every level here and
# then keep exactly the training feature columns (X.columns), which already
# exclude the training baseline levels.
test_encoded = pd.get_dummies(
    df_test,
    columns=['staff_experience', 'waste_category'],
)
# Keep the row identifier (when present) in front of the model features.
id_columns = [col for col in ('ID',) if col in test_encoded.columns]
df_test = test_encoded.reindex(
    columns=id_columns + list(X.columns),
    fill_value=False,  # a level seen in training but absent here is all-False
)

In [None]:
# NOTE(review): despite the section title, this scores m1 on X_test -- the hold-out
# split carved out of train.csv -- not on df_test. The preprocessed df_test is not
# used by this cell, and its preview shows no food_waste_kg column to score against.
y_pred_m1 = m1.predict(X_test)
r2_score(y_test, y_pred_m1)

0.7445298009075155

In [None]:
# NOTE(review): this scores m2 on X_test (the hold-out split of train.csv), not on
# df_test; the preprocessed df_test is not used by this cell.
y_pred_m2 = m2.predict(X_test)
r2_score(y_test, y_pred_m2)

0.9231798793993848

In [None]:
# NOTE(review): this scores m3 on X_test (the hold-out split of train.csv), not on
# df_test; the preprocessed df_test is not used by this cell.
y_pred_m3 = m3.predict(X_test)
r2_score(y_test, y_pred_m3)

0.8355795321860804