Importing Libraries

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
np.random.seed = 42
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

Importing dataset

In [2]:
data = pd.read_csv("C:\\Users\\Hüseyin Sefa Kiriş\\Desktop\\onlinefoods.csv")
data.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


In [3]:
data["Monthly Income"].value_counts()

Monthly Income
No Income          187
25001 to 50000      69
More than 50000     62
10001 to 25000      45
Below Rs.10000      25
Name: count, dtype: int64

Set monthly income column

In [4]:
data.loc[data["Monthly Income"] == 'No Income' , "Monthly Income"] =0
data.loc[data["Monthly Income"] == "Below Rs.10000", "Monthly Income"] = 5000
data.loc[data["Monthly Income"] == "More than 50000", "Monthly Income"] = 75000
data.loc[data["Monthly Income"] == "10001 to 25000", "Monthly Income"] = 17500
data.loc[data["Monthly Income"] == "25001 to 50000", "Monthly Income"] = 37500

In [5]:
data["Monthly Income"].value_counts()

Monthly Income
0        187
37500     69
75000     62
17500     45
5000      25
Name: count, dtype: int64

In [6]:
data.drop("Unnamed: 12", axis=1, inplace=True)

In [7]:
data.head(1)

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback
0,20,Female,Single,Student,0,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Pin code                    388 non-null    int64  
 10  Output                      388 non-null    object 
 11  Feedback                    388 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 36.5+ KB


Transforming the columns

In [19]:
categories = ["Gender", "Marital Status", "Occupation", "Educational Qualifications", "Output", "Feedback"]
ct = ColumnTransformer([("one_hot", OneHotEncoder(), categories)], remainder="passthrough")

In [15]:
X = data.drop("Monthly Income", axis=1)
y = data["Monthly Income"]

In [20]:
transformed_X = ct.fit_transform(X)

In [21]:
print(transformed_X)

[[1.00000e+00 0.00000e+00 0.00000e+00 ... 1.29766e+01 7.75993e+01
  5.60001e+05]
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 1.29770e+01 7.75773e+01
  5.60009e+05]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.29551e+01 7.76593e+01
  5.60017e+05]
 ...
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 1.29850e+01 7.75533e+01
  5.60010e+05]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.29770e+01 7.75773e+01
  5.60009e+05]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.28988e+01 7.75764e+01
  5.60078e+05]]


Splitting the dataset

In [22]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

Creating Regressor Model

In [26]:
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)
y_preds = regressor.predict(X_test)

In [28]:
r2_score(y_test, y_preds)

0.8008326551422327

Creating grid and RandomizedSearchCV

In [30]:
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200, 2000],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

In [31]:
rs_clf = RandomizedSearchCV(estimator=regressor,
                   param_distributions=grid,
                   n_iter=500,
                   verbose=2)

rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   0.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   0.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   0.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   0.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   0.5s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_es

1225 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1225 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hüseyin Sefa Kiriş\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hüseyin Sefa Kiriş\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\Users\Hüseyin Sefa Kiriş\AppData\Local\Packages\PythonSof

In [32]:
best_clf =  rs_clf.best_estimator_

Exporting model

In [34]:
pickle.dump(best_clf, open("online__food best clf.pkl", "wb"))

In [35]:
best_clf.fit(X_train, y_train)
y_preds_2 = best_clf.predict(X_test)

In [36]:
r2_score(y_test, y_preds_2)

0.8139963782688062