In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error



In [3]:
data = pd.read_csv('friday.csv')
data

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
label_encoder = LabelEncoder()
data['Age'] = label_encoder.fit_transform(data['Age'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['City_Category'] = label_encoder.fit_transform(data['City_Category'])

In [6]:
data = pd.get_dummies(data, columns=['Stay_In_Current_City_Years'])

In [7]:
imputer = SimpleImputer(strategy='mean')
data['Product_ID'] = pd.to_numeric(data['Product_ID'], errors='coerce')
data['Product_Category_2'] = pd.to_numeric(data['Product_Category_2'], errors='coerce')
data['Product_Category_3'] = pd.to_numeric(data['Product_Category_3'], errors='coerce')
data['Product_Category_2'] = imputer.fit_transform(data[['Product_Category_2']])
data['Product_Category_3'] = imputer.fit_transform(data[['Product_Category_3']])

In [8]:
X = data.drop(columns=['Purchase'])  
y = data['Purchase'] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

In [12]:
print(f"RMSE: {rmse}")

RMSE: 2905.58404600998
