In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy  as np

from lazypredict.Supervised import LazyRegressor 
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

from  sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold

from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score , mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [None]:
with open("databangladeshi_users.json") as f :
    jsn = json.load(f)


In [325]:
flat_data = []

for user_data in jsn :
    flat_data.append({
        "purchases" : sum(user_data["purchases"]),
        "items" : len(user_data["purchases"]),
        "average" : sum(user_data["purchases"])/len(user_data["purchases"]),
        "location" : user_data["location"],
        "lastlogin" : user_data["last_login"],
        "loyalty" : user_data["loyalty"]
    })

df = pd.DataFrame(flat_data)
df.head()

Unnamed: 0,purchases,items,average,location,lastlogin,loyalty
0,4931,4,1232.75,Barisal,2025-04-25,Bronze
1,1137,2,568.5,Chittagong,2025-04-30,Bronze
2,2483,2,1241.5,Rajshahi,2025-05-06,Bronze
3,1986,3,662.0,Mymensingh,2025-03-24,Bronze
4,4862,5,972.4,Khulna,2025-05-20,Bronze


Cleaning and encoding

In [327]:
df["lastlogin"] = pd.to_datetime(df["lastlogin"])

df["lastseen"] = (datetime.today() - df["lastlogin"]).dt.days


df = df.drop(columns= "lastlogin")

df.head()

Unnamed: 0,purchases,items,average,location,loyalty,lastseen
0,4931,4,1232.75,Barisal,Bronze,30
1,1137,2,568.5,Chittagong,Bronze,25
2,2483,2,1241.5,Rajshahi,Bronze,19
3,1986,3,662.0,Mymensingh,Bronze,62
4,4862,5,972.4,Khulna,Bronze,5


In [328]:
le_loc = LabelEncoder()
le_loyl = LabelEncoder()

df["location"] = le_loc.fit_transform(df["location"])

df["loyalty"] = le_loyl.fit_transform(df["loyalty"])

df.head()

Unnamed: 0,purchases,items,average,location,loyalty,lastseen
0,4931,4,1232.75,0,0,30
1,1137,2,568.5,1,0,25
2,2483,2,1241.5,5,0,19
3,1986,3,662.0,4,0,62
4,4862,5,972.4,3,0,5


splitting

In [329]:
x = df.drop(["purchases"], axis=1)  
y = df["purchases"]               

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


lazypredict regression

In [330]:

reg = LazyRegressor( ignore_warnings=True, random_state=42)

models, predictions = reg.fit(x_train, x_test, y_train, y_test)
models

  0%|          | 0/42 [00:00<?, ?it/s]

  "Stochastic Optimizer: Maximum iterations (%d) "


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,1.0,1.0,31.4,0.28
RandomForestRegressor,1.0,1.0,77.17,0.37
BaggingRegressor,1.0,1.0,81.24,0.07
XGBRegressor,1.0,1.0,100.7,0.17
HistGradientBoostingRegressor,1.0,1.0,104.29,0.52
GradientBoostingRegressor,1.0,1.0,107.19,0.2
DecisionTreeRegressor,1.0,1.0,107.73,0.02
ExtraTreeRegressor,1.0,1.0,114.93,0.02
LGBMRegressor,1.0,1.0,118.72,0.08
GaussianProcessRegressor,0.98,0.98,270.11,0.4


tuning

In [None]:
enet = ElasticNet(max_iter=10000)
param = {
    "alpha": [0.001, 0.01, 0.1, 0.5, 1, 10],
    "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    "random_state": list(range(1, 100)),
    "selection": ['cyclic', 'random']
}

In [341]:
grd_srch = GridSearchCV(enet,param,cv=5,scoring="r2")
grd_srch.fit(x_train,y_train)

In [342]:
print("Best Parameters:", grd_srch.best_params_)
print("Best R² Score on CV:", grd_srch.best_score_)


Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.7, 'random_state': 20, 'selection': 'random'}
Best R² Score on CV: 0.9157571396217558


Model

In [354]:

enet = ElasticNet(alpha= 0.1, l1_ratio= 0.7,  random_state=20, selection= 'random')
enet.fit(x_train,y_train)

y_train_pred = enet.predict(x_train )
y_test_pred = enet.predict(x_test)

Evaluate

In [355]:
MSE = mean_squared_error(y_test,y_test_pred)
RMSE = np.sqrt(MSE)


print("Train R²:", r2_score(y_train, y_train_pred))
print("Test R²:", r2_score(y_test, y_test_pred))
print("MSE : ",MSE )
print("RMSE:", RMSE)

Train R²: 0.9230635494586872
Test R²: 0.9179023226840927
MSE :  304626.2938027399
RMSE: 551.9296094636886


validation

In [357]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = cross_val_score(enet, x, y, cv=5, scoring='r2')

print("Cross-validated R² scores:", r2_scores)
print("Average R² score:", np.mean(r2_scores))
print("Standard Deviation of R²:", np.std(r2_scores))


neg_mse_scores = cross_val_score(enet, x, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-neg_mse_scores)
print("CV RMSE Scores:", rmse_scores)
print("Mean CV RMSE:", np.mean(rmse_scores))

Cross-validated R² scores: [0.90203158 0.93422587 0.91231293 0.92750835 0.90921282]
Average R² score: 0.9170583085249626
Standard Deviation of R²: 0.011948234831537844
CV RMSE Scores: [574.16036681 532.43878396 574.06291363 511.36305334 557.56810065]
Mean CV RMSE: 549.9186436780792
