In [1]:
import datetime
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import json

from sklearn import datasets, ensemble, model_selection
from scipy.stats import anderson_ksamp

In [2]:
# Download the UCI Bike Sharing archive and read the hourly table out of it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# BadZipFile error further down.
response = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
    timeout=60,
)
response.raise_for_status()
content = response.content

# Both the archive and the member handle are closed once the CSV is parsed.
with zipfile.ZipFile(io.BytesIO(content)) as arc, arc.open("hour.csv") as hour_csv:
    raw_data = pd.read_csv(hour_csv, header=0, sep=',', parse_dates=['dteday'])

In [3]:
# Index every row by its hourly timestamp: the calendar day of `dteday`
# (time-of-day dropped) plus `hr` hours. Vectorized replacement for the
# row-wise datetime.combine(...) apply; it yields the same DatetimeIndex.
raw_data.index = pd.DatetimeIndex(
    raw_data['dteday'].dt.normalize() + pd.to_timedelta(raw_data['hr'], unit='h')
)

In [4]:
# Preview the first rows; the row labels are the hourly timestamps assigned as the index.
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
# Summarize the index range, column dtypes and non-null counts.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17379 entries, 2011-01-01 00:00:00 to 2012-12-31 23:00:00
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64  

In [6]:
# Columns fed to the drift tests and to the model, grouped by how they are tested.
numerical_features = [
    'temp', 'atemp', 'hum', 'windspeed',
    'mnth', 'hr', 'weekday',
]
categorical_features = [
    'season', 'holiday', 'workingday',
]

# Label-based slices on the DatetimeIndex include both endpoints.
reference_window = slice('2011-01-01 00:00:00', '2011-01-28 23:00:00')
current_window = slice('2011-01-29 00:00:00', '2011-02-28 23:00:00')

reference = raw_data.loc[reference_window]
current = raw_data.loc[current_window]

In [7]:
# Null hypothesis: the two samples come from the same distribution.

from scipy import stats

# Count the numerical features for which the two-sample KS test rejects
# the null hypothesis at the 5% level.
counter = 0
for feature in numerical_features:
    ks_result = stats.ks_2samp(reference[feature], current[feature])
    if ks_result.pvalue < 0.05:
        counter += 1

In [8]:
# Percentage of numerical features flagged as drifted by the KS test.
# The denominator follows the feature list instead of a hard-coded 7, so the
# figure stays correct if features are added or removed.
counter / len(numerical_features) * 100

71.42857142857143

In [9]:
from scipy.stats import chi2_contingency

# Number of categorical features whose chi-square p-value falls below 0.05.
rejected_chi=0
def drift_chisq(sample_1, sample_2):
    """Return the chi-square p-value comparing two sets of category counts.

    Parameters
    ----------
    sample_1, sample_2 : pandas.Series, mapping or sequence
        Category counts for each sample, e.g. the result of
        ``Series.value_counts()``. Labelled inputs are matched by category
        label; plain sequences are matched by position.

    Returns
    -------
    float
        The p-value of ``scipy.stats.chi2_contingency`` on the 2 x k table.

    Notes
    -----
    ``value_counts()`` orders categories by frequency, so the two inputs may
    list the same categories in different orders, or one may lack a category
    entirely. The counts are therefore aligned on their labels first, with
    categories absent from one sample counted as zero, so that each column of
    the contingency table refers to a single category.
    """
    aligned = pd.concat(
        [pd.Series(sample_1), pd.Series(sample_2)], axis=1
    ).fillna(0)
    # Rows of the contingency table are the samples, columns the categories.
    return chi2_contingency(aligned.T.to_numpy())[1]

# Tally the categorical features whose category counts differ significantly
# (p < 0.05) between the reference and current windows.
for feature in categorical_features:
    reference_counts = reference[feature].value_counts()
    current_counts = current[feature].value_counts()
    if drift_chisq(reference_counts, current_counts) < 0.05:
        rejected_chi += 1
        
    

In [10]:
rejected_chi

0

In [11]:
# COMPLETE MODEL BUILDING

In [12]:


# Hold out 30% of the reference window for evaluation. The split is seeded so
# the reported metrics are reproducible after a kernel restart (the regressor
# itself is already seeded).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + categorical_features],
    reference['cnt'],
    test_size=0.3,
    random_state=0,
)

In [13]:
# Fit a seeded random forest on the training split (fit() returns the
# estimator itself), then predict the held-out rows.
regressor = ensemble.RandomForestRegressor(random_state=0).fit(X_train, y_train)
preds_test = regressor.predict(X_test)

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hold-out performance on the reference window.
ref_mae = mean_absolute_error(y_test, preds_test)
ref_mse = mean_squared_error(y_test, preds_test)
ref_r2 = r2_score(y_test, preds_test)

for label, score in (("MAE", ref_mae), ("MSE", ref_mse), ("R2", ref_r2)):
    print(label, score)

MAE 10.580752688172042
MSE 230.75542258064516
R2 0.8762289470121575


In [15]:
# Apply the model trained on the reference window to the whole current window.
feature_columns = numerical_features + categorical_features
current_x = current[feature_columns]
current_y = current['cnt']
current_pred = regressor.predict(current_x)

In [16]:
# Performance of the reference-trained model on the current window.
curr_mae = mean_absolute_error(current_y, current_pred)
curr_mse = mean_squared_error(current_y, current_pred)
curr_r2 = r2_score(current_y, current_pred)

for label, score in (("MAE", curr_mae), ("MSE", curr_mse), ("R2", curr_r2)):
    print(label, score)

MAE 19.660834492350485
MSE 1054.8304436717663
R2 0.7271415708079487


In [17]:
import mlflow

In [20]:
# Select the MLflow experiment that the runs started below are recorded under.
# Per the log output of this cell, the experiment is created when it does not exist yet.
mlflow.set_experiment("Bicycle–Sharing")


2025/10/31 22:20:46 INFO mlflow.tracking.fluent: Experiment with name 'Bicycle–Sharing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/apple/Desktop/mlruns/738021474840256342', creation_time=1761929446391, experiment_id='738021474840256342', last_update_time=1761929446391, lifecycle_stage='active', name='Bicycle–Sharing', tags={}>

In [21]:
# Record the reference hold-out metrics and the fitted model as the baseline run.
# The run name is passed to start_run() directly (it sets the same
# `mlflow.runName` tag) and the misspelled "Refrence_run" label is corrected.
with mlflow.start_run(run_name='Reference_run'):
    mlflow.log_metrics({"MAE": ref_mae, "MSE": ref_mse, "R2": ref_r2})

    mlflow.sklearn.log_model(regressor, "model")



In [22]:
# Launch the MLflow tracking UI from the notebook.
# NOTE(review): the captured output shows Uvicorn serving on http://127.0.0.1:5000
# and waiting for CTRL+C, so this cell presumably keeps running until it is
# interrupted — confirm before relying on "Run All".
!mlflow ui

[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m13229[0m]
[32mINFO[0m:     Started server process [[36m13232[0m]
[32mINFO[0m:     Started server process [[36m13233[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m13234[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m13231[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     127.0.0.1:65313 - "[1mGET / HTTP/1.1[0m" [32m200 OK[0m
[32mINFO[0m:     127.0.0.1:65313 - "[1mGET /static-files/static/js/main.e1ed6ef5.js HTTP/1.1[0m" [32m200 OK[0m
[32mINFO[

In [18]:
# Current (post-reference) window that the batches below are cut from.
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

# Consecutive, non-overlapping (start, end) windows; both endpoints are
# inclusive when used with .loc. The second batch now starts on 2011-02-08:
# it previously started on 2011-02-07, a day the first batch already covers,
# so that day's rows were scored in two runs.
experiment_batches = [
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-08 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),
]

In [19]:
# Score the trained regressor on each batch window and log the resulting
# metrics as a separate MLflow run named after the window.
for batch_start, batch_end in experiment_batches:
    with mlflow.start_run():
        mlflow.set_tag('mlflow.runName', f"Run_{batch_start} : {batch_end}")

        current_data = current.loc[batch_start:batch_end]
        current_x = current_data[numerical_features + categorical_features]
        current_y = current_data['cnt']
        current_pred = regressor.predict(current_x)

        mae = mean_absolute_error(current_y, current_pred)
        mse = mean_squared_error(current_y, current_pred)
        r2 = r2_score(current_y, current_pred)

        for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("R2", r2)):
            mlflow.log_metric(metric_name, metric_value)


In [20]:
# Second attempt to start the MLflow UI; per the captured output it fails with
# "[Errno 48] Address already in use".
!mlflow ui

[31mERROR[0m:    [Errno 48] Address already in use
Running the mlflow server failed. Please see the logs above for details.


In [24]:
# List the processes that currently hold port 5000.
!lsof -i :5000

COMMAND     PID  USER   FD   TYPE             DEVICE SIZE/OFF NODE NAME
python3.1 13321 apple    3u  IPv4 0xca6caa2fbac01f74      0t0  TCP localhost:commplex-main (LISTEN)
ControlCe 13423 apple   10u  IPv4 0x495bab8075e97f9f      0t0  TCP *:commplex-main (LISTEN)
ControlCe 13423 apple   11u  IPv6 0x1ac84d4713a608f2      0t0  TCP *:commplex-main (LISTEN)


In [25]:
# NOTE(review): 13321 is the PID of the python3.1 listener taken from the
# `lsof` listing of the previous cell. It is only valid for that session —
# look up the current PID before re-running this cell.
!kill -9 13321

In [None]:
# Start the MLflow UI again now that the earlier listener has been killed.
!mlflow ui 

[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m13435[0m]
[32mINFO[0m:     Started server process [[36m13439[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m13438[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m13440[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Started server process [[36m13437[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     127.0.0.1:49591 - "[1mGET / HTTP/1.1[0m" [32m200 OK[0m
[32mINFO[0m:     127.0.0.1:49591 - "[1mGET /ajax-api/2.0/mlflow/experiments/search?max_results=25&order_by=last_update_time+D

In [None]:
# Run the same evaluation for the last week as well; you will see the degradation in MAE and MSE.