In [57]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import datetime
import mlflow
from scipy.stats import ks_2samp, chi2_contingency

from sklearn import datasets, ensemble, model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# from evidently.report import Report
# from evidently.metric_preset import DataDriftPreset
# from evidently import ColumnMapping


In [10]:
content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
    raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])

In [11]:
raw_data.index = raw_data.apply(lambda row: datetime.datetime.combine(row.dteday.date(), datetime.time(row.hr)), axis=1)


In [12]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [13]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17379 entries, 2011-01-01 00:00:00 to 2012-12-31 23:00:00
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64  

In [None]:
# numerical_features = [i for i in raw_data.columns if raw_data[i].dtype in [int, float]]

In [None]:
# len(numerical_features)

16

In [26]:
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
cat_features = ['season', 'holiday', 'workingday']

reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

In [30]:
ks_2samp(reference['temp'], current['temp'])[1]

np.float64(6.026276035162504e-32)

In [31]:
data_drift = 0
for col in numerical_features:
    ks = ks_2samp(reference[col], current[col])[1]
    if ks < 0.05 :
        data_drift += 1

In [32]:
data_drift

5

In [None]:
# reference['season'].value_counts()

season
1    618
Name: count, dtype: int64

In [37]:
data_drift = 0
for col in cat_features:
    ks = chi2_contingency([reference[col].value_counts(), current[col].value_counts()])[1]
    if ks < 0.05 :
        data_drift += 1

data_drift

0

In [38]:
target = 'cnt'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
cat_features = ['season', 'holiday', 'workingday']


In [46]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + cat_features],
    reference[target],
    test_size = 0.3
)

In [47]:
reg = ensemble.RandomForestRegressor(
    random_state=0
)
reg.fit(x_train, y_train)
pred = reg.predict(x_test)

In [48]:
ref_mae=mean_absolute_error(y_test,pred)
ref_mse=mean_squared_error(y_test,pred)
ref_r2 = r2_score(y_test,pred)


print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)

MAE 11.972795698924735
MSE 338.55659247311826
R2 0.8650485213907592


In [49]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    current[numerical_features + cat_features],
    current[target],
    test_size = 0.3
)

In [50]:
reg = ensemble.RandomForestRegressor(
    random_state=0
)
reg.fit(x_train, y_train)
pred = reg.predict(x_test)

In [51]:
ref_mae=mean_absolute_error(y_test,pred)
ref_mse=mean_squared_error(y_test,pred)
ref_r2 = r2_score(y_test,pred)


print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)

MAE 14.774444444444443
MSE 589.0735055555556
R2 0.8430636466130217


In [52]:
experiment_batches = [
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-07 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),
]

In [54]:
for date in experiment_batches:
	with mlflow.start_run() as run: #inside brackets run_name='test'
		mlflow.set_tag('mlflow.runName',"run_"+str(date[0])+" : "+str(date[1]))

		# Log parameters
		mlflow.log_param("start_date", date[0])
		mlflow.log_param("end_date", date[1])

		# Log metrics
		current_data=current.loc[date[0]:date[1]]
		current_x=current_data[numerical_features + cat_features]
		current_y=current_data[target]
		current_pred = reg.predict(current_x)


		mae=mean_absolute_error(current_y,current_pred)
		mse=mean_squared_error(current_y,current_pred)
		r2 = r2_score(current_y,current_pred)

		# Log metrics
		mlflow.log_metric('MAE', round(mae, 3))
		mlflow.log_metric('MSE', round(mse, 3))
		mlflow.log_metric('R2', round(r2, 3))

		print(run.info)


<RunInfo: artifact_uri='file:///Users/kavithiyagu/Documents/Projects/Deployments/mlruns/0/35e9810e89ca4ab3ad767e5aba2ca528/artifacts', end_time=None, experiment_id='0', lifecycle_stage='active', run_id='35e9810e89ca4ab3ad767e5aba2ca528', run_name='learned-midge-816', run_uuid='35e9810e89ca4ab3ad767e5aba2ca528', start_time=1746174137880, status='RUNNING', user_id='kavithiyagu'>
<RunInfo: artifact_uri='file:///Users/kavithiyagu/Documents/Projects/Deployments/mlruns/0/b02ba1ed7c704a6fb07ad12de5ee5788/artifacts', end_time=None, experiment_id='0', lifecycle_stage='active', run_id='b02ba1ed7c704a6fb07ad12de5ee5788', run_name='handsome-gnu-21', run_uuid='b02ba1ed7c704a6fb07ad12de5ee5788', start_time=1746174137900, status='RUNNING', user_id='kavithiyagu'>
<RunInfo: artifact_uri='file:///Users/kavithiyagu/Documents/Projects/Deployments/mlruns/0/9cae43a350e04ca8bb4f3f15b53c7e87/artifacts', end_time=None, experiment_id='0', lifecycle_stage='active', run_id='9cae43a350e04ca8bb4f3f15b53c7e87', run_