In [26]:
import pandas as pd
import joblib

from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [27]:
def remove_outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return df.copy(deep=True)[(df[col] > lower_bound) & (df[col] < upper_bound)]

In [28]:
def format_dates(df: pd.DataFrame, col: str) -> pd.DataFrame:
    df[col] = pd.to_datetime(df[col])
    df["month"] = df[col].dt.month
    df["day"] = df[col].dt.day
    df["hour"] = df[col].dt.hour
    df = df.drop(columns=[col])
    return df.copy(deep=True)

In [29]:
def normalize(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    normalized_df = df.copy(deep=True)
    normalized_df[cols] = min_max_scaler.fit_transform(
        std_scaler.fit_transform(normalized_df[cols])
    )
    return normalized_df

In [30]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    new_df = remove_outliers(df, "windspeed")
    new_df = format_dates(new_df, "datetime")
    new_df = normalize(new_df, ["temp", "atemp", "humidity", "windspeed"])
    return new_df

In [31]:
model = joblib.load('./models/regressor.pkl')
model

In [32]:
df_test = pd.read_csv('./data/test.csv')
df_test.sample(20)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
5072,2012-07-29 04:00:00,3,0,0,1,26.24,30.305,69,12.998
4925,2012-07-23 01:00:00,3,0,1,1,27.06,29.545,89,12.998
2749,2011-11-23 02:00:00,4,0,1,3,20.5,24.24,94,23.9994
2790,2011-11-24 19:00:00,4,1,0,1,16.4,20.455,54,7.0015
6313,2012-12-24 11:00:00,1,0,1,2,10.66,12.88,60,11.0014
1684,2011-07-25 04:00:00,3,0,1,1,29.52,35.605,84,6.0032
713,2011-03-30 17:00:00,2,0,1,3,10.66,13.635,93,7.0015
3917,2012-03-27 01:00:00,2,0,1,1,12.3,13.635,26,31.0009
2853,2011-11-27 10:00:00,4,0,0,1,18.86,22.725,72,19.0012
4819,2012-06-29 15:00:00,3,0,1,1,39.36,45.455,36,0.0


In [33]:
preprocessed_df_test = preprocess(df_test)
preprocessed_df_test.sample(20)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,day,hour
5495,3,0,0,1,0.708333,0.6515,0.5,0.419278,9,22,19
5445,3,0,1,1,0.645833,0.6212,0.404762,0.483899,9,20,17
5189,3,0,1,1,0.625,0.5909,0.738095,0.0,8,22,1
5043,3,0,1,1,0.770833,0.7273,0.595238,0.0,7,27,23
3733,1,0,1,3,0.291667,0.303,0.77381,0.354874,2,29,9
380,1,0,1,3,0.541667,0.5152,0.52381,1.0,2,25,13
5821,4,0,1,2,0.520833,0.5,0.797619,0.419278,10,25,9
627,2,0,0,2,0.229167,0.2273,0.428571,0.419278,3,27,1
5490,3,0,0,1,0.770833,0.6818,0.380952,0.612924,9,22,14
714,2,0,1,2,0.25,0.2576,0.916667,0.483899,3,30,18


In [34]:
y_pred = model.predict(preprocessed_df_test)
y_pred = y_pred.astype(int)

In [35]:
preprocessed_df_test['prediction'] = pd.Series(y_pred)
preprocessed_df_test

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,day,hour,prediction
0,1,0,1,1,0.25,0.2273,0.476190,0.838772,1,20,0,44.0
1,1,0,1,1,0.25,0.2727,0.476190,0.000000,1,20,1,52.0
2,1,0,1,1,0.25,0.2727,0.476190,0.000000,1,20,2,60.0
3,1,0,1,1,0.25,0.2576,0.476190,0.354874,1,20,3,68.0
4,1,0,1,1,0.25,0.2576,0.476190,0.354874,1,20,4,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,1,0,1,2,0.25,0.2576,0.523810,0.354874,12,31,19,
6489,1,0,1,2,0.25,0.2576,0.523810,0.354874,12,31,20,
6490,1,0,1,1,0.25,0.2576,0.523810,0.354874,12,31,21,
6491,1,0,1,1,0.25,0.2727,0.476190,0.290253,12,31,22,


In [40]:
preprocessed_df_test.dropna(inplace=True)
preprocessed_df_test.sample(50)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,day,hour,prediction
3613,1,0,1,2,0.375,0.3939,0.714286,0.709747,2,24,8,237.0
5343,3,0,1,1,0.791667,0.7273,0.464286,0.548303,8,28,11,117.0
1352,3,0,1,1,0.729167,0.697,0.690476,0.225848,6,22,8,102.0
350,1,0,1,1,0.1875,0.197,0.630952,0.419278,2,24,6,150.0
4854,3,0,1,2,0.666667,0.5909,0.869048,0.193646,7,20,2,209.0
5207,3,0,1,2,0.666667,0.6212,0.690476,0.0,8,22,19,195.0
3242,1,0,1,2,0.1875,0.2273,0.25,0.225848,1,20,19,192.0
2123,3,0,1,2,0.5625,0.5303,0.678571,0.354874,9,20,0,101.0
1934,3,0,1,1,0.729167,0.6515,0.166667,0.193646,8,23,14,160.0
3206,1,0,0,3,0.375,0.3939,0.714286,0.0,12,31,7,100.0
