In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# The data directory sits two levels above the current working directory.
# resolve(strict=True) raises if the working directory itself cannot be resolved.
data_dir = Path(".").resolve(strict=True).parent.parent.joinpath("data")
assert data_dir.is_dir()

# Raw input file; it must already exist on disk.
dataset_path = data_dir.joinpath("london_merged.csv")
assert dataset_path.is_file()

# Destination for the engineered feature table (existence is not checked here).
features_path = data_dir.joinpath("london_features.csv")

In [3]:
# Load the raw CSV and fail fast if it has no rows or no columns.
dataset = pd.read_csv(filepath_or_buffer=dataset_path)
assert dataset.size > 0

In [4]:
# Derive the engineered columns. Target feature set: t1, t2, hum, wind_speed,
# is_holiday, is_weekend, is_summer, is_winter, is_good_weather, month, hour.

# Season flags: season code 1 is treated as summer, code 3 as winter.
dataset["is_summer"] = (dataset["season"] == 1).astype(int)
dataset["is_winter"] = (dataset["season"] == 3).astype(int)

# Weather codes up to and including 4 are treated as good weather.
dataset["is_good_weather"] = (dataset["weather_code"] <= 4).astype(int)

# Parse the timestamp column once and reuse it for both calendar features
# instead of running pd.to_datetime over the whole column twice.
timestamps = pd.to_datetime(dataset["timestamp"])
dataset["month"] = timestamps.dt.month
dataset["hour"] = timestamps.dt.hour

In [5]:
# Remove the raw columns that are not part of the final feature set.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# object in place across cells and keeps the call chainable.
dataset = dataset.drop(columns=["timestamp", "season", "weather_code"])

In [6]:
# Sanity check: the frame must contain exactly the expected feature columns
# plus the target column, in any order.
features = [
    "t1",
    "t2",
    "hum",
    "wind_speed",
    "is_holiday",
    "is_weekend",
    "is_summer",
    "is_winter",
    "is_good_weather",
    "month",
    "hour",
]
target = "cnt"
assert sorted(dataset.columns) == sorted([*features, target])

In [7]:
# Replace the target with log(1 + cnt).
dataset["cnt"] = np.log1p(dataset["cnt"])

# Standardize every column (z-score): subtract the column mean and divide by
# the column standard deviation. This is NOT a 0-1 min-max scaling; the
# resulting values can be negative or greater than 1.
# All columns are included, so the log-transformed target and the 0/1 flag
# columns are standardized as well.
# The 1e-8 term keeps the division finite when a column is constant (std == 0).
dataset = (dataset - dataset.mean()) / (dataset.std() + 1e-8)

In [8]:
# Preview the first five rows of the transformed frame.
dataset.head(n=5)

Unnamed: 0,cnt,t1,t2,hum,wind_speed,is_holiday,is_weekend,is_summer,is_winter,is_good_weather,month,hour
0,-0.958454,-1.699282,-1.439248,1.444475,-1.255681,-0.150157,1.582302,-0.580295,1.738256,0.381739,-1.597286,-1.664755
1,-1.172838,-1.699282,-1.363664,1.444475,-1.38235,-0.150157,1.582302,-0.580295,1.738256,0.381739,-1.597286,-1.52016
2,-1.1956,-1.78902,-1.363664,1.689005,-2.015697,-0.150157,1.582302,-0.580295,1.738256,0.381739,-1.597286,-1.375566
3,-1.674876,-1.878757,-1.439248,1.933535,-2.015697,-0.150157,1.582302,-0.580295,1.738256,0.381739,-1.597286,-1.230971
4,-2.001707,-1.878757,-1.741585,1.444475,-1.192346,-0.150157,1.582302,-0.580295,1.738256,0.381739,-1.597286,-1.086377


In [9]:
# Write the transformed frame to disk without the row index column.
dataset.to_csv(path_or_buf=features_path, index=False)