### Weather Data Preparation

In [1]:
import pandas as pd

In [2]:
# Load the raw weather observations from the hourly CSV file.
weather_csv_path = '../00_data/weather_hourly_la.csv'
weather_df = pd.read_csv(weather_csv_path)

In [3]:
# Count the missing values in each column of the loaded frame.
weather_df.isna().sum(axis=0)

date_time    92
max_temp     92
min_temp     92
precip       90
dtype: int64

In [4]:
# Report how many rows lack a timestamp, both as a count and as a
# percentage of all rows.
n_datetime_na = weather_df["date_time"].isna().sum()
missing_share = n_datetime_na / len(weather_df) * 100
print(f"Number of missing datetime values: {n_datetime_na} ({missing_share:.2f}%)")

Number of missing datetime values: 92 (0.21%)


In [5]:
# Discard every row that has no timestamp.
has_timestamp = weather_df['date_time'].notna()
weather_df = weather_df.loc[has_timestamp]

In [6]:
# Re-count missing values per column after dropping rows without a timestamp.
remaining_nulls = weather_df.isna().sum()
remaining_nulls

date_time    0
max_temp     0
min_temp     0
precip       0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
summary_stats = weather_df.describe()
summary_stats

Unnamed: 0,max_temp,min_temp,precip
count,43756.0,43756.0,43756.0
mean,17.928581,17.88525,0.019814
std,4.198326,4.20856,0.139364
min,2.8,2.8,0.0
25%,15.0,15.0,0.0
50%,17.8,17.8,0.0
75%,20.6,20.6,0.0
max,39.4,39.4,1.0


In [8]:
# Parse the `date_time` column into pandas datetimes. `assign` builds a new
# frame rather than writing a column into `weather_df`, which at this point is
# a boolean-filtered selection of the loaded data; in-place column assignment
# on such a selection can raise pandas' SettingWithCopyWarning.
weather_df = weather_df.assign(date_time=pd.to_datetime(weather_df['date_time']))

In [9]:
# Print the first and last timestamps in day.month.year hour:minute:second form.
datetime_format = '%d.%m.%Y %H:%M:%S'
first_seen = weather_df['date_time'].min()
last_seen = weather_df['date_time'].max()
print(f"earliest observation: {first_seen:{datetime_format}}")
print(f"latest observation: {last_seen:{datetime_format}}")

earliest observation: 01.01.2015 09:00:00
latest observation: 02.01.2020 08:00:00


In [10]:
# Number of rows before the data is restricted to a single year.
weather_df.shape[0]

43756

In [11]:
# Keep only observations from calendar year 2019, i.e. drop everything before
# 01.01.2019 or after 31.12.2019. The upper bound is exclusive
# (< 2020-01-01) so that a timestamp in the final second of the year with
# sub-second precision is still kept; a closed bound at 23:59:59 would drop it.
in_2019 = (
    (weather_df["date_time"] >= "2019-01-01 00:00:00")
    & (weather_df["date_time"] < "2020-01-01 00:00:00")
)
weather_df = weather_df.loc[in_2019]

In [12]:
# Some timestamps occur more than once. Show a few of the repeated rows
# (every occurrence after the first), ordered by row label.
is_repeat = weather_df.duplicated(subset='date_time')
weather_df.loc[is_repeat].sort_index().head(4)

Unnamed: 0,date_time,max_temp,min_temp,precip
35162,2019-01-06 02:00:00,12.8,12.2,1.0
35165,2019-01-06 05:00:00,10.0,10.0,0.0
35177,2019-01-05 17:00:00,12.8,12.2,0.0
35180,2019-01-05 20:00:00,14.4,14.4,0.0


In [13]:
# Collapse duplicate timestamps into one row per `date_time` (which becomes
# the index). Each column gets an aggregate that matches its meaning instead
# of a blanket mean: the highest max_temp, the lowest min_temp, and the
# largest precip value, so precip stays 1 if any duplicate reported
# precipitation (a mean would produce fractions such as 0.5).
# NOTE(review): assumes precip is a 0/1 indicator, as the describe() output
# above suggests — confirm against the data source.
weather_df = weather_df.groupby('date_time').agg(
    {'max_temp': 'max', 'min_temp': 'min', 'precip': 'max'}
)

In [14]:
# Persist the prepared frame as a pickle file.
weather_df.to_pickle("../00_data/weather.pkl")