## Explore & Clean Weather Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
# Read the raw weather observations. 'Date' is parsed to datetime on load
# because later cells use it through the .dt accessor.
weather = pd.read_csv("data/weather.csv", parse_dates=["Date"])

# Preview the first five rows.
weather.head(n=5)

### Explore the Data

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
weather.info()

In [None]:
# Count the missing values in each column.
weather.isna().sum()

In [None]:
# Summary statistics for the columns pandas selects by default.
weather.describe()

In [None]:
# Summarize every column, with datetimes treated as numeric (min/mean/max
# instead of top/freq). The `datetime_is_numeric` keyword only exists in
# pandas 1.1-1.5; pandas 2.0 removed it and always behaves that way, so fall
# back to the plain call when the keyword is rejected.
try:
    all_columns_summary = weather.describe(include='all', datetime_is_numeric=True)
except TypeError:
    all_columns_summary = weather.describe(include='all')

all_columns_summary

In [None]:
# Centre and shape statistics for a handful of numeric columns.
stat_columns = ['MaxTemp', 'Sunshine', 'Humidity3pm', 'Rainfall']
stat_names = ['mean', 'median', 'skew', 'kurtosis']

weather[stat_columns].agg(stat_names)

### Feature Engineering - Add Feature

In [None]:
# Report the span of dates covered by the dataset.
observation_dates = weather["Date"]

print("First Date: ", observation_dates.min())
print("Last Date: ", observation_dates.max())

In [None]:
# Number of rows recorded in each calendar year.
weather["Date"].dt.year.value_counts()

In [None]:
# Number of rows recorded in each month (1-12), across all years.
weather["Date"].dt.month.value_counts()

In [None]:
# Derive two categorical month features from the observation date:
# the month number and the month name.
date_parts = weather["Date"].dt

month_number = pd.Categorical(date_parts.month)
month_name = pd.Categorical(date_parts.month_name())

weather["Month"] = month_number
weather["MonthText"] = month_name

# Confirm the new columns were added with the category dtype.
weather.info()

In [None]:
# What do you notice about the relationship between the month integer
# and the category codes? (Compare with the counts in the next cell.)
month_codes = weather["Month"].cat.codes
month_codes.value_counts()

In [None]:
# Row counts keyed by the month category itself.
weather["Month"].value_counts()

### Explore Using Visualizations

In [None]:
# The seaborn style must be set BEFORE the axes are created: grid and
# background settings are read when the axes are built, so setting the style
# afterwards leaves this figure unstyled on a fresh run.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(7, 7))

# Sunshine distribution split by tomorrow's rain outcome, with today's rain
# as the hue. Draw explicitly on the axes created above.
sns.boxplot(x="RainTomorrow", y="Sunshine", hue='RainToday', width=0.6, data=weather, ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))

# Count histogram plus a rug of individual observations.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` and
# `rugplot` are its documented replacements for the hist + rug combination.
sns.histplot(data=weather, x="Humidity3pm", kde=False, ax=ax)
sns.rugplot(data=weather, x="Humidity3pm", ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))

# Percentage of ALL rows that fall in each (gust direction, RainTomorrow)
# pair. Computing it with pandas avoids feeding a string column to seaborn as
# the value axis and skips the pointless bootstrap over a row count.
# groupby drops rows where either key is missing, matching what the plot
# previously showed.
gust_percent = (
    weather.groupby(['WindGustDir', 'RainTomorrow']).size() / len(weather) * 100
).rename('Percent').reset_index()

ax = sns.barplot(x='Percent', y='WindGustDir', hue='RainTomorrow', orient='h', data=gust_percent
                 # Most frequent direction first; hue levels in order of first appearance in the raw data.
                 ,order=weather['WindGustDir'].value_counts().index
                 ,hue_order=weather['RainTomorrow'].dropna().unique()
                 ,ax=ax)
ax.set(xlabel="Percent")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))

# Count histogram plus a rug of individual observations.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` and
# `rugplot` are its documented replacements for the hist + rug combination.
sns.histplot(data=weather, x="Rainfall", kde=False, ax=ax)
sns.rugplot(data=weather, x="Rainfall", ax=ax)
plt.show()

In [None]:
# Set the seaborn style BEFORE creating the axes; style settings are read
# when the axes are built and do not restyle an existing figure.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 10))

# Rainfall distribution for each month category, drawn on the axes above.
sns.boxplot(x="Month", y="Rainfall", width=0.6, data=weather, ax=ax)
plt.show()

In [None]:
# Numeric companion to the box plot: rainfall statistics per month.
rainfall_by_month = weather.groupby("Month")["Rainfall"]
rainfall_by_month.agg(['mean', 'median', 'min', 'max'])

### Handle Missing Values
- Fill the missing values before preparing the data for modeling
- With pipelines (next week!) we can handle these with imputation within the pipeline

In [None]:
# Count the missing values in each column before filling.
weather.isna().sum()

In [None]:
# Per-column replacement values: the column mean for Sunshine and the most
# frequent value for WindGustDir. Other columns are left untouched.
replacements = {
    "Sunshine": weather["Sunshine"].mean(),
    "WindGustDir": weather["WindGustDir"].value_counts().index[0],
}
weather.fillna(value=replacements, inplace=True)

# Re-check the remaining missing values per column.
weather.isna().sum()

### Serialize My Dataset

In [None]:
import joblib

# Columns written out for the next step: the RainTomorrow column followed by the selected features.
cols_to_keep = [
    'RainTomorrow',
    'Rainfall',
    'Sunshine',
    'MaxTemp',
    'Humidity3pm',
    'WindGustDir',
    'RainToday',
    'Month',
    'MonthText',
]

saved_weather = weather.loc[:, cols_to_keep]

# Write the trimmed frame to disk with joblib through a binary file handle.
with open("data/weather.pkl", "wb") as pickle_file:
    joblib.dump(saved_weather, pickle_file)