In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the hourly weather CSV into a pandas DataFrame.
# Pass a single separator: `delimiter` is just an alias for `sep`, and pandas
# raises ValueError when both are supplied. The file is plain comma-separated;
# skipinitialspace=True drops any blanks that follow a comma.
df = pd.read_csv('weather_data.csv', sep=',', skipinitialspace=True)

In [3]:
# Show the frame's dimensions as (rows, columns).
df.shape

(21686, 16)

In [4]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,hourly_data_time,hourly_data_summary,hourly_data_icon,hourly_data_precip_intensity,hourly_data_precip_probability,hourly_data_temperature,hourly_data_apparent_temperature,hourly_data_dew_point,hourly_data_humidity,hourly_data_wind_speed,hourly_data_wind_gust,hourly_data_wind_bearing,hourly_data_cloud_cover,hourly_data_uv_index,hourly_data_visibility,hourly_data_pressure
0,2017-01-01 00:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,43.82,39.72,39.22,0.84,7.03,11.04,326.0,0.75,0.0,6.216,
1,2017-01-01 01:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,39.79,39.22,0.9,3.84,7.99,299.0,0.75,0.0,5.049,
2,2017-01-01 02:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,38.21,39.22,0.9,5.91,10.7,304.0,0.75,0.0,6.216,
3,2017-01-01 03:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,37.28,37.4,0.84,7.53,9.51,304.0,0.75,0.0,6.216,
4,2017-01-01 04:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,40.23,35.74,37.4,0.9,6.47,13.04,318.0,0.75,0.0,6.216,


In [5]:
# List the dtype pandas assigned to each column.
df.dtypes

hourly_data_time                     object
hourly_data_summary                  object
hourly_data_icon                     object
hourly_data_precip_intensity        float64
hourly_data_precip_probability      float64
hourly_data_temperature             float64
hourly_data_apparent_temperature    float64
hourly_data_dew_point               float64
hourly_data_humidity                float64
hourly_data_wind_speed              float64
hourly_data_wind_gust               float64
hourly_data_wind_bearing            float64
hourly_data_cloud_cover             float64
hourly_data_uv_index                float64
hourly_data_visibility              float64
hourly_data_pressure                float64
dtype: object

### Replace any missing values (Null, NaN, etc.) with 0 to make it easier to add this data to the transit database.

### Also split it into categorical and continuous data

In [6]:
# The two text features are treated as categoricals; every other feature is numeric.
categorical_columns = pd.Index(['hourly_data_summary', 'hourly_data_icon'])

# Cast both columns to the pandas 'category' dtype in a single astype call.
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

In [7]:
# Re-check the column dtypes after the category conversion.
df.dtypes

hourly_data_time                      object
hourly_data_summary                 category
hourly_data_icon                    category
hourly_data_precip_intensity         float64
hourly_data_precip_probability       float64
hourly_data_temperature              float64
hourly_data_apparent_temperature     float64
hourly_data_dew_point                float64
hourly_data_humidity                 float64
hourly_data_wind_speed               float64
hourly_data_wind_gust                float64
hourly_data_wind_bearing             float64
hourly_data_cloud_cover              float64
hourly_data_uv_index                 float64
hourly_data_visibility               float64
hourly_data_pressure                 float64
dtype: object

In [8]:
# Descriptive statistics for the categorical columns, transposed so each feature is a row.
df[categorical_columns].describe().T

Unnamed: 0,count,unique,top,freq
hourly_data_summary,21686,28,Mostly Cloudy,14956
hourly_data_icon,21686,10,partly-cloudy-day,9862


In [9]:
# Collect the names of the float-typed (continuous) features.
continuous_columns = df.select_dtypes(include=['float']).columns

# Descriptive statistics for those features, one feature per row.
df.loc[:, continuous_columns].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hourly_data_precip_intensity,20657.0,9.7e-05,0.009839,0.0,0.0,0.0,0.0,1.0
hourly_data_precip_probability,20657.0,0.029433,0.169021,0.0,0.0,0.0,0.0,1.0
hourly_data_temperature,21686.0,51.008321,8.950463,24.76,44.7225,50.86,57.12,80.98
hourly_data_apparent_temperature,21686.0,48.797408,11.248467,11.35,40.2,50.86,57.12,82.27
hourly_data_dew_point,21686.0,44.170854,7.794922,16.29,38.5725,44.59,49.93,65.74
hourly_data_humidity,21686.0,0.78365,0.112433,0.27,0.72,0.81,0.87,0.98
hourly_data_wind_speed,21681.0,9.706628,5.334474,0.0,5.82,8.92,12.75,40.27
hourly_data_wind_gust,20661.0,14.784843,9.602819,0.0,7.35,12.45,20.61,60.54
hourly_data_wind_bearing,21465.0,204.247845,77.683857,0.0,151.0,226.0,261.0,359.0
hourly_data_cloud_cover,21436.0,0.650481,0.195614,0.0,0.59,0.75,0.75,1.0


In [10]:
# Rows that repeat an earlier row; the first occurrence itself is not counted.
repeated_row_count = df.duplicated().sum()
print('Number of duplicate (excluding first) rows in the table is: ', repeated_row_count)

# keep=False flags every member of a duplicate group, including the row that
# was duplicated, so this count covers the originals as well.
rows_in_duplicate_groups = df.loc[df.duplicated(keep=False)]
print('Number of duplicate rows (including first) in the table is:', len(rows_in_duplicate_groups))

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


In [11]:
# Duplicate columns: transpose the frame so every column becomes a row,
# then reuse the row-wise duplicate check on the transposed frame.
dfT = df.transpose()

repeated_column_count = dfT.duplicated().sum()
columns_in_duplicate_groups = dfT.loc[dfT.duplicated(keep=False)]

print("Number of duplicate (excluding first) columns in the table is: ", repeated_column_count)
print("Number of duplicate (including first) columns in the table is: ",  len(columns_in_duplicate_groups))

Number of duplicate (excluding first) columns in the table is:  0
Number of duplicate (including first) columns in the table is:  0


In [12]:
# Count the missing (null/NaN) entries in each column.
df.isnull().sum()

hourly_data_time                       0
hourly_data_summary                    0
hourly_data_icon                       0
hourly_data_precip_intensity        1029
hourly_data_precip_probability      1029
hourly_data_temperature                0
hourly_data_apparent_temperature       0
hourly_data_dew_point                  0
hourly_data_humidity                   0
hourly_data_wind_speed                 5
hourly_data_wind_gust               1025
hourly_data_wind_bearing             221
hourly_data_cloud_cover              250
hourly_data_uv_index                 109
hourly_data_visibility                 5
hourly_data_pressure                1084
dtype: int64

In [13]:
# Columns that df.isnull().sum() reports as containing missing values.
columns_with_missing = [
    "hourly_data_precip_intensity",
    "hourly_data_precip_probability",
    "hourly_data_wind_speed",
    "hourly_data_wind_gust",
    "hourly_data_wind_bearing",
    "hourly_data_cloud_cover",
    "hourly_data_uv_index",
    "hourly_data_visibility",
    "hourly_data_pressure",
]

# Fill the gaps with numeric 0 and assign the result back to df.
# - Calling fillna(inplace=True) on df[col] acts on a temporary Series; under
#   pandas copy-on-write that never reaches df, so the frame is rebuilt instead.
# - Filling with 0 rather than a "Null" string keeps these columns float64
#   instead of turning them into object columns.
# NOTE(review): 0 is a real reading for some of these features (e.g. visibility,
# pressure) - confirm that a zero fill is acceptable for the downstream model.
df = df.fillna({column: 0 for column in columns_with_missing})

In [14]:
# Build a new frame in which any string value starting with "Null" is replaced
# by 0; replace() is called without inplace, so df itself is not modified here.
df_no_null = df.replace(to_replace=r'^Null', value=0, regex=True)

# Preview the first five rows of the result.
df_no_null.head(5)

Unnamed: 0,hourly_data_time,hourly_data_summary,hourly_data_icon,hourly_data_precip_intensity,hourly_data_precip_probability,hourly_data_temperature,hourly_data_apparent_temperature,hourly_data_dew_point,hourly_data_humidity,hourly_data_wind_speed,hourly_data_wind_gust,hourly_data_wind_bearing,hourly_data_cloud_cover,hourly_data_uv_index,hourly_data_visibility,hourly_data_pressure
0,2017-01-01 00:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,43.82,39.72,39.22,0.84,7.03,11.04,326.0,0.75,0.0,6.216,0.0
1,2017-01-01 01:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,39.79,39.22,0.9,3.84,7.99,299.0,0.75,0.0,5.049,0.0
2,2017-01-01 02:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,38.21,39.22,0.9,5.91,10.7,304.0,0.75,0.0,6.216,0.0
3,2017-01-01 03:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,42.02,37.28,37.4,0.84,7.53,9.51,304.0,0.75,0.0,6.216,0.0
4,2017-01-01 04:00:00,Mostly Cloudy,partly-cloudy-night,0.0,0.0,40.23,35.74,37.4,0.9,6.47,13.04,318.0,0.75,0.0,6.216,0.0


In [15]:
# Re-count missing entries per column in the replaced frame.
df_no_null.isnull().sum()

hourly_data_time                    0
hourly_data_summary                 0
hourly_data_icon                    0
hourly_data_precip_intensity        0
hourly_data_precip_probability      0
hourly_data_temperature             0
hourly_data_apparent_temperature    0
hourly_data_dew_point               0
hourly_data_humidity                0
hourly_data_wind_speed              0
hourly_data_wind_gust               0
hourly_data_wind_bearing            0
hourly_data_cloud_cover             0
hourly_data_uv_index                0
hourly_data_visibility              0
hourly_data_pressure                0
dtype: int64

### Some reading into weather effects on traffic conditions

- **Weather Impact on Traffic Conditions and Travel Time Prediction** http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.332.740&rep=rep1&type=pdf
- **The impact of weather and climate on transport in the UK** https://journals.sagepub.com/doi/abs/10.1177/030913339201600202?casa_token=wHHQW6d4vlkAAAAA:EGdo1zeOvJGpiRZrztXSDNQY-LYTj005-jUXZQmgiZtnicacIiTG9shWOWMFWPoj6bSpdxDJG3w
- **Modeling impacts of adverse weather conditions on a road network with uncertainties in demand and supply** https://www.sciencedirect.com/science/article/pii/S0191261508000301

From these it seems that the following features do need to be accounted for:

- hourly_data_precip_intensity
- hourly_data_precip_probability
- hourly_data_apparent_temperature
- hourly_data_wind_speed
- hourly_data_cloud_cover
- hourly_data_uv_index
- hourly_data_visibility

### Rain, temperature, cloud cover, UV index and visibility affect traffic the most according to the above studies

### Now to investigate features in this dataset and remove any that may not be useful for the machine learning model

### From the outset it can be seen that certain features would have little effect on traffic levels

- *summary & icon* provides no extra numerical info 
- *temperature* can use apparent temp instead as more noticeable to bus passengers
- *dew point* no extra information gathered from this
- *humidity* no strong effect on traffic over temp
- *wind gust* overall wind speed would be more noticeable to passengers
- *wind bearing* the direction of the wind would not significantly affect people. It may have an effect on the overall temp but this is accounted for in temp
- *pressure* not very noticeable by people. More useful for weather studies

### The other features can be used and investigated further

In [38]:
# Features judged to add little for the traffic model.
low_value_features = [
    'hourly_data_summary',
    'hourly_data_icon',
    'hourly_data_temperature',
    'hourly_data_dew_point',
    'hourly_data_humidity',
    'hourly_data_wind_gust',
    'hourly_data_wind_bearing',
    'hourly_data_pressure',
]

# Keep everything else; drop() returns a new frame and leaves df_no_null intact.
df_key_feats = df_no_null.drop(low_value_features, axis='columns')

# Preview the first ten rows of the reduced frame.
df_key_feats.head(10)

Unnamed: 0,hourly_data_time,hourly_data_precip_intensity,hourly_data_precip_probability,hourly_data_apparent_temperature,hourly_data_wind_speed,hourly_data_cloud_cover,hourly_data_uv_index,hourly_data_visibility
0,2017-01-01 00:00:00,0.0,0.0,39.72,7.03,0.75,0.0,6.216
1,2017-01-01 01:00:00,0.0,0.0,39.79,3.84,0.75,0.0,5.049
2,2017-01-01 02:00:00,0.0,0.0,38.21,5.91,0.75,0.0,6.216
3,2017-01-01 03:00:00,0.0,0.0,37.28,7.53,0.75,0.0,6.216
4,2017-01-01 04:00:00,0.0,0.0,35.74,6.47,0.75,0.0,6.216
5,2017-01-01 05:00:00,0.0,0.0,32.91,7.53,0.75,0.0,6.216
6,2017-01-01 06:00:00,0.0,0.0,31.69,7.03,0.58,0.0,6.216
7,2017-01-01 07:00:00,0.0,0.0,30.4,7.53,0.19,0.0,6.216
8,2017-01-01 08:00:00,0.0,0.0,30.69,7.6,0.19,0.0,6.216
9,2017-01-01 09:00:00,0.0,0.0,32.24,8.16,0.49,0.0,6.216


In [40]:
# Recompute the float-typed feature names, this time from the reduced frame
# (this rebinds continuous_columns).
continuous_columns = df_key_feats.select_dtypes(include=['float']).columns

# Descriptive statistics for the retained continuous features, one per row.
df_key_feats.loc[:, continuous_columns].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hourly_data_precip_intensity,21686.0,9.2e-05,0.009603,0.0,0.0,0.0,0.0,1.0
hourly_data_precip_probability,21686.0,0.028037,0.165081,0.0,0.0,0.0,0.0,1.0
hourly_data_apparent_temperature,21686.0,48.797408,11.248467,11.35,40.2,50.86,57.12,82.27
hourly_data_wind_speed,21686.0,9.70439,5.335895,0.0,5.82,8.91,12.75,40.27
hourly_data_cloud_cover,21686.0,0.642982,0.206508,0.0,0.59,0.75,0.75,1.0
hourly_data_uv_index,21686.0,0.769068,1.400542,0.0,0.0,0.0,1.0,7.0
hourly_data_visibility,21686.0,6.014229,0.892211,0.0,6.216,6.216,6.216,10.0


In [43]:
# Write the reduced frame to CSV; index=False leaves the row index out of the file.
df_key_feats.to_csv('weather_data_cleaned_1.csv', index=False)