In [1]:
import pandas as pd
import datetime as dt

# Weather data cleaning

### CSV comes from historical weather dump from OpenWeatherMap API

In [2]:
# Load the raw hourly weather export into a DataFrame.
weatherDF = pd.read_csv(filepath_or_buffer='historicalWeatherDataDublin2018.csv')

In [3]:
# Preview the raw frame as loaded.
weatherDF

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1514678400,2017-12-31 00:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,9.91,6.41,8.00,12.00,...,140,,,,,75,803,Clouds,broken clouds,04n
1,1514682000,2017-12-31 01:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,11.09,1.29,11.00,11.57,...,230,,,,,75,803,Clouds,broken clouds,04n
2,1514685600,2017-12-31 02:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,11.01,2.37,10.51,11.52,...,220,,,,,75,803,Clouds,broken clouds,04n
3,1514689200,2017-12-31 03:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,9.99,-1.95,9.27,10.58,...,240,,,,,75,803,Clouds,broken clouds,04n
4,1514692800,2017-12-31 04:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,8.60,-5.04,8.00,9.59,...,240,,,,,75,803,Clouds,broken clouds,04n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8843,1546369200,2019-01-01 19:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,6.32,3.56,5.00,8.18,...,10,,,,,75,803,Clouds,broken clouds,04n
8844,1546372800,2019-01-01 20:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,6.06,2.55,5.00,7.68,...,360,,,,,75,803,Clouds,broken clouds,04n
8845,1546376400,2019-01-01 21:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,4.90,1.54,2.78,7.20,...,20,,,,,75,803,Clouds,broken clouds,04n
8846,1546380000,2019-01-01 22:00:00 +0000 UTC,0,Dublin,53.349805,-6.26031,4.21,1.69,2.00,6.99,...,0,,,,,75,803,Clouds,broken clouds,04n


In [4]:
# List the column labels available in the raw frame.
weatherDF.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'feels_like', 'temp_min', 'temp_max', 'pressure', 'sea_level',
       'grnd_level', 'humidity', 'wind_speed', 'wind_deg', 'rain_1h',
       'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all', 'weather_id',
       'weather_main', 'weather_description', 'weather_icon'],
      dtype='object')

### The weather dataset has the following columns (explanations still to be filled in from the OpenWeatherMap documentation)
- 'dt':
- 'dt_iso':
- 'timezone':
- 'city_name':
- 'lat':
- 'lon':
- 'temp':
- 'feels_like':
- 'temp_min':
- 'temp_max':
- 'pressure':
- 'sea_level':
- 'grnd_level':
- 'humidity':
- 'wind_speed':
- 'wind_deg':
- 'rain_1h':
- 'rain_3h':
- 'snow_1h':
- 'snow_3h':
- 'clouds_all':
- 'weather_id':
- 'weather_main':
- 'weather_description':
- 'weather_icon':

### Columns to be dropped on first observation:
- dt: we can use dt_iso to identify the datetime of the weather.
- timezone, city_name, lat, lon: we are only looking at one location and one timezone, so it is not necessary to distinguish these values
- feels_like, temp_min, temp_max: for simplicity's sake, we will just use temp
- pressure, sea_level, grnd_level, wind_deg, clouds_all: dropped for simplicity in the prediction models
- rain_3h, snow_3h: weather forecast data goes to just one hour ahead, and it will be simpler to predict with just rain data per hour.
- weather_id, weather_icon: not needed for prediction models

In [5]:
# Columns judged unnecessary on first inspection (see the list above).
unusedColumns = [
    'dt', 'timezone', 'city_name', 'lat', 'lon',
    'feels_like', 'temp_min', 'temp_max',
    'pressure', 'sea_level', 'grnd_level', 'clouds_all', 'wind_deg',
    'rain_3h', 'snow_3h',
    'weather_id', 'weather_icon',
]
# Drop them in place so later cells keep working on the same weatherDF object.
weatherDF.drop(columns=unusedColumns, inplace=True)

### Examine the data types for the weather dataframe

In [6]:
# Check which dtype pandas assigned to each remaining column.
weatherDF.dtypes

dt_iso                  object
temp                   float64
humidity                 int64
wind_speed             float64
rain_1h                float64
snow_1h                float64
weather_main            object
weather_description     object
dtype: object

- We need to convert dt_iso to datetime

In [7]:
# Preview the frame after the unused columns were dropped.
weatherDF

Unnamed: 0,dt_iso,temp,humidity,wind_speed,rain_1h,snow_1h,weather_main,weather_description
0,2017-12-31 00:00:00 +0000 UTC,9.91,93,4.63,,,Clouds,broken clouds
1,2017-12-31 01:00:00 +0000 UTC,11.09,82,13.38,,,Clouds,broken clouds
2,2017-12-31 02:00:00 +0000 UTC,11.01,76,11.32,,,Clouds,broken clouds
3,2017-12-31 03:00:00 +0000 UTC,9.99,62,14.92,,,Clouds,broken clouds
4,2017-12-31 04:00:00 +0000 UTC,8.60,61,16.98,,,Clouds,broken clouds
...,...,...,...,...,...,...,...,...
8843,2019-01-01 19:00:00 +0000 UTC,6.32,86,2.10,,,Clouds,broken clouds
8844,2019-01-01 20:00:00 +0000 UTC,6.06,86,3.10,,,Clouds,broken clouds
8845,2019-01-01 21:00:00 +0000 UTC,4.90,86,2.60,,,Clouds,broken clouds
8846,2019-01-01 22:00:00 +0000 UTC,4.21,93,1.50,,,Clouds,broken clouds


### Now that we have removed the unnecessary columns, we need to convert the dt_iso column to a datetime format that is consistent with the DAYOFSERVICE column in the historical bus data

In [8]:
# Parse the raw timestamp strings (e.g. '2017-12-31 00:00:00 +0000 UTC').
# %z consumes the numeric offset, so the parsed values are timezone-aware.
parsedTimes = pd.to_datetime(weatherDF['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC')
# Strip the timezone while keeping the wall-clock time, giving naive datetimes.
# This replaces the earlier strftime -> to_datetime round-trip through strings,
# which produced the same values with an extra formatting and parsing pass.
weatherDF['dt_iso'] = parsedTimes.dt.tz_localize(None)

In [9]:
# Confirm that dt_iso now displays without the '+0000 UTC' suffix.
weatherDF

Unnamed: 0,dt_iso,temp,humidity,wind_speed,rain_1h,snow_1h,weather_main,weather_description
0,2017-12-31 00:00:00,9.91,93,4.63,,,Clouds,broken clouds
1,2017-12-31 01:00:00,11.09,82,13.38,,,Clouds,broken clouds
2,2017-12-31 02:00:00,11.01,76,11.32,,,Clouds,broken clouds
3,2017-12-31 03:00:00,9.99,62,14.92,,,Clouds,broken clouds
4,2017-12-31 04:00:00,8.60,61,16.98,,,Clouds,broken clouds
...,...,...,...,...,...,...,...,...
8843,2019-01-01 19:00:00,6.32,86,2.10,,,Clouds,broken clouds
8844,2019-01-01 20:00:00,6.06,86,3.10,,,Clouds,broken clouds
8845,2019-01-01 21:00:00,4.90,86,2.60,,,Clouds,broken clouds
8846,2019-01-01 22:00:00,4.21,93,1.50,,,Clouds,broken clouds


### We will also rename the dt_iso column to 'DAYHOUR', so it can be joined with the corresponding column in the historical bus data

In [10]:
# Give the timestamp column the name used for the join with the bus data.
columnRenames = {'dt_iso': 'DAYHOUR'}
weatherDF.rename(columns=columnRenames, inplace=True)

### Is there any data for snow in the dataframe at all?

In [11]:
# Show the hours that have a snow_1h reading, in chronological order.
hasSnow = weatherDF['snow_1h'].notna()
weatherDF.loc[hasSnow].sort_values('DAYHOUR')

Unnamed: 0,DAYHOUR,temp,humidity,wind_speed,rain_1h,snow_1h,weather_main,weather_description
406,2018-01-16 19:00:00,0.24,93,8.75,,0.24,Snow,light shower snow
890,2018-02-05 23:00:00,0.27,93,3.60,,0.39,Snow,light snow
891,2018-02-06 00:00:00,0.19,93,3.60,,0.55,Snow,light snow
892,2018-02-06 01:00:00,-0.25,93,2.57,,0.52,Snow,light snow
1418,2018-02-27 23:00:00,-1.46,92,5.10,,0.56,Snow,light shower snow
...,...,...,...,...,...,...,...,...
1866,2018-03-18 10:00:00,-0.80,93,11.32,,0.35,Snow,light shower snow
1867,2018-03-18 11:00:00,-0.79,100,11.32,,0.41,Snow,light shower snow
1868,2018-03-18 12:00:00,-0.28,93,13.38,,0.58,Snow,light shower snow
1869,2018-03-18 13:00:00,0.11,93,11.32,,0.31,Snow,light shower snow


### As we have 65 hours of data for snow, this could be useful in a prediction model, so we will not drop the column

## There are some dates not in the year 2018 in this dataset
- we will remove these rows, as they will not be useful for bus times

In [12]:
# Collect the index labels of every row whose timestamp is not in 2018 and
# drop those rows in place.
wrongYears = weatherDF[weatherDF['DAYHOUR'].dt.year != 2018].index
weatherDF.drop(wrongYears, inplace=True)
# reset_index returns a new frame unless inplace=True. The previous bare call
# only displayed a renumbered copy and left weatherDF's own index unchanged,
# so reset in place and then display the frame that is actually stored.
weatherDF.reset_index(drop=True, inplace=True)
weatherDF

Unnamed: 0,DAYHOUR,temp,humidity,wind_speed,rain_1h,snow_1h,weather_main,weather_description
0,2018-01-01 00:00:00,4.15,87,12.86,0.3,,Rain,light intensity shower rain
1,2018-01-01 01:00:00,4.14,87,11.83,0.3,,Rain,light intensity shower rain
2,2018-01-01 02:00:00,4.61,81,12.35,,,Clouds,scattered clouds
3,2018-01-01 03:00:00,4.64,81,12.35,,,Clouds,scattered clouds
4,2018-01-01 04:00:00,5.04,81,11.83,,,Clouds,scattered clouds
...,...,...,...,...,...,...,...,...
8795,2018-12-31 19:00:00,9.65,76,4.10,,,Clouds,broken clouds
8796,2018-12-31 20:00:00,9.27,81,4.10,,,Clouds,broken clouds
8797,2018-12-31 21:00:00,9.31,81,5.10,,,Clouds,broken clouds
8798,2018-12-31 22:00:00,9.19,81,5.70,,,Clouds,broken clouds


### There are 8760 hours in one year, but there are 8800 entries in this dataframe
- We will check if there are duplicate entries for the DAYHOUR column

In [16]:
# Flag every repeat of a DAYHOUR value after its first occurrence and keep
# the index labels of those rows for removal.
isRepeatedHour = weatherDF['DAYHOUR'].duplicated(keep='first')
duplicates = weatherDF.loc[isRepeatedHour].index

### As we can see there are 40 duplicate entries for the DAYHOUR column that just differ in the weather_main and weather_description columns.
- As we are mainly using numerical values to predict the arrival times of buses, we will drop the duplicate rows
    - Otherwise more entries will be created when merging the weather dataframe with the leavetimes dataframe, which will skew the data

In [17]:
# Remove the repeated-hour rows collected in `duplicates`, then report how
# many rows remain.
weatherDF.drop(index=duplicates, inplace=True)
len(weatherDF)

8760

### Save this dataframe as a csv to merge with the historical bus data

In [18]:
# Write the cleaned frame to disk. The index is still written, but
# index_label=False leaves its header field out of the first line.
weatherDF.to_csv(path_or_buf='weatherData_CLEANED.csv', index_label=False)