In [2]:
import pandas as pd
import numpy as np
from numpy import average
import matplotlib.pyplot as plt 
import matplotlib.mlab as mlab
import matplotlib.patches as mpatches
import seaborn as sb
import datetime as dt
import glob
import json
from collections import Counter

# called to counteract dark-mode Jupyter from swallowing axes &c.
plt.style.use('seaborn-whitegrid')

In [3]:
# Source citation: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', 
# Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg (http://dx.doi.org/10.1007/s13748-013-0040-3) 
# (accessed: 15 Mar 2022)

# Import the core datasets
daily_core = pd.read_csv('data/uci_data/day.csv')
hourly_core = pd.read_csv('data/uci_data/hour.csv')

<hr>

A fly-by view of the data. Looking for things that poke out.

In [4]:
daily_core.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [5]:
hourly_core.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [6]:
daily_core.head().T

Unnamed: 0,0,1,2,3,4
instant,1,2,3,4,5
dteday,2011-01-01,2011-01-02,2011-01-03,2011-01-04,2011-01-05
season,1,1,1,1,1
yr,0,0,0,0,0
mnth,1,1,1,1,1
holiday,0,0,0,0,0
weekday,6,0,1,2,3
workingday,0,0,1,1,1
weathersit,2,2,1,1,1
temp,0.344167,0.363478,0.196364,0.2,0.226957


In [7]:
hourly_core.head().T

Unnamed: 0,0,1,2,3,4
instant,1,2,3,4,5
dteday,2011-01-01,2011-01-01,2011-01-01,2011-01-01,2011-01-01
season,1,1,1,1,1
yr,0,0,0,0,0
mnth,1,1,1,1,1
hr,0,1,2,3,4
holiday,0,0,0,0,0
weekday,6,6,6,6,6
workingday,0,0,0,0,0
weathersit,1,1,1,1,1


In [8]:
daily_core.shape

(731, 16)

In [9]:
hourly_core.shape

(17379, 17)

In [10]:
daily_core.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,731.0,366.0,211.165812,1.0,183.5,366.0,548.5,731.0
season,731.0,2.49658,1.110807,1.0,2.0,3.0,3.0,4.0
yr,731.0,0.500684,0.500342,0.0,0.0,1.0,1.0,1.0
mnth,731.0,6.519836,3.451913,1.0,4.0,7.0,10.0,12.0
holiday,731.0,0.028728,0.167155,0.0,0.0,0.0,0.0,1.0
weekday,731.0,2.997264,2.004787,0.0,1.0,3.0,5.0,6.0
workingday,731.0,0.683995,0.465233,0.0,0.0,1.0,1.0,1.0
weathersit,731.0,1.395349,0.544894,1.0,1.0,1.0,2.0,3.0
temp,731.0,0.495385,0.183051,0.05913,0.337083,0.498333,0.655417,0.861667
atemp,731.0,0.474354,0.162961,0.07907,0.337842,0.486733,0.608602,0.840896


In [11]:
hourly_core.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,17379.0,8690.0,5017.0295,1.0,4345.5,8690.0,13034.5,17379.0
season,17379.0,2.50164,1.106918,1.0,2.0,3.0,3.0,4.0
yr,17379.0,0.502561,0.500008,0.0,0.0,1.0,1.0,1.0
mnth,17379.0,6.537775,3.438776,1.0,4.0,7.0,10.0,12.0
hr,17379.0,11.546752,6.914405,0.0,6.0,12.0,18.0,23.0
holiday,17379.0,0.02877,0.167165,0.0,0.0,0.0,0.0,1.0
weekday,17379.0,3.003683,2.005771,0.0,1.0,3.0,5.0,6.0
workingday,17379.0,0.682721,0.465431,0.0,0.0,1.0,1.0,1.0
weathersit,17379.0,1.425283,0.639357,1.0,1.0,1.0,2.0,4.0
temp,17379.0,0.496987,0.192556,0.02,0.34,0.5,0.66,1.0


In [12]:
daily_core.nunique(axis=0)

instant       731
dteday        731
season          4
yr              2
mnth           12
holiday         2
weekday         7
workingday      2
weathersit      3
temp          499
atemp         690
hum           595
windspeed     650
casual        606
registered    679
cnt           696
dtype: int64

In [13]:
hourly_core.nunique(axis=0)

instant       17379
dteday          731
season            4
yr                2
mnth             12
hr               24
holiday           2
weekday           7
workingday        2
weathersit        4
temp             50
atemp            65
hum              89
windspeed        30
casual          322
registered      776
cnt             869
dtype: int64

Whoops, what's up with *weathersit*? There is one missing in the daily set.

In [14]:
print(daily_core['weathersit'].unique())
print(hourly_core['weathersit'].unique())

[2 1 3]
[1 2 3 4]


In [15]:
daily_core.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [16]:
hourly_core.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [17]:
daily_core.duplicated().sum()

0

In [18]:
hourly_core.duplicated().sum()

0

In [19]:
daily_core.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [20]:
hourly_core.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

Apparently there are no null values, but there **are** values missing / non-existant.

Can I impute the missing values?<br />
For the missing data in *hourly_core*. The missing hourly entries could be filled. Date & hour can be sequenced, temperature and windspeed filled in from the weather site the UCI researchers used (which by the way takes its data from the weather station at Reagan National Airport located in the middle of Washington D.C. and also supplies NOAA).<br />
The number of bike riders though cannot be imputed because at least one range of missing data covers an extreme weather  event (there is only one entry for 2012-10-29 [Hurricane Sandy]).

In [21]:
# How much missing data is there in the hourly and daily data set?
print("Number of hourly entries needed:",((4*30+7*31)*2+28+29)*24, 
        "Number of hourly entries missing:", ((4*30+7*31)*2+28+29)*24 - hourly_core.instant.count())
print("Number of daily entries needed:",((4*30+7*31)*2+28+29), 
        "Number of daily entries missing:", ((4*30+7*31)*2+28+29) - daily_core.instant.count())


Number of hourly entries needed: 17544 Number of hourly entries missing: 165
Number of daily entries needed: 731 Number of daily entries missing: 0


<hr>

Little bit of data maintenance.

In [22]:
# Rename columns
hourly_core.columns = ['instant', 'date_day', 'season', 'year', 'month', 'hour', 'is_holiday', 'weekday', 'is_workday', 'weather', 'temp', 'temp_feel', 'humidity', 'windspeed', 'casual', 'registered', 'total_riders']
daily_core.columns = ['instant', 'date_day', 'season', 'year', 'month', 'is_holiday', 'weekday', 'is_workday', 'weather', 'temp', 'temp_feel', 'humidity', 'windspeed', 'casual', 'registered', 'total_riders']


In [23]:
# Convert certain numerical columns to categories with more human-friendly content
# note: the mapping of seasons is wrong both in the readme file (winter=1) and on the data set website (spring=1), 
# seasons were remapped according to WMO conventions (see note).

hourly_core['season'] = hourly_core.month.map({
        1:'winter', 2:'winter', 
        3:'spring', 4:'spring', 5:'spring', 
        6:'summer', 7:'summer', 8:'summer', 
        9:'autumn', 10:'autumn', 11:'autumn', 
        12:'winter'
    }).astype('category')
hourly_core['weekday'] = hourly_core.weekday.map({
        1:'monday', 2:'tuesday', 3:'wednesday', 4:'thursday', 5:'friday', 6:'saturday', 0:'sunday'
    }).astype('category')
hourly_core['is_holiday'] = hourly_core.is_holiday.map({0:'no', 1:'yes'}).astype('category')
hourly_core['weather_desc'] = hourly_core.weather.map({1:'clear', 2:'foggy', 3:'inclement', 4:'crap'}).astype('category')
hourly_core['year'] = hourly_core.year.map({0:2011, 1:2012}).astype('category')

daily_core['season'] = daily_core.month.map({
        1:'winter', 2:'winter', 
        3:'spring', 4:'spring', 5:'spring', 
        6:'summer', 7:'summer', 8:'summer', 
        9:'autumn', 10:'autumn', 11:'autumn', 
        12:'winter'}).astype('category')
daily_core['weekday'] = daily_core.weekday.map({
        1:'monday', 2:'tuesday', 3:'wednesday', 4:'thursday', 5:'friday', 6:'saturday', 0:'sunday'
    }).astype('category')
daily_core['is_holiday'] = daily_core.is_holiday.map({0:'no', 1:'yes'}).astype('category')
daily_core['weather_desc'] = daily_core.weather.map({1:'clear', 2:'foggy', 3:'inclement', 4:'crap'}).astype('category')
daily_core['year'] = daily_core.year.map({0:2011, 1:2012}).astype('category')


In [24]:
# Add a time of the day column (morning (06-12), afternoon (12-18), evening (18-00), night (00-06))
hourly_core['daytime'] = hourly_core.hour.apply(lambda value: "night" 
                                            if value <= 6 else "morning"
                                            if value <= 12 else "afternoon"
                                            if value <= 18 else "evening").astype('category')


In [25]:
# Convert time and date to timestamps
hourly_core['hour'] = pd.to_datetime(hourly_core['hour'], format='%H').dt.time
hourly_core['date_day'] = pd.to_datetime(hourly_core['date_day']).dt.date

daily_core['date_day'] = pd.to_datetime(daily_core['date_day']).dt.date

# Combine time and date into an extra column
hourly_core['date_time'] = hourly_core.apply(lambda r : dt.datetime.combine(r['date_day'],r['hour']),1)
daily_core['date_time'] = daily_core['date_day']


In [26]:
daily_core.dtypes

instant            int64
date_day          object
season          category
year            category
month              int64
is_holiday      category
weekday         category
is_workday         int64
weather            int64
temp             float64
temp_feel        float64
humidity         float64
windspeed        float64
casual             int64
registered         int64
total_riders       int64
weather_desc    category
date_time         object
dtype: object

In [27]:
hourly_core.dtypes

instant                  int64
date_day                object
season                category
year                  category
month                    int64
hour                    object
is_holiday            category
weekday               category
is_workday               int64
weather                  int64
temp                   float64
temp_feel              float64
humidity               float64
windspeed              float64
casual                   int64
registered               int64
total_riders             int64
weather_desc          category
daytime               category
date_time       datetime64[ns]
dtype: object

Alternative method to find out how many records are missing, saves you from counting.

In [28]:
# And set the datetime column as index
hourly_core = hourly_core.set_index(['date_time'])
daily_core = daily_core.set_index(['date_time'])

In [29]:
temp = (pd.date_range(start="2011-01-01", end="2012-12-31", freq='1H').difference(hourly_core.index)).to_frame(index=False, name='hr_missing')
print("Number of hourly records missing:", temp.count())

temp = (pd.date_range(start="2011-01-01", end="2012-12-31", freq='1d').difference(daily_core.index)).to_frame(index=False, name='day_missing')
print("Number of daily records missing:", temp.count())

Number of hourly records missing: hr_missing    165
dtype: int64
Number of daily records missing: day_missing    0
dtype: int64


We could drop some unnecessary columns:
* date_time combines yr, mnth, day, hr
* workingday == NOT is_holiday OR saturday/sunday
* instant => date_time index takes over

but they are not really bothering us, and we may be able to use the split date more easily.<br />
Maybe drop them later, if performance is an issue.

df = df.drop(columns=['col1', 'col2'])

In [32]:
# De-normalise the values for temperature, apparent temperature, humidity, 
# and windspeed
# x = x_norm * (max_y - min_y) + min_y
hourly_core['temp'] = hourly_core.temp.apply(lambda x: ((x * (39 - -8)) + -8))
hourly_core['temp_feel'] = hourly_core.temp_feel.apply(lambda x: ((x * (50 - -16)) + -16))
hourly_core['humidity'] = hourly_core.humidity.apply(lambda x: (x * 100))
hourly_core['windspeed'] = hourly_core.windspeed.apply(lambda x: (x * 67))

daily_core['temp'] = daily_core.temp.apply(lambda x: ((x * (39 - -8)) + -8)) # (x*47-8)
daily_core['temp_feel'] = daily_core.temp_feel.apply(lambda x: ((x * (50 - -16)) + -16))
daily_core['humidity'] = daily_core.humidity.apply(lambda x: x * 100)
daily_core['windspeed'] = daily_core.windspeed.apply(lambda x: x * 67)


In [33]:
hourly_core.dtypes

instant            int64
date_day          object
season          category
year            category
month              int64
hour              object
is_holiday      category
weekday         category
is_workday         int64
weather            int64
temp             float64
temp_feel        float64
humidity         float64
windspeed        float64
casual             int64
registered         int64
total_riders       int64
weather_desc    category
daytime         category
dtype: object

Note that date_day and hour are still object types and not datetime even though they were converted to timestamps.
Why?

In [34]:
# Dataframe of all missing hourly values
# Datetime as index
missing_hours = pd.date_range(start="2011-01-01", end="2012-12-31", freq='1H').difference(hourly_core.index)
missing_hours = missing_hours.to_frame(index=False, name='hr_missing')

missing_hours['date'] = pd.to_datetime(missing_hours['hr_missing']).dt.date
missing_hours['time'] = pd.to_datetime(missing_hours['hr_missing']).dt.time

missing_hours = missing_hours.set_index(['hr_missing'])

In [35]:
missing_hours.head()

Unnamed: 0_level_0,date,time
hr_missing,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-02 05:00:00,2011-01-02,05:00:00
2011-01-03 02:00:00,2011-01-03,02:00:00
2011-01-03 03:00:00,2011-01-03,03:00:00
2011-01-04 03:00:00,2011-01-04,03:00:00
2011-01-05 03:00:00,2011-01-05,03:00:00


In [37]:
# Get the records of a specific date
filtered_df = hourly_core.loc['2011-01-02']
filtered_df

Unnamed: 0_level_0,instant,date_day,season,year,month,hour,is_holiday,weekday,is_workday,weather,temp,temp_feel,humidity,windspeed,casual,registered,total_riders,weather_desc,daytime
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-02 00:00:00,25,2011-01-02,winter,2011,1,00:00:00,no,sunday,0,2,13.62,13.997,88.0,19.9995,4,13,17,foggy,night
2011-01-02 01:00:00,26,2011-01-02,winter,2011,1,01:00:00,no,sunday,0,2,12.68,13.0004,94.0,16.9979,1,16,17,foggy,night
2011-01-02 02:00:00,27,2011-01-02,winter,2011,1,02:00:00,no,sunday,0,2,11.74,11.9972,100.0,19.0012,1,8,9,foggy,night
2011-01-02 03:00:00,28,2011-01-02,winter,2011,1,03:00:00,no,sunday,0,2,13.62,13.997,94.0,12.998,2,4,6,foggy,night
2011-01-02 04:00:00,29,2011-01-02,winter,2011,1,04:00:00,no,sunday,0,2,13.62,13.997,94.0,12.998,2,1,3,foggy,night
2011-01-02 06:00:00,30,2011-01-02,winter,2011,1,06:00:00,no,sunday,0,3,11.74,11.9972,77.0,19.9995,0,2,2,inclement,night
2011-01-02 07:00:00,31,2011-01-02,winter,2011,1,07:00:00,no,sunday,0,2,10.8,11.0006,76.0,12.998,0,1,1,foggy,morning
2011-01-02 08:00:00,32,2011-01-02,winter,2011,1,08:00:00,no,sunday,0,3,10.8,11.0006,71.0,15.0013,0,8,8,inclement,morning
2011-01-02 09:00:00,33,2011-01-02,winter,2011,1,09:00:00,no,sunday,0,2,9.86,9.9974,76.0,15.0013,1,19,20,foggy,morning
2011-01-02 10:00:00,34,2011-01-02,winter,2011,1,10:00:00,no,sunday,0,2,8.92,7.001,81.0,15.0013,7,46,53,foggy,morning


In [43]:
# Get the records for Hurricane Sandy
# Oct 26: state of emergency declared in the US
# Oct 29-30: all government buildings closed and DC Metro services suspended
filtered_df = hourly_core['2012-10-26':'2012-10-30']
filtered_df.weather.unique()

# For this date range the weather is clear (1), misty (2), and light rain (3).
# Do we agree that a hurricane is at least a 4? Heavy rain and the like?


array([2, 1, 3])

Taking as an example a day where there is only one missing value. If the number of riders (casual, registered, total) is different to the sum of all 23 values present in the hourly data of the day then we can fill in the rider data with the difference and take the weather information from the Washington National Airport weather station.

Result: Checked several days, the difference was always zero. It is unlikely that they all had 0 riders at that time. The sums of the riders in the hourly data from the example day matches the recorded riders from that day's daily record, therefore the daily records were computed using the hourly data with missing records not substituted in any way.

In [None]:
# Example day
filtered_df = hourly_core.loc['2011-01-04']

# Values for that day (2011-01-02): casual: 131, registered: 670, total: 801
# Values for (2011-01-04): 108	1454	1562
print("casuals:", filtered_df.casual.sum())
print("registered:", filtered_df.registered.sum())

In [50]:
# Some things to make categories of: cold, hot, and muggy hours and days
# NOAA typically considers relative humidity (RH) levels of 50% or more, and dewpoints 
# (a more direct measure of humidity) above 65 F (18 C) to be uncomfortably high

hourly_core['cold'] = np.where(hourly_core.temp_feel < 5, 1, 0)
hourly_core['hot'] = np.where(hourly_core.temp_feel > 30, 1, 0)
hourly_core['muggy'] = np.where(hourly_core.humidity >= 50, 1, 0)

daily_core['cold'] = np.where(daily_core.temp_feel < 5, 1, 0)
daily_core['hot'] = np.where(daily_core.temp_feel > 30, 1, 0)
daily_core['muggy'] = np.where(daily_core.humidity >= 50, 1, 0)

In [51]:
hourly_core.reset_index(inplace=True)
hourly_core.to_csv('data/mod_data/hour_mod.csv', index=False)
daily_core.reset_index(inplace=True)
daily_core.to_csv('data/mod_dataday_mod.csv', index=False)