In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Constants
# Relative path to the raw dataset CSV; it is resolved against the notebook's working directory.
RAW_DATASET_PATH = '../../data/raw/SeoulBikeData_MP1.csv'

### Utility Methods

In [86]:
def get_range_for_column(dataframe, column_label):
    """
    Return the smallest and the largest value found in one column of a dataframe.

    The column is expected to hold a data type for which min/max make sense.
    :param dataframe: `pandas.DataFrame` dataframe object
    :param column_label: the string label for the column
    :return: two values, the column minimum followed by the column maximum
    """
    column = dataframe[column_label]
    lowest = column.min()
    highest = column.max()
    return lowest, highest

In [87]:
def get_null_values_for_column(dataframe, column_label):
    """
    Count how many entries of a single column are null (NaN / None / NaT).
    :param dataframe: `pandas.DataFrame` dataframe object
    :param column_label: the string label for the column
    :return: a non-negative integer, the number of null entries in the column
    """
    # isna() yields a boolean mask; summing it counts the True (missing) entries
    is_missing = dataframe[column_label].isna()
    return is_missing.sum()

In [99]:
# Load the raw CSV into a DataFrame; every later cell works on this `dataset` object.
# NOTE(review): the file appears to carry a previously saved index, which shows up as the
# 'Unnamed: 0' column - consider reading with index_col=0 (TODO confirm against the CSV).
dataset = pd.read_csv(RAW_DATASET_PATH)

In [100]:
# Preview the loaded frame; pandas elides the middle rows when it renders a large frame.
dataset

Unnamed: 0.1,Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,01/12/17,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,1,01/12/17,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2,01/12/17,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,3,01/12/17,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,4,01/12/17,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,8755,30/11/18,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,8756,30/11/18,764,20,,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,8757,30/11/18,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,8758,30/11/18,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


In [90]:
# Per-column dtype and non-null count; a non-null count below the row count flags missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                8760 non-null   int64  
 1   Date                      8760 non-null   object 
 2   Rented Bike Count         8760 non-null   int64  
 3   Hour                      8760 non-null   int64  
 4   Temperature(C)            8260 non-null   float64
 5   Humidity(%)               8760 non-null   int64  
 6   Wind speed (m/s)          8760 non-null   float64
 7   Visibility (10m)          8760 non-null   int64  
 8   Dew point temperature(C)  8760 non-null   float64
 9   Solar Radiation (MJ/m2)   8760 non-null   float64
 10  Rainfall(mm)              8760 non-null   float64
 11  Snowfall (cm)             8760 non-null   float64
 12  Seasons                   8760 non-null   object 
 13  Holiday                   8760 non-null   object 
 14  Function

In [98]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0.1,Unnamed: 0,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,8760.0,8760.0,8760.0,8260.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,4379.5,704.602055,11.5,12.877349,58.226256,1.724909,1436.825799,4.073813,0.569111,0.148687,0.075068
std,2528.938512,644.997468,6.922582,11.957556,20.362413,1.0363,608.298712,13.060369,0.868746,1.128193,0.436746
min,0.0,0.0,0.0,-17.5,0.0,0.0,27.0,-30.6,0.0,0.0,0.0
25%,2189.75,191.0,5.75,3.5,42.0,0.9,940.0,-4.7,0.0,0.0,0.0
50%,4379.5,504.5,11.5,13.75,57.0,1.5,1698.0,5.1,0.01,0.0,0.0
75%,6569.25,1065.25,17.25,22.5,74.0,2.3,2000.0,14.8,0.93,0.0,0.0
max,8759.0,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,8.8


**Verify what's the first and last date value**

In [91]:
# Parse the day-first, two-digit-year strings into real datetimes so that min/max compare
# chronologically instead of lexicographically.
dataset['Date'] = pd.to_datetime(dataset['Date'], format="%d/%m/%y")
# Reuse the shared column helpers rather than repeating the min/max/isna logic inline.
start_date, end_date = get_range_for_column(dataframe=dataset, column_label='Date')
print("Dates are from {} to {}".format(start_date, end_date))
print("Number of null fields {}".format(get_null_values_for_column(dataframe=dataset, column_label='Date')))

Dates are from 2017-12-01 00:00:00 to 2018-11-30 00:00:00
Number of null fields 0


**Verify data about the Rented Bike count Column**

In [92]:
# Range and missing-value check for the target column, via the shared column helpers
# (same logic as before, without duplicating the min/max/isna calls inline).
min_rented_bike_count, max_rented_bike_count = get_range_for_column(dataframe=dataset, column_label='Rented Bike Count')
print("Rented Bike count ranges from {} to {}".format(min_rented_bike_count, max_rented_bike_count))
print("Number of null fields {}".format(get_null_values_for_column(dataframe=dataset, column_label='Rented Bike Count')))

Rented Bike count ranges from 0 to 3556
Number of null fields 0


**Verify values for Temperature column**

In [93]:
# Range and missing-value check for temperature, via the shared column helpers.
# min()/max() skip NaN by default, so the range reflects only the non-null readings.
min_temperature_celsius, max_temperature_celsius = get_range_for_column(dataframe=dataset, column_label='Temperature(C)')
# The message previously read "degreec C" for the lower bound; fixed to "degrees C".
print("Temperature ranges from {} degrees C to {} degrees C".format(min_temperature_celsius, max_temperature_celsius))
print("Number of null fields {}".format(get_null_values_for_column(dataframe=dataset, column_label='Temperature(C)')))

Temperature ranges from -17.5 degreec C to 39.4 degrees C
Number of null fields 500


**Verify values for Humidity column**

In [96]:
# Range and missing-value check for humidity, computed up front and then reported.
humidity_label = 'Humidity(%)'
min_humidity, max_humidity = get_range_for_column(dataframe=dataset, column_label=humidity_label)
humidity_null_count = get_null_values_for_column(dataframe=dataset, column_label=humidity_label)
print("Humidity ranges from {}% to {}%".format(min_humidity, max_humidity))
print("Number of null values {}".format(humidity_null_count))

Humidity ranges from 0% to 98%
Number of null values 0


### Dataset description
Based on the above, the dataset has 8760 rows and 14 attributes (ignoring the serial-number column)

1. Date - Format is dd/mm/yy starts from 01/12/17 and goes till 30/11/18, none of the values are null
2. Rented Bike count - Number of bikes that were rented in a given hour, ranges from 0 to 3556, none of the values are null
3. Hour - A categorical value ranging from 0 to 23 for each day, none of the values are null
4. Temperature - Temperature value in Celsius. 500 values are null (8260 of 8760 are present), so we might need to do some imputation for this. Ranges from -17.5 to 39.4, quite the range!
5. Humidity - Relative humidity (in %) for a given hour, ranges from 0% to 98%. None of the values are null
6. Wind speed - Number denoting wind speed in m/s. Ranges from 0 to 7.4
7. Visibility - Recorded in units of 10 m, so the values*10 give the actual visibility in meters. Ranges from 270 meters to 20000 meters. None of the values are null
8. Dew Point Temperature - the temperature to which a parcel of moist air must be cooled at constant atmospheric pressure and constant water vapour content in order for saturation to occur. Ranges from -30.60 to 27.20 degrees celsius. None of the values are null
9. Solar radiation - MJ/m^2 (Mega Joule/m^2). None of the values are null, ranges from 0 to 3.52
10. Rainfall - In millimeters. None of the values are null. Ranges from 0 to 35mm
11. Snowfall - In centimeters. None of the values are null. Ranges from 0 to 8.8cm
12. Seasons - Categorical Variable. Values in (Winter, Spring, Summer, Autumn)
13. Holiday - Categorical Variable. Values in (No Holiday, Holiday)
14. Functioning Day - Categorical Variable. Values in (Yes, No). Seen for each hour. None of the values are null

### Correlation of different numeric values with rented bike count

In [102]:
# Pearson correlation of every numeric column with the target.
# numeric_only=True keeps the text columns (Seasons, Holiday, ...) out of the computation:
# pandas >= 2.0 no longer drops them silently and raises on non-numeric data instead.
# (The numeric_only argument of DataFrame.corr requires pandas >= 1.5.)
dataset.corr(numeric_only=True)['Rented Bike Count']

Unnamed: 0                  0.341894
Rented Bike Count           1.000000
Hour                        0.410257
Temperature(C)              0.540538
Humidity(%)                -0.199780
Wind speed (m/s)            0.121108
Visibility (10m)            0.199280
Dew point temperature(C)    0.379788
Solar Radiation (MJ/m2)     0.261837
Rainfall(mm)               -0.123074
Snowfall (cm)              -0.141804
Name: Rented Bike Count, dtype: float64

**Interesting observations**

1. We know that the hour ranges from 0 to 23, and it shows a moderate positive correlation (about 0.41) with rented bike count
2. Temperature is positively correlated with Rented Bike Count as well (about 0.54), the strongest correlation among the numeric columns