In [2]:
import kaggle
import zipfile
import pandas as pd

In [3]:
# Download the London bike-sharing dataset archive with the Kaggle CLI
# (IPython shell escape; the captured output shows london-bike-sharing-dataset.zip being fetched).
!kaggle datasets download -d hmavrodiev/london-bike-sharing-dataset

Downloading london-bike-sharing-dataset.zip to d:\Share\DA\data-analytics-projects\data-visualization\bikes-london




  0%|          | 0.00/165k [00:00<?, ?B/s]
100%|██████████| 165k/165k [00:00<00:00, 354kB/s]
100%|██████████| 165k/165k [00:00<00:00, 353kB/s]


In [None]:
# Extract zip file
# `extractall()` is called without a target path, so the archive members are
# unpacked into the current working directory.
zip_file = "london-bike-sharing-dataset.zip"
# Pass the archive *path* (`zip_file`) to ZipFile -- not the `zipfile` module itself,
# which is not a valid file path or file-like object.
with zipfile.ZipFile(zip_file, 'r') as archive:
    archive.extractall()

In [6]:
# Load the CSV file into the `dataset` DataFrame
dataset_csv = "london_merged.csv"
dataset = pd.read_csv(filepath_or_buffer=dataset_csv)

In [7]:
# Show the column names, non-null counts, dtypes and memory usage of the loaded frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


In [8]:
# Number of duplicated records (rows flagged by `duplicated()` are summed as booleans)
dataset.duplicated().sum()

0

In [9]:
# Convert the timestamp column to the datetime datatype, then derive the calendar year from it
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
dataset['year'] = dataset['timestamp'].dt.year

# Convert time in hour to day periods
def convertDayPeriod(hour):
    """Return the name of the day period that contains ``hour``.

    Each range is half-open (start inclusive, end exclusive):

    * 5 <= hour < 9   -> 'early morning'
    * 9 <= hour < 12  -> 'morning'
    * 12 <= hour < 18 -> 'noon'
    * 18 <= hour < 22 -> 'evening'

    Any value that falls in none of these ranges yields 'night'.
    """
    if 5 <= hour < 9:
        return 'early morning'
    if 9 <= hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'noon'
    if 18 <= hour < 22:
        return 'evening'
    # Fallback for everything outside the ranges above
    return 'night'
    
# Label each record with its day period, computed from the hour of the timestamp
dataset['day_period'] = dataset['timestamp'].dt.hour.apply(convertDayPeriod)

# Weekday name (e.g. 'Sunday') taken from the timestamp
dataset['day_of_week'] = dataset['timestamp'].dt.day_name()

In [10]:
# dataset.drop('timestamp', axis=1, inplace=True)

In [11]:
# Replace the terse source column names with descriptive ones
cols_renamed_dict = {
    'timestamp': 'time',
    'cnt': 'count',
    't1': 'real_temp',
    't2': 'feel_like_temp',
    'hum': 'humidity',
}

# `columns=` is the keyword form of `mapper=..., axis=1`; the frame is still modified in place
dataset.rename(columns=cols_renamed_dict, inplace=True)

In [12]:
# Frequency of each distinct weather code and season code, printed under a tab-indented heading
weather_counts = dataset['weather_code'].value_counts()
season_counts = dataset['season'].value_counts()

print(f"\tweather codes\n{weather_counts}\n")
print(f"\tseason codes\n{season_counts}")

	weather codes
1.0     6150
2.0     4034
3.0     3551
7.0     2141
4.0     1464
26.0      60
10.0      14
Name: weather_code, dtype: int64

	season codes
0.0    4394
1.0    4387
3.0    4330
2.0    4303
Name: season, dtype: int64


In [13]:
# Translate the numeric category codes into readable labels

# weather_code value -> description
weather_val_dict = {
    1.0: 'clear',
    2.0: 'scattered cloud',
    3.0: 'broken cloud',
    4.0: 'cloudy',
    7.0: 'rain',
    10.0: 'rain with thunderstorm',
    26.0: 'snowfall',
}

# season value -> season name
season_val_dict = {
    0.0: 'spring',
    1.0: 'summer',
    2.0: 'fall',
    3.0: 'winter',
}

# `map` looks every value up in the dictionary; values without an entry become NaN
dataset['weather_code'] = dataset['weather_code'].map(weather_val_dict)
dataset['season'] = dataset['season'].map(season_val_dict)

In [14]:
# Rescale humidity from a percentage to a fraction by dividing by 100
dataset['humidity'] = dataset['humidity'].div(100)

In [15]:
# Reorder columns: derived calendar fields first, then the measurements and flags, timestamp last
column_order = [
    'year', 'season', 'day_of_week', 'day_period',
    'count', 'real_temp', 'feel_like_temp', 'humidity', 'wind_speed',
    'weather_code', 'is_holiday', 'is_weekend',
    'time',
]
dataset = dataset[column_order]

In [16]:
# Display the transformed, reordered DataFrame as the cell output
dataset

Unnamed: 0,year,season,day_of_week,day_period,count,real_temp,feel_like_temp,humidity,wind_speed,weather_code,is_holiday,is_weekend,time
0,2015,winter,Sunday,night,182,3.0,2.0,0.930,6.0,broken cloud,0.0,1.0,2015-01-04 00:00:00
1,2015,winter,Sunday,night,138,3.0,2.5,0.930,5.0,clear,0.0,1.0,2015-01-04 01:00:00
2,2015,winter,Sunday,night,134,2.5,2.5,0.965,0.0,clear,0.0,1.0,2015-01-04 02:00:00
3,2015,winter,Sunday,night,72,2.0,2.0,1.000,0.0,clear,0.0,1.0,2015-01-04 03:00:00
4,2015,winter,Sunday,night,47,2.0,0.0,0.930,6.5,clear,0.0,1.0,2015-01-04 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017,winter,Tuesday,evening,1042,5.0,1.0,0.810,19.0,broken cloud,0.0,0.0,2017-01-03 19:00:00
17410,2017,winter,Tuesday,evening,541,5.0,1.0,0.810,21.0,cloudy,0.0,0.0,2017-01-03 20:00:00
17411,2017,winter,Tuesday,evening,337,5.5,1.5,0.785,24.0,cloudy,0.0,0.0,2017-01-03 21:00:00
17412,2017,winter,Tuesday,night,224,5.5,1.5,0.760,23.0,cloudy,0.0,0.0,2017-01-03 22:00:00


In [17]:
# Export the final DataFrame to an Excel workbook, on a sheet named "Data"
# (the row index is written too, since `index` is left at its default)
dataset.to_excel(
    excel_writer="london_bikes_final.xlsx",
    sheet_name="Data",
)