In [1]:
import pandas as pd
import numpy as np
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime as dt
from datetime import datetime
%matplotlib inline
%config InlineBackend.figure_format = 'png'

In [2]:
# List the entries of the current working directory.
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Analysis_Cleaning_Sharing_Bike.ipynb',
 'Data',
 'london_merged_v1.csv',
 'README.md']

### Overview of the Dataset

In [3]:
# Show the current working directory; the relative file paths used in this
# notebook are resolved against it.
os.getcwd()

'D:\\Repositories\\london_bike_sharing'

In [4]:
# Load the raw dataset. Forward slashes keep the path portable across operating
# systems and avoid the invalid "\d" escape sequence that the previous
# Windows-style literal contained. The folder is spelled 'Data', exactly as it
# appears in the directory listing, so the path also works on case-sensitive
# file systems.
df = pd.read_csv('Data/london_merged.csv')

In [5]:
# Display the loaded frame (pandas truncates the view to the first and last rows).
df

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [6]:
# Preview the first five rows.
df.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [7]:
# Inspect the original column names before renaming them.
df.columns

Index(['timestamp', 'cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code',
       'is_holiday', 'is_weekend', 'season'],
      dtype='object')

### Renaming Columns

In [8]:
# Give the terse source columns descriptive names. Renaming through an explicit
# old -> new mapping (instead of assigning a full positional list to
# df.columns) cannot mislabel data if the column order of the CSV ever changes,
# and labels missing from the frame are simply ignored, so the cell can be
# re-run safely.
df = df.rename(columns={
    'cnt': 'number',
    't1': 'temperature',
    't2': 'temperature_feels',
})

In [9]:
# Confirm the new column names.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [10]:
# Preview the datetime conversion; the result is only displayed here, it is not
# stored back into the frame.
pd.to_datetime(df['timestamp'])

0       2015-01-04 00:00:00
1       2015-01-04 01:00:00
2       2015-01-04 02:00:00
3       2015-01-04 03:00:00
4       2015-01-04 04:00:00
                ...        
17409   2017-01-03 19:00:00
17410   2017-01-03 20:00:00
17411   2017-01-03 21:00:00
17412   2017-01-03 22:00:00
17413   2017-01-03 23:00:00
Name: timestamp, Length: 17414, dtype: datetime64[ns]

### Converting the timestamps to datetime

In [11]:
# Store the parsed datetimes in place of the original `timestamp` values so the
# datetime-based feature columns below can be derived from them.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

### Adding Year, Month and Day to the dataset

In [12]:
# Zero-padded month number as a string ('01'..'12'). The vectorised
# `.dt.strftime` accessor replaces the row-by-row `apply` with a Python lambda.
df['month'] = df['timestamp'].dt.strftime('%m')

In [13]:
# Check that the `month` column was appended.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season',
       'month'],
      dtype='object')

In [14]:
# Desired column order: the new `month` column moves directly after `timestamp`.
column_names = [
    "timestamp",
    "month",
    "number",
    "temperature",
    "temperature_feels",
    "hum",
    "wind_speed",
    "weather_code",
    "is_holiday",
    "is_weekend",
    "season",
]

* Reorganizing the columns

In [15]:
# Reorder the columns to follow `column_names`. Note that reindex keeps only the
# listed labels and creates an all-NaN column for any label not in the frame.
df = df.reindex(column_names, axis='columns')

* Year, Days, Day_name, Week number

In [16]:
# Four-digit year as a string. '%Y' emits the real year; the previous '20%y'
# hard-coded the century and would mislabel any date outside 2000-2099.
# The vectorised `.dt.strftime` accessor also replaces the row-wise `apply`.
df['year'] = df['timestamp'].dt.strftime('%Y')

In [17]:
# Zero-padded day of the month as a string ('01'..'31'), computed with the
# vectorised `.dt.strftime` accessor instead of a row-wise `apply`.
df['day'] = df['timestamp'].dt.strftime('%d')

In [18]:
# Desired column order: the date parts (`year`, `month`, `day`) follow
# `timestamp`, ahead of the measurements.
column_names = [
    "timestamp",
    "year",
    "month",
    "day",
    "number",
    "temperature",
    "temperature_feels",
    "hum",
    "wind_speed",
    "weather_code",
    "is_holiday",
    "is_weekend",
    "season",
]

In [19]:
# Apply the column order defined in `column_names` (labels not listed there are
# dropped by reindex).
df = df.reindex(column_names, axis='columns')

In [20]:
# Weekday name of each timestamp (e.g. 'Sunday').
df = df.assign(day_name=df['timestamp'].dt.day_name())

In [21]:
# Week of the year as a zero-padded string. '%U' counts weeks starting on
# Sunday; days before the year's first Sunday fall into week '00'.
# Computed with the vectorised `.dt.strftime` accessor instead of `apply`.
df['week_number'] = df['timestamp'].dt.strftime('%U')


In [22]:
# Time of day as an 'HH:MM' string, computed with the vectorised
# `.dt.strftime` accessor instead of a row-wise `apply`.
df['time'] = df['timestamp'].dt.strftime('%H:%M')

In [23]:
# Calendar date as a 'yy-mm-dd' string (e.g. '15-01-04'), computed with the
# vectorised `.dt.strftime` accessor instead of a row-wise `apply`.
# NOTE(review): '%y' yields a two-digit year, which is ambiguous when parsed
# back; consider '%Y-%m-%d' if nothing downstream relies on this format.
df['date'] = df['timestamp'].dt.strftime('%y-%m-%d')

* Reorganizing the columns again

In [24]:
# List the columns after adding the derived date/time features.
df.columns

Index(['timestamp', 'year', 'month', 'day', 'number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season', 'day_name', 'week_number', 'time', 'date'],
      dtype='object')

In [25]:
# Final column order: all date/time features first, then the bike count and the
# weather / calendar attributes.
column_names = [
    "timestamp",
    "date",
    "year",
    "month",
    "day",
    "day_name",
    "week_number",
    "time",
    "number",
    "temperature",
    "temperature_feels",
    "hum",
    "wind_speed",
    "weather_code",
    "is_holiday",
    "is_weekend",
    "season",
]

In [26]:
# Apply the final column order from `column_names` (labels not listed there are
# dropped by reindex).
df = df.reindex(column_names, axis='columns')

In [27]:
# Display the frame to check the derived columns and the new column order.
df

Unnamed: 0,timestamp,date,year,month,day,day_name,week_number,time,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,15-01-04,2015,01,04,Sunday,01,00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,15-01-04,2015,01,04,Sunday,01,01:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,15-01-04,2015,01,04,Sunday,01,02:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,15-01-04,2015,01,04,Sunday,01,03:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,15-01-04,2015,01,04,Sunday,01,04:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,17-01-03,2017,01,03,Tuesday,01,19:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,17-01-03,2017,01,03,Tuesday,01,20:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,17-01-03,2017,01,03,Tuesday,01,21:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,17-01-03,2017,01,03,Tuesday,01,22:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [28]:
# NOTE(review): this repeats the previous cell's display with no change to the
# frame in between; candidate for removal.
df

Unnamed: 0,timestamp,date,year,month,day,day_name,week_number,time,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,15-01-04,2015,01,04,Sunday,01,00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,15-01-04,2015,01,04,Sunday,01,01:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,15-01-04,2015,01,04,Sunday,01,02:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,15-01-04,2015,01,04,Sunday,01,03:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,15-01-04,2015,01,04,Sunday,01,04:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,17-01-03,2017,01,03,Tuesday,01,19:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,17-01-03,2017,01,03,Tuesday,01,20:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,17-01-03,2017,01,03,Tuesday,01,21:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,17-01-03,2017,01,03,Tuesday,01,22:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


* Replacing the month number with the abbreviated month name

In [29]:
# Replace the zero-padded month number with its three-letter English name
# ('01' -> 'Jan'). The name is derived from `timestamp` rather than by
# re-parsing the `month` column, which keeps the cell idempotent: parsing
# `month` with format='%m' fails on a second run because the column then
# already holds names such as 'Jan'.
df['month'] = df['timestamp'].dt.month_name().str.slice(stop=3)


In [30]:
# List the current column names before the final rename.
df.columns

Index(['timestamp', 'date', 'year', 'month', 'day', 'day_name', 'week_number',
       'time', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [31]:
# Rename the count column to `bike_number`. Only the affected column is named,
# so the other labels cannot be mislabelled by a positional mismatch, and the
# cell is a no-op if it is run again.
df = df.rename(columns={'number': 'bike_number'})

In [32]:
# Persist the cleaned frame. With the default index=True the row index is also
# written as an unnamed first column, which is why 'Unnamed: 0' has to be
# dropped after the file is read back later in the notebook.
df.to_csv('london_merged_v1.csv')

## Null Values

In [33]:
# Number of missing values in each column.
df.isna().sum(axis='index')

timestamp            0
date                 0
year                 0
month                0
day                  0
day_name             0
week_number          0
time                 0
bike_number          0
temperature          0
temperature_feels    0
hum                  0
wind_speed           0
weather_code         0
is_holiday           0
is_weekend           0
season               0
dtype: int64

### Saving the updates and cleaning in a new dataset

In [34]:
# Reload the file that was just written; the saved row index comes back as the
# 'Unnamed: 0' column.
# NOTE(review): dtypes are re-inferred by read_csv, so `timestamp` is presumably
# a plain string column again after this point - confirm.
df = pd.read_csv('london_merged_v1.csv')

In [35]:
# reset_index() without drop=True moves the current row index into a new
# 'index' column, which is then saved with the file (it shows up in the sample
# output further down).
# NOTE(review): if that column is not needed downstream, reset_index(drop=True)
# would avoid it - confirm nothing later relies on df['index'].
df = df.reset_index()

In [36]:
# Remove the leftover index column that the first save wrote into the CSV.
df = df.drop(columns='Unnamed: 0')

In [37]:
# Overwrite the CSV, this time without writing the row index.
df.to_csv('london_merged_v1.csv', index=False)

In [38]:
# Load the cleaned dataset from disk.
# NOTE(review): no parse_dates is given, so `timestamp` is presumably read as
# text rather than datetime; pass parse_dates=['timestamp'] if later cells need
# datetime semantics - confirm against downstream usage.
df = pd.read_csv('london_merged_v1.csv')

In [39]:
# Spot-check 50 randomly chosen rows of the reloaded data. No random_state is
# passed, so the selection differs between runs.
df.sample(50)

Unnamed: 0,index,timestamp,date,year,month,day,day_name,week_number,time,bike_number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
3487,3487,2015-05-29 15:00:00,15-05-29,2015,May,29,Friday,21,15:00,1145,13.5,13.5,69.5,26.0,2.0,0.0,0.0,0.0
11759,11759,2016-05-10 20:00:00,16-05-10,2016,May,10,Tuesday,19,20:00,1184,16.0,16.0,91.0,13.0,3.0,0.0,0.0,0.0
17229,17229,2016-12-27 07:00:00,16-12-27,2016,Dec,27,Tuesday,52,07:00,119,2.5,-1.0,84.0,12.0,1.0,1.0,0.0,3.0
3214,3214,2015-05-18 06:00:00,15-05-18,2015,May,18,Monday,20,06:00,640,10.5,9.5,74.0,16.0,3.0,0.0,0.0,0.0
4328,4328,2015-07-03 19:00:00,15-07-03,2015,Jul,3,Friday,26,19:00,2388,21.0,21.0,66.5,30.5,1.0,0.0,0.0,1.0
12782,12782,2016-06-22 11:00:00,16-06-22,2016,Jun,22,Wednesday,25,11:00,1174,20.0,20.0,78.0,18.0,3.0,0.0,0.0,1.0
4290,4290,2015-07-02 05:00:00,15-07-02,2015,Jul,2,Thursday,26,05:00,158,19.0,19.0,83.0,13.0,2.0,0.0,0.0,1.0
15657,15657,2016-10-22 17:00:00,16-10-22,2016,Oct,22,Saturday,42,17:00,2095,13.0,13.0,65.0,12.0,2.0,0.0,1.0,2.0
11241,11241,2016-04-19 06:00:00,16-04-19,2016,Apr,19,Tuesday,16,06:00,726,10.0,9.0,73.5,7.0,1.0,0.0,0.0,0.0
3241,3241,2015-05-19 09:00:00,15-05-19,2015,May,19,Tuesday,20,09:00,2110,12.0,12.0,56.0,28.0,2.0,0.0,0.0,0.0
