In [1]:
import pandas as pd
import numpy as np
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime as dt
from datetime import datetime
%matplotlib inline
%config InlineBackend.figure_format = 'png'

In [2]:
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Analysis_Cleaning_Sharing_Bike.ipynb',
 'Data',
 'london_merged_v1.csv',
 'README.md']

### Overview of the DataSet

In [3]:
os.getcwd()

'D:\\Repositories\\london_bike_sharing'

In [4]:
df = pd.read_csv('.\data/london_merged.csv')

In [5]:
df

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [6]:
df.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [7]:
df.columns

Index(['timestamp', 'cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code',
       'is_holiday', 'is_weekend', 'season'],
      dtype='object')

### Renaming Columns

In [8]:
df.columns = ['timestamp', 'number', 'temperature', 'temperature_feels', 'hum', 'wind_speed', 'weather_code',
       'is_holiday', 'is_weekend', 'season']

In [9]:
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [10]:
pd.to_datetime(df['timestamp'])

0       2015-01-04 00:00:00
1       2015-01-04 01:00:00
2       2015-01-04 02:00:00
3       2015-01-04 03:00:00
4       2015-01-04 04:00:00
                ...        
17409   2017-01-03 19:00:00
17410   2017-01-03 20:00:00
17411   2017-01-03 21:00:00
17412   2017-01-03 22:00:00
17413   2017-01-03 23:00:00
Name: timestamp, Length: 17414, dtype: datetime64[ns]

### Converting to Datetime the dates

In [11]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

### Adding Year, Month, Day, to the dataset

In [12]:
df['month'] = df.timestamp.apply(lambda x : x.strftime('%m'))

In [13]:
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season',
       'month'],
      dtype='object')

In [14]:
column_names = ['timestamp', 'month', 'number', 'temperature', 'temperature_feels',
       'hum', 'wind_speed', 'weather_code', 'is_holiday', 'is_weekend',
       'season']

* Reorganizing the columns

In [15]:
df = df.reindex(columns=column_names)

* Year, Days, Day_name, Week number

In [16]:
df['year'] = df.timestamp.apply(lambda x : x.strftime('20%y'))

In [17]:
df['day'] = df.timestamp.apply(lambda x : x.strftime('%d'))

In [18]:
column_names = ['timestamp', 'year', 'month', 'day', 'number', 'temperature', 'temperature_feels',
       'hum', 'wind_speed', 'weather_code', 'is_holiday', 'is_weekend',
       'season']

In [19]:
df = df.reindex(columns=column_names)

In [20]:
df['day_name'] = df['timestamp'].dt.day_name()

In [21]:
df['week_number'] = df.timestamp.apply(lambda x : x.strftime('%U'))


In [22]:
df['time'] = df.timestamp.apply(lambda x : x.strftime('%H:%M'))

In [23]:
df['date'] = df.timestamp.apply(lambda x : x.strftime('%y-%m-%d'))

* Reorganizing the columns again

In [24]:
df.columns

Index(['timestamp', 'year', 'month', 'day', 'number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season', 'day_name', 'week_number', 'time', 'date'],
      dtype='object')

In [25]:
column_names = ['timestamp','date', 'year', 'month', 'day', 'day_name', 'week_number', 'time', 'number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season']

In [26]:
df = df.reindex(columns=column_names)

In [27]:
df

Unnamed: 0,timestamp,date,year,month,day,day_name,week_number,time,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,15-01-04,2015,01,04,Sunday,01,00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,15-01-04,2015,01,04,Sunday,01,01:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,15-01-04,2015,01,04,Sunday,01,02:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,15-01-04,2015,01,04,Sunday,01,03:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,15-01-04,2015,01,04,Sunday,01,04:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,17-01-03,2017,01,03,Tuesday,01,19:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,17-01-03,2017,01,03,Tuesday,01,20:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,17-01-03,2017,01,03,Tuesday,01,21:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,17-01-03,2017,01,03,Tuesday,01,22:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [28]:
df

Unnamed: 0,timestamp,date,year,month,day,day_name,week_number,time,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,15-01-04,2015,01,04,Sunday,01,00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,15-01-04,2015,01,04,Sunday,01,01:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,15-01-04,2015,01,04,Sunday,01,02:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,15-01-04,2015,01,04,Sunday,01,03:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,15-01-04,2015,01,04,Sunday,01,04:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,17-01-03,2017,01,03,Tuesday,01,19:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,17-01-03,2017,01,03,Tuesday,01,20:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,17-01-03,2017,01,03,Tuesday,01,21:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,17-01-03,2017,01,03,Tuesday,01,22:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


* Changing number of month to name of the month

In [29]:
df['month'] = pd.to_datetime(df['month'], format='%m').dt.month_name().str.slice(stop=3)


In [33]:
df.columns

Index(['timestamp', 'date', 'year', 'month', 'day', 'day_name', 'week_number',
       'time', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [34]:
df.columns = ['timestamp', 'date', 'year', 'month', 'day', 'day_name', 'week_number',
       'time', 'bike_number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season']

In [35]:
df.to_csv('london_merged_v1.csv')

In [None]:
Null Value

### Saving the updates and cleanning in a new Data Set

In [36]:
df = pd.read_csv('london_merged_v1.csv')

In [37]:
df = df.reset_index()

In [38]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [39]:
df.to_csv('london_merged_v1.csv', index=False)

In [40]:
df = pd.read_csv('london_merged_v1.csv')

In [41]:
df.sample(50)

Unnamed: 0,index,timestamp,date,year,month,day,day_name,week_number,time,bike_number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
2510,2510,2015-04-18 22:00:00,15-04-18,2015,Apr,18,Saturday,15,22:00,550,8.5,5.5,68.5,16.0,1.0,0.0,1.0,0.0
6424,6424,2015-09-29 16:00:00,15-09-29,2015,Sep,29,Tuesday,39,16:00,2173,17.0,17.0,54.0,24.0,1.0,0.0,0.0,2.0
13452,13452,2016-07-21 00:00:00,16-07-21,2016,Jul,21,Thursday,29,00:00,385,19.5,19.5,66.5,20.5,1.0,0.0,0.0,1.0
16121,16121,2016-11-11 02:00:00,16-11-11,2016,Nov,11,Friday,45,02:00,79,6.0,3.0,81.0,15.0,2.0,0.0,0.0,2.0
6708,6708,2015-10-11 13:00:00,15-10-11,2015,Oct,11,Sunday,41,13:00,2770,14.0,14.0,59.0,21.0,2.0,0.0,1.0,2.0
6978,6978,2015-10-23 01:00:00,15-10-23,2015,Oct,23,Friday,42,01:00,219,12.0,12.0,69.5,2.0,4.0,0.0,0.0,2.0
5960,5960,2015-09-09 23:00:00,15-09-09,2015,Sep,9,Wednesday,36,23:00,451,14.0,14.0,88.0,6.0,1.0,0.0,0.0,2.0
12358,12358,2016-06-04 19:00:00,16-06-04,2016,Jun,4,Saturday,22,19:00,1923,19.0,19.0,78.0,10.0,4.0,0.0,1.0,1.0
2490,2490,2015-04-18 02:00:00,15-04-18,2015,Apr,18,Saturday,15,02:00,193,7.0,4.0,73.5,18.0,3.0,0.0,1.0,0.0
4218,4218,2015-06-29 05:00:00,15-06-29,2015,Jun,29,Monday,26,05:00,154,15.0,15.0,77.0,12.0,1.0,0.0,0.0,1.0


### Null Values

In [None]:
df.isna().sum()