In [1]:
import pandas as pd
import numpy as np
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime as dt
from datetime import datetime
%matplotlib inline
%config InlineBackend.figure_format = 'png'

In [2]:
# List the entries of the current working directory to locate the data files.
os.listdir('.')

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Analysis_Cleaning_Sharing_Bike.ipynb',
 'Data',
 'london_merged_v1.csv',
 'README.md']

### Overview of the Dataset

In [3]:
# Show the current working directory; relative paths used in this notebook resolve against it.
os.getcwd()

'D:\\Repositories\\london_bike_sharing'

In [4]:
# Load the raw hourly data set.
# A forward-slash relative path works on every OS and avoids the invalid '\d'
# escape sequence of the previous literal. The folder is spelled 'Data' in the
# directory listing, which matters on case-sensitive file systems.
df = pd.read_csv('Data/london_merged.csv')

In [5]:
# Display the loaded frame (pandas truncates long frames to the first and last rows).
df

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [6]:
# Preview the first five rows.
df.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [7]:
# List the column labels as loaded from the CSV.
df.columns

Index(['timestamp', 'cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code',
       'is_holiday', 'is_weekend', 'season'],
      dtype='object')

### Renaming Columns

In [8]:
# Give the terse source columns descriptive names.
# Renaming by label (instead of assigning a positional list to df.columns)
# cannot mislabel data if the CSV column order ever changes, and re-running
# the cell is a harmless no-op because missing keys are ignored.
df = df.rename(columns={
    'cnt': 'number',
    't1': 'temperature',
    't2': 'temperature_feels',
})

In [9]:
# Confirm the column labels after renaming.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [10]:
# Preview how the timestamp strings parse; the result is only displayed, not stored.
pd.to_datetime(df.timestamp)

0       2015-01-04 00:00:00
1       2015-01-04 01:00:00
2       2015-01-04 02:00:00
3       2015-01-04 03:00:00
4       2015-01-04 04:00:00
                ...        
17409   2017-01-03 19:00:00
17410   2017-01-03 20:00:00
17411   2017-01-03 21:00:00
17412   2017-01-03 22:00:00
17413   2017-01-03 23:00:00
Name: timestamp, Length: 17414, dtype: datetime64[ns]

### Converting the timestamps to datetime

In [11]:
# Replace the string timestamps with parsed datetime values (column position is kept).
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

### Adding year, month, and day to the dataset

In [12]:
# Zero-padded month number ('01'..'12') as a string column.
# The vectorized .dt accessor replaces the per-row Python lambda.
df['month'] = df['timestamp'].dt.strftime('%m')

In [13]:
# Check that 'month' was appended as the last column.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season',
       'month'],
      dtype='object')

In [14]:
# Target column order: 'month' moves directly behind 'timestamp',
# the remaining columns keep their relative order.
column_names = [
    'timestamp',
    'month',
    'number',
    'temperature',
    'temperature_feels',
    'hum',
    'wind_speed',
    'weather_code',
    'is_holiday',
    'is_weekend',
    'season',
]

* Reorganizing the columns

In [15]:
# Rearrange the columns into the order given by column_names.
df = df.reindex(column_names, axis='columns')

* Adding year, day, day name, and week number

In [16]:
# Four-digit year as a string column.
# '%Y' yields the real year; the previous '20%y' glued a hard-coded century
# onto the two-digit year and would be wrong outside 2000-2099.
df['year'] = df['timestamp'].dt.strftime('%Y')

In [17]:
# Zero-padded day of the month ('01'..'31') as a string column,
# computed with the vectorized .dt accessor instead of a per-row lambda.
df['day'] = df['timestamp'].dt.strftime('%d')

In [18]:
# Target column order: the date parts follow 'timestamp',
# then the count and the weather/calendar attributes.
column_names = (
    ['timestamp', 'year', 'month', 'day']
    + ['number', 'temperature', 'temperature_feels', 'hum', 'wind_speed']
    + ['weather_code', 'is_holiday', 'is_weekend', 'season']
)

In [19]:
# Apply the column order defined in column_names.
df = df.reindex(labels=column_names, axis=1)

In [20]:
# Append the weekday name of each timestamp as a new 'day_name' column.
df = df.assign(day_name=df.timestamp.dt.day_name())

In [21]:
# Zero-padded week of the year as a string column.
# '%U' counts weeks starting on Sunday; days before the first Sunday are week '00'.
# The vectorized .dt accessor replaces the per-row lambda.
df['week_number'] = df['timestamp'].dt.strftime('%U')


In [22]:
# Time of day as an 'HH:MM' string column (vectorized instead of a per-row lambda).
df['time'] = df['timestamp'].dt.strftime('%H:%M')

In [23]:
# Calendar date as a 'yy-mm-dd' string column (two-digit year, format unchanged),
# computed with the vectorized .dt accessor instead of a per-row lambda.
df['date'] = df['timestamp'].dt.strftime('%y-%m-%d')

0        15-01-04
1        15-01-04
2        15-01-04
3        15-01-04
4        15-01-04
           ...   
17409    17-01-03
17410    17-01-03
17411    17-01-03
17412    17-01-03
17413    17-01-03
Name: date, Length: 17414, dtype: object

* Reorganizing the columns again

In [27]:
# Inspect the current column order before rearranging it.
df.columns

Index(['timestamp', 'year', 'month', 'day', 'number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season', 'day_name', 'week_number', 'time', 'date'],
      dtype='object')

In [26]:
# Final column order. Every label appears exactly once: 'week_number' was
# previously listed twice, and reindexing with a repeated label produces a
# duplicated column in the frame.
column_names = ['timestamp', 'date', 'year', 'month', 'day', 'number', 'week_number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season', 'day_name', 'time']

In [27]:
# Reorder the frame's columns to follow column_names.
df = df.reindex(axis='columns', labels=column_names)

In [28]:
# Display the frame to check the added columns and their order.
df

Unnamed: 0,timestamp,year,month,day,day_name,week_number,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,2015,01,04,Sunday,01,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,2015,01,04,Sunday,01,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,2015,01,04,Sunday,01,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,2015,01,04,Sunday,01,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,2015,01,04,Sunday,01,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,2017,01,03,Tuesday,01,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,2017,01,03,Tuesday,01,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,2017,01,03,Tuesday,01,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,2017,01,03,Tuesday,01,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [29]:
# Display the frame again (same expression as the previous cell).
df

Unnamed: 0,timestamp,year,month,day,day_name,week_number,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,2015,01,04,Sunday,01,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,2015,01,04,Sunday,01,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,2015,01,04,Sunday,01,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,2015,01,04,Sunday,01,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,2015,01,04,Sunday,01,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,2017,01,03,Tuesday,01,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,2017,01,03,Tuesday,01,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,2017,01,03,Tuesday,01,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,2017,01,03,Tuesday,01,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


* Converting the month number to the abbreviated month name

In [30]:
# Replace the month number with the three-letter month name ('Jan', 'Feb', ...).
# The name is derived from 'timestamp' rather than by re-parsing the 'month'
# column, so the cell can be re-run safely: parsing 'Jan' with format '%m'
# would raise on a second execution.
df['month'] = df['timestamp'].dt.month_name().str.slice(stop=3)


In [31]:
# Write the enriched frame to disk; the row index is written as an unnamed first column.
df.to_csv('london_merged_v1.csv', index=True)

### Saving the cleaned and updated data to a new dataset

In [32]:
# Reload the saved file; the previously written index comes back as an 'Unnamed: 0' column.
# NOTE(review): 'timestamp' is read back as plain strings because no date parsing is requested.
df = pd.read_csv('london_merged_v1.csv')

In [33]:
# Reset to a fresh 0..n-1 index; the old index is kept as a new 'index' column.
df = df.reset_index(drop=False)

In [34]:
# Remove the leftover index column created by the CSV round trip.
df = df.drop(columns='Unnamed: 0')

In [35]:
# Overwrite the file, this time without writing the row index.
df.to_csv('london_merged_v1.csv', index=False)

In [36]:
# Load the cleaned file as the working frame for the steps that follow.
df = pd.read_csv('london_merged_v1.csv')

In [37]:
# Spot-check 50 random rows. A fixed random_state makes the sample (and the
# rendered output) reproducible across "Restart & Run All".
df.sample(50, random_state=42)

Unnamed: 0,index,timestamp,year,month,day,day_name,week_number,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
7745,7745,2015-11-24 09:00:00,2015,Nov,24,Tuesday,47,1711,10.0,8.0,85.0,17.0,7.0,0.0,0.0,2.0
1811,1811,2015-03-20 13:00:00,2015,Mar,20,Friday,11,1257,9.0,8.0,71.0,7.0,1.0,0.0,0.0,0.0
5120,5120,2015-08-05 23:00:00,2015,Aug,5,Wednesday,31,778,17.0,17.0,80.0,7.0,1.0,0.0,0.0,1.0
1303,1303,2015-02-27 09:00:00,2015,Feb,27,Friday,8,1841,5.5,2.5,70.5,14.0,1.0,0.0,0.0,3.0
11464,11464,2016-04-28 13:00:00,2016,Apr,28,Thursday,17,1272,11.0,11.0,38.0,18.5,1.0,0.0,0.0,0.0
15771,15771,2016-10-27 11:00:00,2016,Oct,27,Thursday,43,1224,11.5,11.5,85.0,16.0,2.0,0.0,0.0,2.0
12421,12421,2016-06-07 10:00:00,2016,Jun,7,Tuesday,23,1239,20.5,20.5,62.0,8.0,1.0,0.0,0.0,1.0
10021,10021,2016-02-27 18:00:00,2016,Feb,27,Saturday,8,682,5.0,1.0,57.0,19.5,4.0,0.0,1.0,3.0
13766,13766,2016-08-03 05:00:00,2016,Aug,3,Wednesday,31,141,20.0,20.0,83.0,19.5,4.0,0.0,0.0,1.0
14526,14526,2016-09-05 14:00:00,2016,Sep,5,Monday,36,1247,21.5,21.5,75.5,7.5,3.0,0.0,0.0,2.0


### Null Values

In [38]:
# Count the missing values in each column.
df.isna().sum(axis='index')

index                0
timestamp            0
year                 0
month                0
day                  0
day_name             0
week_number          0
number               0
temperature          0
temperature_feels    0
hum                  0
wind_speed           0
weather_code         0
is_holiday           0
is_weekend           0
season               0
dtype: int64