In [1]:
import pandas as pd
import numpy as np
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime as dt
from datetime import datetime
%matplotlib inline
%config InlineBackend.figure_format = 'png'

In [2]:
# List the entries of the current working directory (no path argument given).
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Analysis_Cleaning_Sharing_Bike.ipynb',
 'Data',
 'london_merged_v1.csv',
 'README.md']

### Overview of the dataset

In [3]:
# Show the working directory; relative paths below are resolved against it.
os.getcwd()

'D:\\Repositories\\london_bike_sharing'

In [4]:
# Load the raw hourly bike-share data.
# Forward slashes work on every OS; the previous '.\data/...' literal relied on
# the invalid escape sequence '\d' and a Windows-only path separator.
# The folder is spelled 'Data' in the directory listing, which matters on
# case-sensitive filesystems.
df = pd.read_csv('Data/london_merged.csv')

In [5]:
# Display the loaded frame (long frames are shown truncated).
df

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [6]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [7]:
# Inspect the column names as they come from the CSV file.
df.columns

Index(['timestamp', 'cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code',
       'is_holiday', 'is_weekend', 'season'],
      dtype='object')

### Renaming Columns

In [8]:
# Rename the terse source columns to descriptive names.
# Renaming by name (instead of assigning a positional list to df.columns)
# cannot mislabel data if the column order of the file ever changes, and the
# cell can be re-run safely; all other columns keep their names.
df = df.rename(columns={
    'cnt': 'number',
    't1': 'temperature',
    't2': 'temperature_feels',
})

In [9]:
# Confirm the new column names.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season'],
      dtype='object')

In [10]:
# Preview the parsed timestamps; the result is only displayed, not stored.
pd.to_datetime(df['timestamp'])

0       2015-01-04 00:00:00
1       2015-01-04 01:00:00
2       2015-01-04 02:00:00
3       2015-01-04 03:00:00
4       2015-01-04 04:00:00
                ...        
17409   2017-01-03 19:00:00
17410   2017-01-03 20:00:00
17411   2017-01-03 21:00:00
17412   2017-01-03 22:00:00
17413   2017-01-03 23:00:00
Name: timestamp, Length: 17414, dtype: datetime64[ns]

### Converting the timestamps to datetime

In [11]:
# Replace the text timestamps with parsed datetime values (column position is kept).
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

### Adding year, month, and day to the dataset

In [12]:
# Zero-padded month number ('01'..'12') as text.
# The vectorised .dt accessor replaces a row-by-row .apply(lambda ...).
df['month'] = df['timestamp'].dt.strftime('%m')

In [13]:
# Check that 'month' was appended as the last column.
df.columns

Index(['timestamp', 'number', 'temperature', 'temperature_feels', 'hum',
       'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season',
       'month'],
      dtype='object')

In [14]:
# Target column order: the derived 'month' goes right after 'timestamp',
# followed by the measurement and flag columns.
column_names = [
    'timestamp', 'month',
    'number', 'temperature', 'temperature_feels', 'hum', 'wind_speed',
    'weather_code', 'is_holiday', 'is_weekend', 'season',
]

* Reorganizing the columns

In [15]:
# Put the columns into the order listed in column_names.
df = df.reindex(column_names, axis='columns')

* Year, day, day name, and week number

In [16]:
# Four-digit year as text.
# '%Y' yields the real century; the previous '20%y' hard-coded the prefix '20'
# and would be wrong for any timestamp outside 2000-2099.
df['year'] = df['timestamp'].dt.strftime('%Y')

In [17]:
# Zero-padded day of the month ('01'..'31') as text, using the vectorised
# .dt accessor instead of a row-by-row .apply(lambda ...).
df['day'] = df['timestamp'].dt.strftime('%d')

In [18]:
# Target column order: the date parts first, then the measurement and flag columns.
column_names = [
    'timestamp', 'year', 'month', 'day',
    'number', 'temperature', 'temperature_feels', 'hum', 'wind_speed',
    'weather_code', 'is_holiday', 'is_weekend', 'season',
]

In [19]:
# Put the columns into the order listed in column_names.
df = df.reindex(column_names, axis='columns')

In [20]:
# Append the weekday name (e.g. 'Sunday') derived from the timestamp.
df = df.assign(day_name=df['timestamp'].dt.day_name())

In [21]:
# Zero-padded week of the year as text; '%U' counts weeks starting on Sunday.
# Uses the vectorised .dt accessor instead of a row-by-row .apply(lambda ...).
df['week_number'] = df['timestamp'].dt.strftime('%U')


In [22]:
# Display the frame with the new 'day_name' and 'week_number' columns at the end.
df

Unnamed: 0,timestamp,year,month,day,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season,day_name,week_number
0,2015-01-04 00:00:00,2015,01,04,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,Sunday,01
1,2015-01-04 01:00:00,2015,01,04,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,Sunday,01
2,2015-01-04 02:00:00,2015,01,04,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,Sunday,01
3,2015-01-04 03:00:00,2015,01,04,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,Sunday,01
4,2015-01-04 04:00:00,2015,01,04,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,Sunday,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,2017,01,03,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0,Tuesday,01
17410,2017-01-03 20:00:00,2017,01,03,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0,Tuesday,01
17411,2017-01-03 21:00:00,2017,01,03,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0,Tuesday,01
17412,2017-01-03 22:00:00,2017,01,03,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0,Tuesday,01


* Reorganizing the columns again

In [23]:
# List the current column order before rearranging it.
df.columns

Index(['timestamp', 'year', 'month', 'day', 'number', 'temperature',
       'temperature_feels', 'hum', 'wind_speed', 'weather_code', 'is_holiday',
       'is_weekend', 'season', 'day_name', 'week_number'],
      dtype='object')

In [24]:
# Final column order: all date-derived columns first, then the measurement
# and flag columns.
column_names = [
    'timestamp', 'year', 'month', 'day', 'day_name', 'week_number',
    'number', 'temperature', 'temperature_feels', 'hum', 'wind_speed',
    'weather_code', 'is_holiday', 'is_weekend', 'season',
]

In [25]:
# Put the columns into the order listed in column_names.
df = df.reindex(column_names, axis='columns')

In [26]:
# Display the frame after the final column reordering.
df

Unnamed: 0,timestamp,year,month,day,day_name,week_number,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,2015,01,04,Sunday,01,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,2015,01,04,Sunday,01,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,2015,01,04,Sunday,01,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,2015,01,04,Sunday,01,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,2015,01,04,Sunday,01,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,2017,01,03,Tuesday,01,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,2017,01,03,Tuesday,01,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,2017,01,03,Tuesday,01,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,2017,01,03,Tuesday,01,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [27]:
# Same display as the previous cell; df is not modified in between.
df

Unnamed: 0,timestamp,year,month,day,day_name,week_number,number,temperature,temperature_feels,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,2015,01,04,Sunday,01,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,2015,01,04,Sunday,01,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,2015,01,04,Sunday,01,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,2015,01,04,Sunday,01,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,2015,01,04,Sunday,01,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,2017,01,03,Tuesday,01,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,2017,01,03,Tuesday,01,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,2017,01,03,Tuesday,01,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,2017,01,03,Tuesday,01,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


* Converting the month number to the abbreviated month name

In [28]:
# Three-letter month abbreviation ('Jan', 'Feb', ...).
# Derived straight from 'timestamp' instead of re-parsing the 'month' strings
# with format='%m': the result is the same, but the cell can be re-run without
# failing once 'month' already holds names rather than numbers.
df['month'] = df['timestamp'].dt.month_name().str.slice(stop=3)


In [29]:
# Write the cleaned frame to disk.
# NOTE: no index=False here, so the row index is written as an unnamed first
# column; it comes back as 'Unnamed: 0' when the file is read again.
df.to_csv('london_merged_v1.csv')

### Saving the updated and cleaned data to a new dataset

In [30]:
# Reload the file just written; the saved row index shows up as the column
# 'Unnamed: 0', which is dropped a few cells below.
df = pd.read_csv('london_merged_v1.csv')

In [31]:
# Renumber the rows 0..n-1.
# drop=True discards the old index; without it reset_index() inserts the old
# index as a new 'index' column, which would then be written to the final CSV.
df = df.reset_index(drop=True)

In [32]:
# Remove the leftover index column that was read back from the CSV.
df = df.drop(columns='Unnamed: 0')

In [33]:
# Overwrite the file with the cleaned frame; index=False keeps the row index
# out of the CSV so no extra unnamed column is created.
df.to_csv('london_merged_v1.csv', index=False)

In [34]:
# Load the cleaned dataset for the rest of the analysis.
# parse_dates restores the datetime dtype of 'timestamp'; a plain read_csv
# returns it as text, undoing the earlier conversion.
# NOTE(review): the zero-padded text columns ('day', 'week_number') and 'year'
# are inferred as integers on read; pass dtype=str for them if the padding
# matters downstream.
df = pd.read_csv('london_merged_v1.csv', parse_dates=['timestamp'])