In [20]:
import numpy as np
import pandas as pd
import glob
from pprint import pprint

pd.set_option('display.max_columns',None)

#Import Data
path = r'/homes/chh30/ECS784-PROJECT/sourcedata'
filenames = glob.glob(path+"/*.csv")

frame = pd.DataFrame

list = []

for file_ in filenames:
     df = pd.read_csv(file_,index_col=None,encoding = "ISO-8859-1",header=0)
     list.append(df)

frame = pd.concat(list,ignore_index=True,sort=False)

frame.head()
print(frame.shape)








(10216388, 15)


In [21]:
#Clean Data

frame.dropna(axis=0, subset=["Rental Id",
                             "Start Date",
                             "StartStation Id",
                             "StartStation Name",
                             "End Date",
                             "EndStation Id",
                             "Duration"], inplace=True)




frame["Rental Id"] = frame["Rental Id"].astype(int)
frame["EndStation Id"] = frame["EndStation Id"].astype(int)
frame["StartStation Id"] = frame["StartStation Id"].astype(int)
frame["Duration"] = frame["Duration"].astype(int)

frame = frame[frame["StartStation Id"] != frame["EndStation Id"]]


frame = frame.loc[:,('Start Date',
                           'StartStation Id',
                           'StartStation Name',
#                           'End Date',
#                           'EndStation Id',
                           'Duration')]

## Extra drop for duplicates
frame.drop_duplicates(inplace=True)
print(frame.shape)
frame.tail(10)







(8966926, 4)


Unnamed: 0,Start Date,StartStation Id,StartStation Name,Duration
10216376,17/05/2016 23:56,755,"The Vale, West Chelsea",420
10216377,17/05/2016 23:56,407,"Speakers' Corner 1, Hyde Park",1380
10216378,17/05/2016 23:56,331,"Bunhill Row, Moorgate",480
10216379,17/05/2016 23:56,4,"St. Chad's Street, King's Cross",480
10216380,17/05/2016 23:57,815,"Lambeth Palace Road, Waterloo",360
10216381,17/05/2016 23:57,670,"Ashley Crescent, Battersea",480
10216382,17/05/2016 23:57,486,"Granby Street, Shoreditch",1380
10216383,17/05/2016 23:57,206,"New Road 1 , Whitechapel",300
10216385,17/05/2016 23:57,188,"Nutford Place, Marylebone",420
10216387,17/05/2016 23:59,666,"Olympia Way, Olympia",420


In [22]:
# Missing data checking

missing_data = frame.isnull()

for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print("")

Start Date
False    8966926
Name: Start Date, dtype: int64

StartStation Id
False    8966926
Name: StartStation Id, dtype: int64

StartStation Name
False    8966926
Name: StartStation Name, dtype: int64

Duration
False    8966926
Name: Duration, dtype: int64



In [23]:
import matplotlib.pyplot as plt
%matplotlib inline

frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8966926 entries, 0 to 10216387
Data columns (total 4 columns):
Start Date           object
StartStation Id      int64
StartStation Name    object
Duration             int64
dtypes: int64(2), object(2)
memory usage: 342.1+ MB


In [24]:
frame.groupby(['StartStation Name','StartStation Id']).size().reset_index(name='Rental_Counts').nlargest(5,columns='Rental_Counts').tail(5)

    

Unnamed: 0,StartStation Name,StartStation Id,Rental_Counts
59,"Belgrove Street , King's Cross",14,89082
763,"Waterloo Station 3, Waterloo",154,72753
761,"Waterloo Station 1, Waterloo",374,54451
364,"Hyde Park Corner, Hyde Park",191,46861
799,"Wormwood Street, Liverpool Street",217,44929


In [55]:
singleStationData = frame[frame['StartStation Id']==14]
singleStationData.info()
singleStationData.head(5)



<class 'pandas.core.frame.DataFrame'>
Int64Index: 89082 entries, 0 to 10216104
Data columns (total 4 columns):
Start Date           89082 non-null datetime64[ns]
StartStation Id      89082 non-null int64
StartStation Name    89082 non-null object
Duration             89082 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.4+ MB


Unnamed: 0,Start Date,StartStation Id,StartStation Name,Duration
0,2016-12-28 10:35:00,14,"Belgrove Street , King's Cross",780
9,2016-12-29 09:59:00,14,"Belgrove Street , King's Cross",780
61,2016-12-29 09:37:00,14,"Belgrove Street , King's Cross",900
62,2016-12-28 09:20:00,14,"Belgrove Street , King's Cross",900
164,2016-12-28 07:46:00,14,"Belgrove Street , King's Cross",1380


In [56]:
#import datetime
#singleStationData["Start Date"]= pd.to_datetime(singleStationData["Start Date"],dayfirst=True, errors='raise') 
#singleStationData["End Date"]= pd.to_datetime(singleStationData["End Date"], dayfirst=True, yearfirst=False errors='ignore') 
#singleStationData["Month"]=pd.to_datetime(singleStationData["Start Date"]).dt.month

singleStationData['Start Date'] = pd.to_datetime(singleStationData['Start Date'],format='%d/%m/%Y %H:%M',dayfirst=True)


singleStationData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89082 entries, 0 to 10216104
Data columns (total 4 columns):
Start Date           89082 non-null datetime64[ns]
StartStation Id      89082 non-null int64
StartStation Name    89082 non-null object
Duration             89082 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.4+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [84]:
#Clean Data out of year range
begin = pd.Timestamp('2016-01-01 00:00:00')
end = pd.Timestamp('2016-12-31 23:59:59')
mask=(singleStationData['Start Date'] >= begin)&(singleStationData['Start Date'] <= end)
singleStationData = singleStationData.loc[mask]


singleStationData.info()

singleStationData['YearMonth'] = singleStationData['Start Date'].map(lambda x: 100*x.year+x.month)
singleStationData['Hour'] = singleStationData['Start Date'].map(lambda x: x.hour)

singleStationData.head(5)

#df2 = singleStationData['Start Date'].groupby([singleStationData['Start Date'].dt.year,singleStationData['Start Date'].dt.month]).agg({'count'})

#df2.info()
#path = r'/homes/chh30/ECS784-PROJECT/exportCSV.csv'

#df2.to_csv(path,index=False)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 88815 entries, 0 to 10216104
Data columns (total 5 columns):
Start Date           88815 non-null datetime64[ns]
StartStation Id      88815 non-null int64
StartStation Name    88815 non-null object
Duration             88815 non-null int64
YearMonth            88815 non-null int64
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 4.1+ MB


Unnamed: 0,Start Date,StartStation Id,StartStation Name,Duration,YearMonth,Hour
0,2016-12-28 10:35:00,14,"Belgrove Street , King's Cross",780,201612,10
9,2016-12-29 09:59:00,14,"Belgrove Street , King's Cross",780,201612,9
61,2016-12-29 09:37:00,14,"Belgrove Street , King's Cross",900,201612,9
62,2016-12-28 09:20:00,14,"Belgrove Street , King's Cross",900,201612,9
164,2016-12-28 07:46:00,14,"Belgrove Street , King's Cross",1380,201612,7


In [89]:
#Month Rental Group by Month
df2 = singleStationData.groupby(['YearMonth']).size().reset_index(name='MonthlyRental').nlargest(20,columns='MonthlyRental').tail(20).sort_values(by='YearMonth',ascending=True)
df2
#Hourly Rental Group by Hour

df3 = singleStationData.groupby(['Hour']).size().reset_index(name='HourlyRental').nlargest(20,columns='HourlyRental').tail(20).sort_values(by='Hour',ascending=True)
df3

Unnamed: 0,Hour,HourlyRental
0,0,393
1,1,225
6,6,3284
7,7,19078
8,8,30578
9,9,7843
10,10,2840
11,11,2160
12,12,2074
13,13,2162


In [92]:
#Export MonthlyRental CSV
MonthExportPath = r'/homes/chh30/ECS784-PROJECT/Export_Data/MonthlyRental2016.csv'
df2.to_csv(MonthExportPath,index=False)

#Export HourlyRental CSV
HourlyExportPath= r'/homes/chh30/ECS784-PROJECT/Export_Data/HourlyRental2016.csv'
df3.to_csv(HourlyExportPath,index=False)