In [1]:
import pandas as pd 
import numpy as np
import random
import requests 
import zipfile
import io
from numpy import product
import geopy as gpy
from geopy.distance import great_circle
import glob
import os

### Bike Data

In [2]:
#generate zipfile names
prefix = range(201501,201513)
zipfiles = []
for i in prefix: 
    name = str(i) + '-citibike-tripdata.zip'
    zipfiles.append(name)

url = 'https://s3.amazonaws.com/tripdata/'

In [None]:
#download data 
for n in range(len(zipfiles)):
    r = requests.get(url + zipfiles[n])
    z = zipfile.ZipFile(io.BytesIO(r.content))
    name = z.namelist()
    z.extractall('data/')

In [3]:
#get filenames
filenames = []
for i in prefix: 
        name = str(i) + '-citibike-tripdata.csv'
        filenames.append(name)

In [4]:
#get first file to colheaders
dt = pd.read_csv('data/' + filenames[1])

In [6]:
#load data 
path = 'data/'                 
all_files = glob.glob(os.path.join(path, "*.csv"))     
df = (pd.read_csv(f) for f in all_files)
data   = pd.concat(df, ignore_index=True)

In [13]:
#convert timestamps to datetime
data['starttime'] = pd.to_datetime(data['starttime'], format = '%m/%d/%Y %H:%M', exact = False)
data['stoptime'] = pd.to_datetime(data['stoptime'], format = '%m/%d/%Y %H:%M', exact = False)

#convert id vars to string
to_string = ['start station id', 'end station id', 'bikeid']
for var in to_string: 
    data[var] = data[var].astype(str)

#remove unnecessary vars
data = data.drop(['birth year', 'gender'], axis = 1)

In [24]:
bike = data

In [25]:
#Q2-1 What is the median trip duration, in seconds?
#Note trip duration appears to be in seconds. You could subtract the starttime from the stop time but 
#in this case that may be less accurate as they do not include seconds. 
#duration = bike['stoptime'] - bike['starttime']
#duration_mean = duration.mean().total_seconds()

q2_1_answer = bike['tripduration'].mean()
print(q2_1_answer)

968.087654228


In [29]:
#Q2-2 What fraction of rides start and end at the same station?
same_station = bike[bike['start station id'] == bike ['end station id']] 
q2_2_answer = float(len(same_station))/float(len(bike))
print(q2_2_answer)

0.022358391337304433


In [30]:
#Q3 std of number of stations visited by each bike
#We say a bike has visited a station if it has a ride that either started or ended 
#at that station. Some bikes have visited many stations; others just a few. 
#What is the standard deviation of the number of stations visited by a bike?
no_stations = []
for id in list(pd.unique(bike['bikeid'])): 
    sub = bike[bike['bikeid'] == id]
    start, stop = list(sub['start station id'].values[:]), list(sub['end station id'].values[:])
    distinct_stations = len(list(set(start + stop)))
    no_stations.append(distinct_stations)
q2_3_answer = pd.Series(no_stations).std()
print(q2_3_answer)

54.5451138723


In [31]:
#Q4 What is the average length, in kilometers, of a trip? 
#Assume trips follow great circle arcs from the start station to the end station. 
#Ignore trips that start and end at the same station, as well as those with obviously wrong data.

#ignore trips that start and end at the same station
no_good = bike.index.isin(list(same_station.index))
trips  = bike[~no_good]

#calculate trip distance
trip_dist = []
start_point = list(zip(trips['start station latitude'], trips['start station longitude']))
end_point = list(zip(trips['end station latitude'], trips['end station longitude']))
for i in range(len(start_point)): 
    distance = great_circle(start_point[i], end_point[i]).km
    trip_dist.append(distance)
    
#identify outliers 
trip_dist = pd.Series(trip_dist)
q1= trip_dist.describe()[4]
q3= trip_dist.describe()[6]
iqr = (q3-q1)*4
outliers = trip_dist[(trip_dist < (q1-iqr)) | (trip_dist > (q3+iqr))]
keep = trip_dist[(trip_dist > (q1-iqr)) | (trip_dist < (q3+iqr))]
print(keep.mean())

q2_4_answer = trip_dist.mean()
print(q2_4_answer)

1.76014829948
1.76014829948


In [32]:
#Q5 Calculate the average duration of trips for each month in the year. 
#(Consider a trip to occur in the month in which it starts.) 
#What is the difference, in seconds, between the longest and shortest average durations?
#bike['duration'] = duration/ np.timedelta64(1, 's')
ts = bike.set_index('starttime')
by_month = ts.groupby(pd.TimeGrouper("M"))['tripduration'].mean().reset_index()
q2_5_answer = by_month.tripduration.max() - by_month.tripduration.min()
print(q2_5_answer)

430.57029597


In [None]:
#Q6 hourly usage fration by station compared to system
#Let us define the hourly usage fraction of a station to be the fraction of all rides starting at that station 
#that leave during a specific hour. A station has surprising usage patterns if it has an hourly usage 
#fraction for an hour significantly different from the corresponding hourly usage fraction of the system as a whole. 
#What is the largest ratio of station hourly usage fraction to system hourly usage fraction 
#(hence corresponding to the most "surprising" station-hour pair)?

In [33]:
ts["hour"] = ts.index.hour

#find hourly usage fraction by station by hour 
by_station_hour_count = ts.groupby(['start station id','hour'])['start station name'].size().unstack()
by_station_sum = ts.groupby(['start station id'])['start station name'].size().reset_index()
by_station_hour_count['starts_total'] = by_station_sum[0].values
by_stn_div = by_station_hour_count.div(by_station_hour_count['starts_total'], axis='index')

#find hourly usage fraction by station for system 
system_hour_count = ts.groupby('hour').size()/len(ts)
by_stn_div = by_stn_div.T
by_stn_div['sys_starts_total'] = system_hour_count

#compare hourly usage fraction by station to that of the hourly usage fraction by system
by_stn_div = by_stn_div[0:24]
stn_to_sys = by_stn_div.div(by_stn_div['sys_starts_total'], axis='index')
q2_6_answer = stn_to_sys.iloc[:,:-1].max().max()
print(q2_6_answer)

11.4216840462


In [None]:
#Q7 #What fraction of rides exceed their corresponding time limit by rider type
#There are two types of riders: "Customers" and "Subscribers." Customers buy a short-time pass which 
#allows 30-minute rides. Subscribers buy yearly passes that allow 45-minute rides. 
#What fraction of rides exceed their corresponding time limit?

In [34]:
bike['duration_min'] = bike['tripduration']/60
over_subscribers = len(bike[(bike['usertype'] == 'Subscriber') & (bike['duration_min'] > 45)])
over_customers = len(bike[(bike['usertype'] == 'Customer') & (bike['duration_min'] > 30)])
q2_7_answer = (over_subscribers + over_customers) / len(bike)
print(q2_7_answer)

0.038106780168060496


In [None]:
#Q8 What is the average number of times a bike is moved during this period
#Most of the time, a bike will begin a trip at the same station where its previous trip ended. 
#Sometimes a bike will be moved by the program, either for maintenance or to rebalance the distribution of bikes. 
#What is the average number of times a bike is moved during this period, as detected by seeing if it 
#starts at a different station than where the previous ride ended?

In [35]:
#groupby bike id 
moved_by_bike = []
bikeIds = pd.unique(bike['bikeid']).tolist()

for i in range(len(bikeIds)): 
    by_bike = bike[bike['bikeid'] == bikeIds[i]]
    previous = [0] + by_bike['end station id'].tolist()
    by_bike.loc[:,'previous'] = previous[:-1]
    moved = len(by_bike[by_bike['end station id'] != by_bike['previous']])
    moved_by_bike.append(moved)
    
q2_8_answer = np.mean(moved_by_bike)
print(q2_8_answer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1147.29656718
