In [None]:
import io
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('seaborn-whitegrid')

In [None]:
# Load the Citi Bike trip records into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; adjust it before running elsewhere.
trip_csv_path = '/Users/dingranlu/Documents/E4501/Project/JC-201810-citibike-tripdata.csv'
df_bike = pd.read_csv(trip_csv_path)

In [None]:
# Summary statistics of the raw trip table (sanity check right after loading).
df_bike.describe()

In [None]:
# Peek at the first rows to see which columns are available.
df_bike.head()

In [None]:
# Discard every row that has at least one missing value (dropna's default behaviour).
df_bike = df_bike.dropna()

In [None]:
# Longitude / latitude limits used to discard trips with implausible coordinates
# (presumably the New York City bounding box — TODO confirm the source of these values).
westlimit = -74.2635
southlimit = 40.4856
eastlimit = -73.7526
northlimit = 40.9596

# Keep a trip only when BOTH its start and its end station lie inside the box.
# Every condition must be built from df_bike itself: the mask is applied to df_bike, so a
# condition taken from a different frame would either fail or filter on the wrong rows.
in_bounds = (
    (df_bike['start station longitude'] >= westlimit)
    & (df_bike['start station longitude'] <= eastlimit)
    & (df_bike['start station latitude'] >= southlimit)
    & (df_bike['start station latitude'] <= northlimit)
    & (df_bike['end station longitude'] >= westlimit)
    & (df_bike['end station longitude'] <= eastlimit)
    & (df_bike['end station latitude'] >= southlimit)
    & (df_bike['end station latitude'] <= northlimit)
)
df_bike = df_bike[in_bounds]



# Plotting Map

In [None]:

def plot_on_map(df, bound, nyc_map, s=1, alpha=0.2):
    """Draw trip start and end stations as red dots over a map image, side by side.

    Parameters
    ----------
    df : DataFrame with 'start station longitude/latitude' and
        'end station longitude/latitude' columns.
    bound : sequence ``[x_min, x_max, y_min, y_max]``; used both as the extent the
        image is stretched over and as the axis limits of each panel.
    nyc_map : image array handed to ``imshow`` as the background.
    s : marker size of the scatter points.
    alpha : opacity of the scatter points.
    """
    fig, axs = plt.subplots(1, 2, figsize=(16,10))

    # (longitude column, latitude column, panel title) for the left and right panel
    panels = (
        ('start station longitude', 'start station latitude', 'Start Station Locations'),
        ('end station longitude', 'end station latitude', 'End Station Locations'),
    )

    for ax, (lon_col, lat_col, title) in zip(axs, panels):
        ax.imshow(nyc_map, extent=bound)
        ax.scatter(df[lon_col], df[lat_col], alpha=alpha, c='r', s=s)
        ax.set_xlim((bound[0], bound[1]))
        ax.set_ylim((bound[2], bound[3]))
        ax.set_title(title)


In [None]:
# Download the background map. Recent matplotlib releases refuse URLs in plt.imread,
# so fetch the bytes explicitly and hand imread an in-memory file instead.
nyc_map_url = 'https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png'
with urllib.request.urlopen(nyc_map_url, timeout=30) as response:
    nyc_map = plt.imread(io.BytesIO(response.read()), format='png')

# Bounding box of the (already filtered) trip data.
bound = [westlimit, eastlimit, southlimit, northlimit]

# The image itself spans a larger area, encoded in its file name as
# west_east_south_north. It has to be drawn over THAT extent; stretching it over
# `bound` would place the stations on the wrong part of the map.
map_bound = [-74.5, -72.8, 40.5, 41.8]
plot_on_map(df_bike, map_bound, nyc_map)

# plot_on_map uses one box for both image extent and axis limits, so zoom the two
# panels back to the data bounding box afterwards.
for ax in plt.gcf().axes:
    ax.set_xlim(westlimit, eastlimit)
    ax.set_ylim(southlimit, northlimit)

In [None]:
# Zoomed-in view: [x_min, x_max, y_min, y_max] for the second, locally stored map image.
# NOTE(review): the image path is specific to the author's machine, and the image is
# presumably rendered for exactly this box — confirm both before re-running.
bound_zoom = (-74.2, -73.79, 40.65, 40.83)
zoom_map_path = '/Users/dingranlu/Desktop/nyc_map.png'
nyc_map_zoom = plt.imread(zoom_map_path)
plot_on_map(df_bike, bound_zoom, nyc_map_zoom, s=1)

# Distance and time

In [None]:
# function for calculating distance
# This function is acquired online from:
# https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
def distance(lat1, lon1, lat2, lon2, radius=6373):
    """Great-circle (haversine) distance between two points given in degrees.

    Works element-wise on scalars, NumPy arrays and pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
    radius : sphere radius; the result is expressed in the same unit.
        The default, 6373, is the Earth radius in kilometres, so the default
        result is in kilometres (not miles).

    Returns
    -------
    Distance along the sphere surface, in the unit of ``radius``.
    """
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = (np.sin(dlat/2))**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2))**2
    # Mathematically 0 <= a <= 1, but rounding can overshoot by one ulp for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [None]:
# NOTE(review): `df` is not assigned in any cell visible above (only `df_bike` is).
# Confirm which frame this analysis is meant to run on before a Restart & Run All.

# distance() works with an Earth radius in kilometres, while every label and comment in
# this notebook talks about miles, so convert once here.
KM_PER_MILE = 1.609344

# add a new column for distance calculated (in miles)
df['distance'] = distance(df['start station latitude'], df['start station longitude'], 
                          df['end station latitude'], df['end station longitude']) / KM_PER_MILE

# add a new column age
df['age'] = 2018 - df['birth year']

# add a new column age range
df['age range'] = np.where(df['age'] < 20, 'younger than 20', 
                 np.where(df['age'] <= 40, 'between 20 and 40',
                 np.where(df['age'] > 40, 'older than 40', '')))

# add new column - specified gender
df['gender_specified'] = np.where(df['gender'] == 0, 'unknown', 
                 np.where(df['gender'] == 1, 'male',
                 np.where(df['gender'] == 2, 'female', '')))

# drop if distance = 0 but tripduration > 0
# (start and end coordinates are identical although the trip took time)
df.drop(df[(df['distance'] == 0) & (df['tripduration'] > 0)].index, inplace = True)


# histogram of distance without grouping
df.distance.hist(bins=80, figsize=(15,5))
plt.xlabel('Distance')
plt.title('Histogram of trip distances in miles')
df.distance.describe()

<li>The histogram above shows that bike trips are primarily short-distance trips.</li>

In [None]:
# histogram of distance with grouping of gender
fig, ax = plt.subplots(figsize=(15, 5))
# Draw one labelled histogram per group so that each legend entry belongs to the bars it
# names. A hand-written label list does not follow the (alphabetical) groupby order.
for gender, trip_distances in df.groupby('gender_specified')['distance']:
    trip_distances.hist(bins=80, ax=ax, alpha=0.6, label=gender)
ax.set_xlabel('Distance')
ax.set_title('Histogram of trip distances in miles grouped by gender')
ax.legend()
df.groupby('gender_specified').distance.describe()

In [None]:
# Mean trip distance and duration per gender group.
# Several columns must be selected from a groupby with a list; the bare
# ['distance', 'tripduration'] form is rejected by pandas 2.x.
gender_means = df.groupby('gender_specified')[['distance', 'tripduration']].mean()
# calculate the average speed for different gender groups (distance unit per minute)
# assumes tripduration is recorded in seconds — TODO confirm
speed = gender_means['distance'] / (gender_means['tripduration'] / 60)
# Show the means together with the derived speed.
gender_means.assign(speed=speed)

In [None]:
# histogram of distance with grouping of age
fig, ax = plt.subplots(figsize=(15, 5))
# Draw one labelled histogram per group so that each legend entry belongs to the bars it
# names. A hand-written label list does not follow the (alphabetical) groupby order.
for age_range, trip_distances in df.groupby('age range')['distance']:
    trip_distances.hist(bins=80, ax=ax, alpha=0.6, label=age_range)
ax.set_xlabel('Distance')
ax.set_title('Histogram of trip distances in miles grouped by age range')
ax.legend()
df.groupby('age range').distance.describe()

In [None]:
# Mean trip distance and duration per age range.
# Several columns must be selected from a groupby with a list; the bare
# ['distance', 'tripduration'] form is rejected by pandas 2.x.
age_means = df.groupby('age range')[['distance', 'tripduration']].mean()
# calculates average speed for different age groups (distance unit per minute)
# assumes tripduration is recorded in seconds — TODO confirm
speed = age_means['distance'] / (age_means['tripduration'] / 60)
# Show the means together with the derived speed.
age_means.assign(speed=speed)

In [None]:
# histogram of distance with grouping of usertype
fig, ax = plt.subplots(figsize=(15, 5))
# Draw one labelled histogram per group so that each legend entry belongs to the bars it
# names, with no reliance on a hand-written label list.
for usertype, trip_distances in df.groupby('usertype')['distance']:
    trip_distances.hist(bins=80, ax=ax, alpha=0.6, label=usertype)
ax.set_xlabel('Distance')
ax.set_title('Histogram of trip distances in miles grouped by user type')
ax.legend()
df.groupby('usertype').distance.describe()

In [None]:
# Mean trip distance and duration per user type.
# Several columns must be selected from a groupby with a list; the bare
# ['distance', 'tripduration'] form is rejected by pandas 2.x.
usertype_means = df.groupby('usertype')[['distance', 'tripduration']].mean()
# calculates average speed for different user types (distance unit per minute)
# assumes tripduration is recorded in seconds — TODO confirm
speed = usertype_means['distance'] / (usertype_means['tripduration'] / 60)
# Show the means together with the derived speed.
usertype_means.assign(speed=speed)

In [None]:
# scatter plot distance - trip duration (grouped by gender)
fig, axs = plt.subplots(1, 3, figsize=(20,6))

# (value in 'gender_specified', panel title) for each of the three panels
gender_panels = [('male', 'Male'), ('female', 'Female'), ('unknown', 'Unknown')]

for ax, (gender, title) in zip(axs, gender_panels):
    trips = df[df['gender_specified'] == gender]  # filter once per panel
    ax.scatter(trips.distance, trips.tripduration, alpha = 0.4, c = 'g')
    ax.set_xlabel('distance in mile')
    ax.set_ylabel('trip duration')
    ax.set_title(title)

In [None]:
# scatter plot distance - trip duration (grouped by age range)
fig, axs = plt.subplots(1, 3, figsize=(20,6))

# (value in 'age range', panel title) for each of the three panels
age_panels = [
    ('younger than 20', 'Younger than 20'),
    ('between 20 and 40', 'Between 20 and 40'),
    ('older than 40', 'Older than 40'),
]

for ax, (age_range, title) in zip(axs, age_panels):
    trips = df[df['age range'] == age_range]  # filter once per panel
    ax.scatter(trips.distance, trips.tripduration, alpha = 0.4, c = 'g')
    ax.set_xlabel('distance in mile')
    ax.set_ylabel('trip duration')
    ax.set_title(title)

In [None]:
# scatter plot distance - trip duration (grouped by user type)
fig, axs = plt.subplots(1, 2, figsize=(15,6))

# The 'usertype' value doubles as the panel title.
for ax, usertype in zip(axs, ('Customer', 'Subscriber')):
    trips = df[df['usertype'] == usertype]  # filter once per panel
    ax.scatter(trips.distance, trips.tripduration, alpha = 0.4, c = 'g')
    ax.set_xlabel('distance in mile')
    ax.set_ylabel('trip duration')
    ax.set_title(usertype)

# Trips by hour on weekdays and weekends

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import datetime

In [None]:
#formatting data
# pd.to_datetime parses the whole column at once and accepts values that are already
# datetimes, so re-running this cell does not fail the way a per-row strptime would.
df['starttime'] = pd.to_datetime(df['starttime'])
df['day_of_week'] = df['starttime'].dt.weekday + 1 #1-7 representing Mon to Sun
df['pickup_hour'] = df['starttime'].dt.hour
df['pickup_year'] = df['starttime'].dt.year
df['pickup_month'] = df['starttime'].dt.month

# Saturday (6) or Sunday (7). A day can never be both, so the test must be "either".
is_weekend = df['day_of_week'].isin([6, 7])

# One axes per chart; without an explicit `ax` all three bar charts land on the same axes.
fig, axs = plt.subplots(3, 1, figsize=(12, 15))

#pick up density by months
group_month = df.groupby('pickup_month')
group_month.size().plot(kind='bar', title="Trips by months", ax=axs[0])

#pick up density by hour on weekdays
df_weekday = df[~is_weekend]
group_weekday = df_weekday.groupby('pickup_hour')  # the hour column created above
group_weekday.size().plot(kind='bar', title="pick up trips by hour on weekday", ax=axs[1])

#pick up density by hour on weekends
df_weekend = df[is_weekend]
group_weekend = df_weekend.groupby('pickup_hour')
group_weekend.size().plot(kind='bar', title="pick up trips by hour on weekend", ax=axs[2])

fig.tight_layout()

# Pick-up density in Manhattan and the outer boroughs

In [None]:
def if_Manhattan_station(latitude,longitude):
    """Tell whether a single point lies inside a rough outline of Manhattan.

    Manhattan is approximated by five axis-aligned rectangles, chosen with
    reference to the bike distribution shown on https://member.citibikenyc.com/map/.
    Rectangle edges count as inside.

    Parameters
    ----------
    latitude, longitude : coordinates of one point (scalars), in degrees.

    Returns
    -------
    1 if the point falls in at least one rectangle, otherwise 0.
    """
    # Each entry: ((lat_min, lat_max), (lon_min, lon_max))
    manhattan_boxes = (
        ((40.6997, 40.7080), (-74.0200, -74.0003)),
        ((40.7501, 40.8445), (-73.9704, -73.9550)),
        ((40.7080, 40.8162), (-74.0200, -73.9704)),
        ((40.7641, 40.8445), (-73.9550, -73.9418)),
        ((40.7836, 40.8445), (-73.9418, -73.9273)),
    )

    for (lat_min, lat_max), (lon_min, lon_max) in manhattan_boxes:
        if lat_min <= latitude <= lat_max and lon_min <= longitude <= lon_max:
            return 1 # point within Manhattan

    return 0 # point out of Manhattan
