In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os
import datetime
from statsmodels.stats.weightstats import ttest_ind
import scipy
from geopy.distance import geodesic

In [2]:
%load_ext google.cloud.bigquery

In [3]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="sunny-footing-321404-513454d4be38.json"

# Difference in mean duration of rides between male and female riders: SF

In [4]:
client = bigquery.Client()

query = """
    SELECT member_gender, duration_sec FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

query_job = client.query(query)

In [5]:
sf_times = query_job.to_dataframe()
sf_times['duration_minutes'] = sf_times['duration_sec'].apply(lambda x: x / 60)
sf_times

Unnamed: 0,member_gender,duration_sec,duration_minutes
0,Male,327,5.450000
1,Male,291,4.850000
2,Male,224,3.733333
3,Male,467,7.783333
4,Male,231,3.850000
...,...,...,...
850968,Female,1256,20.933333
850969,Female,1697,28.283333
850970,Female,2459,40.983333
850971,Female,66,1.100000


In [6]:
sf_female_durations = sf_times[sf_times['member_gender'] == 'Female']['duration_minutes']
sf_male_durations = sf_times[sf_times['member_gender'] == 'Male']['duration_minutes']

diff_of_means = np.mean(sf_female_durations) - np.mean(sf_male_durations)
s_e = np.sqrt((np.std(sf_female_durations)**2/len(sf_female_durations)) + (np.std(sf_male_durations)**2/len(sf_male_durations)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_female_durations), len(sf_male_durations)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(3.323700586184976, 3.7622115313919093)

The results are significant. There is enough evidence to say that the average duration of female riders in san franscisco is more than the average duration of male riders in san franscisco.

# Difference in mean duration of rides between male and female riders: NY

In [7]:
client = bigquery.Client()

query = """
    SELECT gender, tripduration FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [8]:
ny_times = query_job.to_dataframe()
ny_times['duration_minutes'] = ny_times['tripduration'].apply(lambda x: x / 60)
ny_times

Unnamed: 0,gender,tripduration,duration_minutes
0,male,393,6.550000
1,male,627,10.450000
2,male,1393,23.216667
3,male,329,5.483333
4,male,630,10.500000
...,...,...,...
17760096,female,3304,55.066667
17760097,female,291,4.850000
17760098,female,1575,26.250000
17760099,female,336,5.600000


In [9]:
ny_female_durations = ny_times[ny_times['gender'] == 'female']['duration_minutes']
ny_male_durations = ny_times[ny_times['gender'] == 'male']['duration_minutes']

diff_of_means = np.mean(ny_female_durations) - np.mean(ny_male_durations)
s_e = np.sqrt((np.std(ny_female_durations)**2/len(ny_female_durations)) + (np.std(ny_male_durations)**2/len(ny_male_durations)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(ny_female_durations), len(ny_male_durations)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(2.341880859287621, 3.0669868558311992)

The results are significant. There is enough evidence to say that the average duration of female riders in NYC is more than the average duration of male riders in NYC.

# Difference in mean duration of rides of female riders between SF and NY

In [10]:
diff_of_means = np.mean(sf_female_durations) - np.mean(ny_female_durations)
s_e = np.sqrt((np.std(sf_female_durations)**2/len(sf_female_durations)) + (np.std(ny_female_durations)**2/len(ny_female_durations)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_female_durations), len(ny_female_durations)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(-1.5882716236002303, -0.819142272112591)

The results are significant. There is enough evidence to say that the average duration of female riders in SF is less than the average duration of female riders in NYC.

# Difference in mean duration of rides of male riders between SF and NY

In [11]:
diff_of_means = np.mean(sf_male_durations) - np.mean(ny_male_durations)
s_e = np.sqrt((np.std(sf_male_durations)**2/len(sf_male_durations)) + (np.std(ny_male_durations)**2/len(ny_male_durations)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_male_durations), len(ny_male_durations)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(-2.220074330926659, -1.864383967244228)

The results are significant. There is enough evidence to say that the average duration of male riders in SF is less than the average duration of male riders in NYC.

# Difference in mean distance of rides between male and female riders: SF

Note that this is finding shortest distance between two points, we do not know the actual distance traveled by the bike.

In [12]:
client = bigquery.Client()

query = """
    SELECT member_gender, start_station_latitude, start_station_longitude, end_station_latitude, end_station_longitude
    FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

query_job = client.query(query)

In [13]:
sf_distances = query_job.to_dataframe()
distances = []
for start_lat, start_long, end_lat, end_long in zip(sf_distances['start_station_latitude'], 
                                                    sf_distances['start_station_longitude'],
                                                    sf_distances['end_station_latitude'],
                                                    sf_distances['end_station_longitude']):
    start = (start_lat, start_long)
    end = (end_lat, end_long)
    distances.append(geodesic(start, end).mi)
sf_distances['distances'] = distances
sf_distances

Unnamed: 0,member_gender,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,distances
0,Male,37.763316,-122.421904,37.769305,-122.426826,0.493197
1,Male,37.844279,-122.251900,37.851376,-122.252523,0.490634
2,Male,37.755213,-122.420975,37.747858,-122.424986,0.552716
3,Male,37.342725,-121.895617,37.335885,-121.885660,0.723249
4,Male,37.841800,-122.251535,37.851376,-122.252523,0.662621
...,...,...,...,...,...,...
850968,Male,37.771058,-122.402717,37.786375,-122.404904,1.063157
850969,Female,37.771058,-122.402717,37.786375,-122.404904,1.063157
850970,Male,37.771058,-122.402717,37.786375,-122.404904,1.063157
850971,Male,37.771058,-122.402717,37.786375,-122.404904,1.063157


In [14]:
sf_female_distances = sf_distances[sf_distances['member_gender'] == 'Female']['distances']
sf_male_distances = sf_distances[sf_distances['member_gender'] == 'Male']['distances']

diff_of_means = np.mean(sf_female_distances) - np.mean(sf_male_distances)
s_e = np.sqrt((np.std(sf_female_distances)**2/len(sf_female_distances)) + (np.std(sf_male_distances)**2/len(sf_male_distances)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_female_distances), len(sf_male_distances)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(0.05625778200163203, 0.06263193845889634)

The results are significant. There is enough evidence to say that the average distance of female riders in SF is more than the average distance of male riders in SF.

# Difference in mean distance of rides between male and female riders: NY

In [15]:
client = bigquery.Client()

query = """
    SELECT gender, start_station_latitude, start_station_longitude, end_station_latitude, end_station_longitude
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [16]:
ny_distances = query_job.to_dataframe()
distances = []
for start_lat, start_long, end_lat, end_long in zip(ny_distances['start_station_latitude'], 
                                                    ny_distances['start_station_longitude'],
                                                    ny_distances['end_station_latitude'],
                                                    ny_distances['end_station_longitude']):
    start = (start_lat, start_long)
    end = (end_lat, end_long)
    distances.append(geodesic(start, end).mi)
ny_distances['distances'] = distances
ny_distances

Unnamed: 0,gender,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,distances
0,male,40.765909,-73.976342,40.769155,-73.981918,0.368457
1,male,40.765909,-73.976342,40.765849,-73.986905,0.554219
2,male,40.765909,-73.976342,40.779668,-73.980930,0.979437
3,male,40.765909,-73.976342,40.775794,-73.976206,0.682090
4,male,40.765909,-73.976342,40.758997,-73.968654,0.624670
...,...,...,...,...,...,...
17760096,female,40.736502,-73.978095,40.741776,-74.001497,1.281065
17760097,female,40.724677,-73.987834,40.750977,-73.987654,1.814785
17760098,female,40.711512,-74.015756,40.708347,-74.017134,0.230075
17760099,female,40.663779,-73.983968,40.692418,-73.989495,1.997374


In [17]:
ny_female_distances = ny_distances[ny_distances['gender'] == 'female']['distances']
ny_male_distances = ny_distances[ny_distances['gender'] == 'male']['distances']

diff_of_means = np.mean(ny_female_distances) - np.mean(ny_male_distances)
s_e = np.sqrt((np.std(ny_female_distances)**2/len(ny_female_distances)) + (np.std(ny_male_distances)**2/len(ny_male_distances)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(ny_female_distances), len(ny_male_distances)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(0.04324355093191864, 0.060142900122590194)

The results are significant. There is enough evidence to say that the average distance of female riders in NY is more than the average distance of male riders in NY.

# Difference in mean distance of rides of female riders between SF and NY

In [18]:
diff_of_means = np.mean(sf_female_distances) - np.mean(ny_female_distances)
s_e = np.sqrt((np.std(sf_female_distances)**2/len(sf_female_distances)) + (np.std(ny_female_distances)**2/len(ny_female_distances)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_female_distances), len(ny_female_distances)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(-0.1414286670498046, -0.12616126672949113)

The results are significant. There is enough evidence to say that the average distance of female riders in SF is less than the average distance of female riders in NY.

# Difference in mean distance of rides of male riders between SF and NY

In [19]:
diff_of_means = np.mean(sf_male_distances) - np.mean(ny_male_distances)
s_e = np.sqrt((np.std(sf_male_distances)**2/len(sf_male_distances)) + (np.std(ny_male_distances)**2/len(ny_male_distances)))
t_star = scipy.stats.t.ppf(q=1-.05/2,df=min(len(sf_male_distances), len(ny_male_distances)))
CI = (diff_of_means - t_star*s_e, diff_of_means + t_star*s_e)
CI

(-0.14637171537566968, -0.1367214878096456)

The results are significant. There is enough evidence to say that the average distance of male riders in SF is less than the average distance of male riders in NY.

# Difference in proportion of female riders between cities

In [20]:
client = bigquery.Client()

query = """
    SELECT member_gender FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [21]:
sf_genders = query_job.to_dataframe()
sf_genders.value_counts()

member_gender
Male             651771
Female           199202
dtype: int64

In [22]:
client = bigquery.Client()

query = """
    SELECT gender FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [23]:
ny_genders = query_job.to_dataframe()
ny_genders.value_counts()

gender
male      13262473
female     4497628
dtype: int64

In [24]:
data = pd.DataFrame(columns = ['SF', 'NY'])
data['SF'] = list(sf_genders.value_counts())
data['NY'] = list(ny_genders.value_counts())
data.index = ['male', 'female']
data

Unnamed: 0,SF,NY
male,651771,13262473
female,199202,4497628


In [25]:
import numpy as np
# population proportion differences (proportion of females in SF and NY)
# just realized should be filtered by 2017 and after
p_female_SF = data.loc['female', 'SF'] / data['SF'].sum()
p_female_NY = data.loc['female', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_female_SF - p_female_NY
s_e = np.sqrt((p_female_SF * (1 - p_female_SF) / n_SF) + (p_female_NY * (1 - p_female_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [26]:
CI

(-0.020078157118831246, -0.01823393622132637)

The results are significant. There is enough evidence to say that the proportion of female riders in san franscisco is less than the proportion of female riders in New York.

# Difference in proportion of generations between cities

In [27]:
# map year to generation
def map_year_to_generation(year):
    if 1928 <= year <= 1945:
        generation = 'Silent'
    elif 1946 <= year <= 1964:
        generation = 'Baby Boomers'
    elif 1965 <= year <= 1980:
        generation = 'Generation X'
    elif 1981 <= year <= 1996:
        generation = 'Millennials'
    elif 1997 <= year:
        generation = 'Generation Z'
    else:
        generation = 'Before Silent'
    return generation

In [28]:
client = bigquery.Client()

query = """
    SELECT member_birth_year FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE start_date >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [29]:
sf_generations = query_job.to_dataframe()
sf_generations['generation'] = sf_generations['member_birth_year'].apply(lambda x: map_year_to_generation(x))
sf_generations

Unnamed: 0,member_birth_year,generation
0,1957.0,Baby Boomers
1,1998.0,Generation Z
2,1951.0,Baby Boomers
3,1998.0,Generation Z
4,1943.0,Silent
...,...,...
963766,1997.0,Generation Z
963767,1997.0,Generation Z
963768,1997.0,Generation Z
963769,1997.0,Generation Z


In [30]:
sf_generations['generation'].value_counts().sort_index()

Baby Boomers      68635
Before Silent    101906
Generation X     259685
Generation Z      12827
Millennials      518330
Silent             2388
Name: generation, dtype: int64

In [31]:
client = bigquery.Client()

query = """
    SELECT birth_year FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE starttime >= '2017-01-01 00:00:00'
"""

query_job = client.query(query) 

In [32]:
ny_generations = query_job.to_dataframe()
ny_generations['generation'] = ny_generations['birth_year'].apply(lambda x: map_year_to_generation(x))
ny_generations

Unnamed: 0,birth_year,generation
0,1976.0,Generation X
1,1977.0,Generation X
2,1977.0,Generation X
3,1977.0,Generation X
4,1977.0,Generation X
...,...,...
19789697,1976.0,Generation X
19789698,1976.0,Generation X
19789699,1976.0,Generation X
19789700,1976.0,Generation X


In [33]:
ny_generations['generation'].value_counts().sort_index()

Baby Boomers     2423551
Before Silent    1531801
Generation X     6106357
Generation Z      309690
Millennials      9350028
Silent             68275
Name: generation, dtype: int64

In [34]:
# get rid of birth years before the silent generation to get rid of unrealistic data
data = pd.DataFrame(columns = ['SF', 'NY'])
data['SF'] = list(sf_generations['generation'].value_counts().sort_index())
data['NY'] = list(ny_generations['generation'].value_counts().sort_index())
data.index = ['Baby Boomers', 'Before Silent', 'Generation X', 'Generation Z', 'Millennials', 'Silent']
data = data.drop(index = ['Before Silent'])
data

Unnamed: 0,SF,NY
Baby Boomers,68635,2423551
Generation X,259685,6106357
Generation Z,12827,309690
Millennials,518330,9350028
Silent,2388,68275


In [35]:
data_proportions = data
data_proportions['SF'] = data_proportions['SF']/data_proportions['SF'].sum()
data_proportions['NY'] = data_proportions['NY']/data_proportions['NY'].sum()
data_proportions

Unnamed: 0,SF,NY
Baby Boomers,0.079635,0.13274
Generation X,0.301306,0.33445
Generation Z,0.014883,0.016962
Millennials,0.601405,0.512109
Silent,0.002771,0.003739


# Boomers

In [36]:
p_boomers_SF = data.loc['Baby Boomers', 'SF'] / data['SF'].sum()
p_boomers_NY = data.loc['Baby Boomers', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_boomers_SF - p_boomers_NY
s_e = np.sqrt((p_boomers_SF * (1 - p_boomers_SF) / n_SF) + (p_boomers_NY * (1 - p_boomers_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [37]:
CI

(-0.9038745030213239, 0.7976656909000309)

# Gen X

In [38]:
p_X_SF = data.loc['Generation X', 'SF'] / data['SF'].sum()
p_X_NY = data.loc['Generation X', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_X_SF - p_X_NY
s_e = np.sqrt((p_X_SF * (1 - p_X_SF) / n_SF) + (p_X_NY * (1 - p_X_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [39]:
CI

(-1.323047433622754, 1.2567589913305182)

# Generation Z

In [40]:
p_Z_SF = data.loc['Generation Z', 'SF'] / data['SF'].sum()
p_Z_NY = data.loc['Generation Z', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_Z_SF - p_Z_NY
s_e = np.sqrt((p_Z_SF * (1 - p_Z_SF) / n_SF) + (p_Z_NY * (1 - p_Z_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [41]:
CI

(-0.34903570147060814, 0.34487744063804704)

# Millennials

In [42]:
p_millennial_SF = data.loc['Millennials', 'SF'] / data['SF'].sum()
p_millennial_NY = data.loc['Millennials', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_millennial_SF - p_millennial_NY
s_e = np.sqrt((p_millennial_SF * (1 - p_millennial_SF) / n_SF) + (p_millennial_NY * (1 - p_millennial_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [43]:
CI

(-1.2821019139578087, 1.4606949119978714)

# Silent

In [44]:
p_silent_SF = data.loc['Silent', 'SF'] / data['SF'].sum()
p_silent_NY = data.loc['Silent', 'NY'] / data['NY'].sum()
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()

p_hat = p_silent_SF - p_silent_NY
s_e = np.sqrt((p_silent_SF * (1 - p_silent_SF) / n_SF) + (p_silent_NY * (1 - p_silent_NY) / n_NY))
CI = (p_hat - 1.96*s_e, p_hat + 1.96*s_e)

In [45]:
CI

(-0.1588497749958343, 0.1569122922018617)