In [134]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os
import datetime
from statsmodels.stats.weightstats import ttest_ind
import scipy
from geopy.distance import geodesic

In [49]:
# Load the google.cloud.bigquery IPython extension. None of the cells visible in this
# notebook use its magics; the queries below go through bigquery.Client() instead.
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [50]:
# Tell the Google client libraries which service-account key file to use.
# The path is relative, so the key file must sit in the notebook's working directory.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="sunny-footing-321404-513454d4be38.json")

# Difference in mean duration of rides between male and female riders: SF

In [118]:
# Query the public San Francisco bikeshare trips table for rider gender and trip
# duration (seconds), keeping only trips whose gender is 'Female' or 'Male' and
# whose start_date is on or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT member_gender, duration_sec FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query)

In [120]:
# Materialize the SF query result and add the trip duration in minutes.
# Plain column division is vectorized; it replaces a per-row Python lambda
# and produces the same values.
sf_times = query_job.to_dataframe()
sf_times['duration_minutes'] = sf_times['duration_sec'] / 60
sf_times

Unnamed: 0,member_gender,duration_sec,duration_minutes
0,Male,833,13.883333
1,Male,314,5.233333
2,Male,803,13.383333
3,Male,711,11.850000
4,Male,2268,37.800000
...,...,...,...
850968,Female,94,1.566667
850969,Female,637,10.616667
850970,Female,1256,20.933333
850971,Female,115,1.916667


In [121]:
# Split SF trip durations (minutes) by rider gender.
sf_female_durations = sf_times.loc[sf_times['member_gender'] == 'Female', 'duration_minutes']
sf_male_durations = sf_times.loc[sf_times['member_gender'] == 'Male', 'duration_minutes']

# 95% two-sample (unpooled) t interval for mean(female) - mean(male).
diff_of_means = sf_female_durations.mean() - sf_male_durations.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_female_durations.var(ddof=1) / len(sf_female_durations)
              + sf_male_durations.var(ddof=1) / len(sf_male_durations))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_female_durations), len(sf_male_durations)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

(3.3237005861848767, 3.76221153139181)

The results are significant: the 95% confidence interval excludes zero. There is enough evidence to say that the average ride duration of female riders in San Francisco is longer than the average ride duration of male riders in San Francisco.

# Difference in mean duration of rides between male and female riders: NY

In [124]:
# Query the public New York Citi Bike trips table for rider gender and trip
# duration, keeping only trips whose gender is 'male' or 'female' and whose
# starttime is on or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT gender, tripduration FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [125]:
# Materialize the NY query result and add the trip duration in minutes.
# Plain column division is vectorized; it replaces a per-row Python lambda
# (this frame has roughly 17.8 million rows) and produces the same values.
ny_times = query_job.to_dataframe()
ny_times['duration_minutes'] = ny_times['tripduration'] / 60
ny_times

Unnamed: 0,gender,tripduration,duration_minutes
0,male,302,5.033333
1,male,271,4.516667
2,male,489,8.150000
3,male,1249,20.816667
4,male,1398,23.300000
...,...,...,...
17760096,female,886,14.766667
17760097,female,189,3.150000
17760098,female,636,10.600000
17760099,female,773,12.883333


In [127]:
# Split NY trip durations (minutes) by rider gender.
ny_female_durations = ny_times.loc[ny_times['gender'] == 'female', 'duration_minutes']
ny_male_durations = ny_times.loc[ny_times['gender'] == 'male', 'duration_minutes']

# 95% two-sample (unpooled) t interval for mean(female) - mean(male).
diff_of_means = ny_female_durations.mean() - ny_male_durations.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(ny_female_durations.var(ddof=1) / len(ny_female_durations)
              + ny_male_durations.var(ddof=1) / len(ny_male_durations))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(ny_female_durations), len(ny_male_durations)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

(2.3418808592871545, 3.066986855830575)

The results are significant. There is enough evidence to say that the average duration of female riders in NYC is more than the average duration of male riders in NYC.

# Difference in mean duration of rides of female riders between SF and NY

In [128]:
# 95% two-sample (unpooled) t interval for mean(SF female) - mean(NY female) ride duration.
diff_of_means = sf_female_durations.mean() - ny_female_durations.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_female_durations.var(ddof=1) / len(sf_female_durations)
              + ny_female_durations.var(ddof=1) / len(ny_female_durations))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_female_durations), len(ny_female_durations)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

(-1.5882716235983667, -0.8191422721110475)

The results are significant. There is enough evidence to say that the average duration of female riders in SF is less than the average duration of female riders in NYC.

# Difference in mean duration of rides of male riders between SF and NY

In [129]:
# 95% two-sample (unpooled) t interval for mean(SF male) - mean(NY male) ride duration.
diff_of_means = sf_male_durations.mean() - ny_male_durations.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_male_durations.var(ddof=1) / len(sf_male_durations)
              + ny_male_durations.var(ddof=1) / len(ny_male_durations))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_male_durations), len(ny_male_durations)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

(-2.2200743309255864, -1.864383967242785)

The results are significant. There is enough evidence to say that the average duration of male riders in SF is less than the average duration of male riders in NYC.

# Difference in mean distance of rides between male and female riders: SF

Note that this is the shortest (geodesic) distance between the start and end stations; we do not know the actual distance traveled by the bike.

In [130]:
# Query the public San Francisco bikeshare trips table for rider gender and the
# start/end station coordinates, keeping only trips whose gender is 'Female' or
# 'Male' and whose start_date is on or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT member_gender, start_station_latitude, start_station_longitude, end_station_latitude, end_station_longitude
    FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query)

In [135]:
# Fetch the SF trips and compute the geodesic distance in miles between each trip's
# start and end stations.
sf_distances = query_job.to_dataframe()

# Many trips share the same pair of stations, so measure each distinct route once
# and reuse the result instead of calling geodesic() for every row.
route_miles = {}
distances = []
for route in zip(sf_distances['start_station_latitude'],
                 sf_distances['start_station_longitude'],
                 sf_distances['end_station_latitude'],
                 sf_distances['end_station_longitude']):
    if route not in route_miles:
        start_lat, start_long, end_lat, end_long = route
        route_miles[route] = geodesic((start_lat, start_long), (end_lat, end_long)).mi
    distances.append(route_miles[route])
sf_distances['distances'] = distances
sf_distances

Unnamed: 0,member_gender,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,distances
0,Male,37.765052,-122.421866,37.777791,-122.406432,1.218930
1,Female,37.797320,-122.265320,37.790140,-122.242373,1.349943
2,Male,37.819381,-122.261928,37.841800,-122.251535,1.647377
3,Male,37.787290,-122.394380,37.773717,-122.411647,1.330253
4,Male,37.800214,-122.253810,37.797320,-122.265320,0.660742
...,...,...,...,...,...,...
850968,Male,37.771058,-122.402717,37.752428,-122.420628,1.616354
850969,Male,37.771058,-122.402717,37.752428,-122.420628,1.616354
850970,Female,37.771058,-122.402717,37.752428,-122.420628,1.616354
850971,Male,37.771058,-122.402717,37.752428,-122.420628,1.616354


In [137]:
# Split SF station-to-station distances (miles) by rider gender.
sf_female_distances = sf_distances.loc[sf_distances['member_gender'] == 'Female', 'distances']
sf_male_distances = sf_distances.loc[sf_distances['member_gender'] == 'Male', 'distances']

# 95% two-sample (unpooled) t interval for mean(female) - mean(male).
diff_of_means = sf_female_distances.mean() - sf_male_distances.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_female_distances.var(ddof=1) / len(sf_female_distances)
              + sf_male_distances.var(ddof=1) / len(sf_male_distances))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_female_distances), len(sf_male_distances)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

(0.056257782001760355, 0.0626319384590247)

The results are significant. There is enough evidence to say that the average distance of female riders in SF is more than the average distance of male riders in SF.

# Difference in mean distance of rides between male and female riders: NY

In [139]:
# Query the public New York Citi Bike trips table for rider gender and the
# start/end station coordinates, keeping only trips whose gender is 'male' or
# 'female' and whose starttime is on or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT gender, start_station_latitude, start_station_longitude, end_station_latitude, end_station_longitude
    FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [None]:
# Fetch the NY trips and compute the geodesic distance in miles between each trip's
# start and end stations.
ny_distances = query_job.to_dataframe()

# Many trips share the same pair of stations, so measure each distinct route once
# and reuse the result instead of calling geodesic() for every row.
route_miles = {}
distances = []
for route in zip(ny_distances['start_station_latitude'],
                 ny_distances['start_station_longitude'],
                 ny_distances['end_station_latitude'],
                 ny_distances['end_station_longitude']):
    if route not in route_miles:
        start_lat, start_long, end_lat, end_long = route
        route_miles[route] = geodesic((start_lat, start_long), (end_lat, end_long)).mi
    distances.append(route_miles[route])
ny_distances['distances'] = distances
ny_distances

In [None]:
# Split NY station-to-station distances (miles) by rider gender.
ny_female_distances = ny_distances.loc[ny_distances['gender'] == 'female', 'distances']
ny_male_distances = ny_distances.loc[ny_distances['gender'] == 'male', 'distances']

# 95% two-sample (unpooled) t interval for mean(female) - mean(male).
diff_of_means = ny_female_distances.mean() - ny_male_distances.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(ny_female_distances.var(ddof=1) / len(ny_female_distances)
              + ny_male_distances.var(ddof=1) / len(ny_male_distances))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(ny_female_distances), len(ny_male_distances)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

# Difference in mean distance of rides of female riders between SF and NY

In [None]:
# 95% two-sample (unpooled) t interval for mean(SF female) - mean(NY female) ride distance.
diff_of_means = sf_female_distances.mean() - ny_female_distances.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_female_distances.var(ddof=1) / len(sf_female_distances)
              + ny_female_distances.var(ddof=1) / len(ny_female_distances))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_female_distances), len(ny_female_distances)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

# Difference in mean distance of rides of male riders between SF and NY

In [None]:
# 95% two-sample (unpooled) t interval for mean(SF male) - mean(NY male) ride distance.
diff_of_means = sf_male_distances.mean() - ny_male_distances.mean()
# Standard error from the *sample* variances (ddof=1); np.std would default to the
# population formula (ddof=0).
s_e = np.sqrt(sf_male_distances.var(ddof=1) / len(sf_male_distances)
              + ny_male_distances.var(ddof=1) / len(ny_male_distances))
# Conservative degrees of freedom: the smaller sample size minus one.
t_star = scipy.stats.t.ppf(q=1 - .05 / 2,
                           df=min(len(sf_male_distances), len(ny_male_distances)) - 1)
CI = (diff_of_means - t_star * s_e, diff_of_means + t_star * s_e)
CI

# Difference in proportion of female riders between cities

In [51]:
# Query the public San Francisco bikeshare trips table for rider gender only,
# keeping trips whose gender is 'Female' or 'Male' and whose start_date is on
# or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT member_gender FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male') AND start_date >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [52]:
# Fetch the SF gender column and count the trips recorded for each gender.
sf_genders = query_job.to_dataframe()
sf_genders.value_counts()

member_gender
Male             651771
Female           199202
dtype: int64

In [53]:
# Query the public New York Citi Bike trips table for rider gender only,
# keeping trips whose gender is 'male' or 'female' and whose starttime is on
# or after 2017-01-01.
client = bigquery.Client()

query = """
    SELECT gender FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE (gender = 'male' OR gender = 'female') AND starttime >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [54]:
# Fetch the NY gender column and count the trips recorded for each gender.
ny_genders = query_job.to_dataframe()
ny_genders.value_counts()

gender
male      13262473
female     4497628
dtype: int64

In [55]:
# Trip counts by gender for each city, one column per city.
# The counts are aligned on the (lower-cased) gender label rather than on the
# positional order value_counts() happens to return, so a city where women
# outnumber men cannot end up with its rows mislabelled.
sf_gender_counts = sf_genders['member_gender'].str.lower().value_counts()
ny_gender_counts = ny_genders['gender'].str.lower().value_counts()
data = (
    pd.DataFrame({'SF': sf_gender_counts, 'NY': ny_gender_counts})
    .loc[['male', 'female']]
    .rename_axis(None)
)
data

Unnamed: 0,SF,NY
male,651771,13262473
female,199202,4497628


In [56]:
import numpy as np
# 95% normal-approximation interval for the difference in the female share of
# trips between the two cities (SF minus NY). The column sums of `data` are the
# sample sizes, so `data` must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_female_SF = data.loc['female', 'SF'] / n_SF
p_female_NY = data.loc['female', 'NY'] / n_NY

p_hat = p_female_SF - p_female_NY
s_e = np.sqrt(
    (p_female_SF * (1 - p_female_SF) / n_SF)
    + (p_female_NY * (1 - p_female_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [57]:
# Show the interval for (female share in SF) - (female share in NY) computed in the previous cell.
CI

(-0.020078157118831246, -0.01823393622132637)

The results are significant. There is enough evidence to say that the proportion of female riders in San Francisco is less than the proportion of female riders in New York.

# Difference in proportion of generations between cities

In [58]:
def map_year_to_generation(year):
    """Return the generation label for a birth year.

    Inclusive ranges: Silent 1928-1945, Baby Boomers 1946-1964,
    Generation X 1965-1980, Millennials 1981-1996, Generation Z 1997 onward.
    Any value outside these ranges yields 'Before Silent'. That includes NaN,
    for which every range comparison is False.
    """
    brackets = (
        (1928, 1945, 'Silent'),
        (1946, 1964, 'Baby Boomers'),
        (1965, 1980, 'Generation X'),
        (1981, 1996, 'Millennials'),
        (1997, float('inf'), 'Generation Z'),  # open-ended upper bound
    )
    for first_year, last_year, label in brackets:
        if first_year <= year <= last_year:
            return label
    return 'Before Silent'

In [59]:
# Query the public San Francisco bikeshare trips table for the rider's birth year
# on every trip whose start_date is on or after 2017-01-01 (no gender filter here).
client = bigquery.Client()

query = """
    SELECT member_birth_year FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE start_date >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [60]:
# Fetch the SF birth years and tag every trip with the rider's generation label.
sf_generations = query_job.to_dataframe()
sf_generations['generation'] = sf_generations['member_birth_year'].map(map_year_to_generation)
sf_generations

Unnamed: 0,member_birth_year,generation
0,1955.0,Baby Boomers
1,1999.0,Generation Z
2,1952.0,Baby Boomers
3,1948.0,Baby Boomers
4,1954.0,Baby Boomers
...,...,...
963766,1997.0,Generation Z
963767,1997.0,Generation Z
963768,1997.0,Generation Z
963769,1997.0,Generation Z


In [61]:
# Number of SF trips per generation label, ordered alphabetically by label.
sf_generations['generation'].value_counts().sort_index()

Baby Boomers      68635
Before Silent    101906
Generation X     259685
Generation Z      12827
Millennials      518330
Silent             2388
Name: generation, dtype: int64

In [62]:
# Query the public New York Citi Bike trips table for the rider's birth year
# on every trip whose starttime is on or after 2017-01-01 (no gender filter here).
client = bigquery.Client()

query = """
    SELECT birth_year FROM `bigquery-public-data.new_york_citibike.citibike_trips`
    WHERE starttime >= '2017-01-01 00:00:00'
"""

# The result is turned into a DataFrame in the next cell via query_job.to_dataframe().
query_job = client.query(query) 

In [63]:
# Fetch the NY birth years and tag every trip with the rider's generation label.
ny_generations = query_job.to_dataframe()
ny_generations['generation'] = ny_generations['birth_year'].map(map_year_to_generation)
ny_generations

Unnamed: 0,birth_year,generation
0,1995.0,Millennials
1,1995.0,Millennials
2,1995.0,Millennials
3,1995.0,Millennials
4,1995.0,Millennials
...,...,...
19789697,1987.0,Millennials
19789698,1966.0,Generation X
19789699,1989.0,Millennials
19789700,1990.0,Millennials


In [64]:
# Number of NY trips per generation label, ordered alphabetically by label.
ny_generations['generation'].value_counts().sort_index()

Baby Boomers     2423551
Before Silent    1531801
Generation X     6106357
Generation Z      309690
Millennials      9350028
Silent             68275
Name: generation, dtype: int64

In [65]:
# Trip counts per generation for each city, one column per city.
# The two count series are aligned on the generation label instead of being pasted
# in positionally under a hard-coded index, so a category missing from one city
# becomes a 0 rather than a length error or a mislabelled row.
# 'Before Silent' is dropped to get rid of unrealistic birth years. Rows with a
# missing birth year also land in that bucket, because map_year_to_generation
# falls through to it when every range check is False (as it is for NaN).
sf_generation_counts = sf_generations['generation'].value_counts()
ny_generation_counts = ny_generations['generation'].value_counts()
data = (
    pd.DataFrame({'SF': sf_generation_counts, 'NY': ny_generation_counts})
    .fillna(0)
    .astype(int)
    .sort_index()
    .drop(index='Before Silent', errors='ignore')
    .rename_axis(None)
)
data

Unnamed: 0,SF,NY
Baby Boomers,68635,2423551
Generation X,259685,6106357
Generation Z,12827,309690
Millennials,518330,9350028
Silent,2388,68275


In [76]:
# Each generation's share of trips within its city (every column sums to 1).
# A new frame is built here on purpose. `data_proportions = data` would only alias
# `data`, so assigning the proportion columns would overwrite the raw counts.
# The interval cells below use data['SF'].sum() and data['NY'].sum() as sample
# sizes, so `data` has to keep holding counts.
data_proportions = data / data.sum()
data_proportions

Unnamed: 0,SF,NY
Baby Boomers,0.079635,0.13274
Generation X,0.301306,0.33445
Generation Z,0.014883,0.016962
Millennials,0.601405,0.512109
Silent,0.002771,0.003739


# Boomers

In [66]:
# 95% normal-approximation interval for the difference in Baby Boomer share of
# trips (SF minus NY). The column sums of `data` are the sample sizes, so `data`
# must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_boomers_SF = data.loc['Baby Boomers', 'SF'] / n_SF
p_boomers_NY = data.loc['Baby Boomers', 'NY'] / n_NY

p_hat = p_boomers_SF - p_boomers_NY
s_e = np.sqrt(
    (p_boomers_SF * (1 - p_boomers_SF) / n_SF)
    + (p_boomers_NY * (1 - p_boomers_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [67]:
# Show the interval for (Baby Boomer share in SF) - (Baby Boomer share in NY) computed in the previous cell.
CI

(-0.05369678653127372, -0.052512025590019294)

The results are significant. There is enough evidence to say that the proportion of Baby Boomer riders in San Francisco is less than the proportion of Baby Boomer riders in New York.

# Gen X

In [68]:
# 95% normal-approximation interval for the difference in Generation X share of
# trips (SF minus NY). The column sums of `data` are the sample sizes, so `data`
# must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_X_SF = data.loc['Generation X', 'SF'] / n_SF
p_X_NY = data.loc['Generation X', 'NY'] / n_NY

p_hat = p_X_SF - p_X_NY
s_e = np.sqrt(
    (p_X_SF * (1 - p_X_SF) / n_SF)
    + (p_X_NY * (1 - p_X_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [69]:
# Show the interval for (Generation X share in SF) - (Generation X share in NY) computed in the previous cell.
CI

(-0.034136789138819054, -0.032151653153416906)

The results are significant. There is enough evidence to say that the proportion of Generation X riders in San Francisco is less than the proportion of Generation X riders in New York.

# Generation Z

In [70]:
# 95% normal-approximation interval for the difference in Generation Z share of
# trips (SF minus NY). The column sums of `data` are the sample sizes, so `data`
# must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_Z_SF = data.loc['Generation Z', 'SF'] / n_SF
p_Z_NY = data.loc['Generation Z', 'NY'] / n_NY

p_hat = p_Z_SF - p_Z_NY
s_e = np.sqrt(
    (p_Z_SF * (1 - p_Z_SF) / n_SF)
    + (p_Z_NY * (1 - p_Z_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [71]:
# Show the interval for (Generation Z share in SF) - (Generation Z share in NY) computed in the previous cell.
CI

(-0.0023415395380374966, -0.001816721294523569)

The results are significant. There is enough evidence to say that the proportion of Generation Z riders in San Francisco is less than the proportion of Generation Z riders in New York.

# Millennials

In [72]:
# 95% normal-approximation interval for the difference in Millennial share of
# trips (SF minus NY). The column sums of `data` are the sample sizes, so `data`
# must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_millennial_SF = data.loc['Millennials', 'SF'] / n_SF
p_millennial_NY = data.loc['Millennials', 'NY'] / n_NY

p_hat = p_millennial_SF - p_millennial_NY
s_e = np.sqrt(
    (p_millennial_SF * (1 - p_millennial_SF) / n_SF)
    + (p_millennial_NY * (1 - p_millennial_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [73]:
# Show the interval for (Millennial share in SF) - (Millennial share in NY) computed in the previous cell.
CI

(0.08823769583141865, 0.09035530220864403)

The results are significant. There is enough evidence to say that the proportion of Millennial riders in San Francisco is more than the proportion of Millennial riders in New York.

# Silent

In [74]:
# 95% normal-approximation interval for the difference in Silent Generation share
# of trips (SF minus NY). The column sums of `data` are the sample sizes, so `data`
# must hold raw trip counts here.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_silent_SF = data.loc['Silent', 'SF'] / n_SF
p_silent_NY = data.loc['Silent', 'NY'] / n_NY

p_hat = p_silent_SF - p_silent_NY
s_e = np.sqrt(
    (p_silent_SF * (1 - p_silent_SF) / n_SF)
    + (p_silent_NY * (1 - p_silent_NY) / n_NY)
)
margin = 1.96 * s_e  # 1.96 is the two-sided 95% normal critical value
CI = (p_hat - margin, p_hat + margin)

In [75]:
# Show the interval for (Silent Generation share in SF) - (Silent Generation share in NY) computed in the previous cell.
CI

(-0.0010831952399298162, -0.0008542875540427901)

The results are significant. There is enough evidence to say that the proportion of Silent Generation riders in San Francisco is less than the proportion of Silent Generation riders in New York.