In [65]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os

In [66]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [67]:
# Point the Google client libraries at the local key file, but only when the
# environment does not already provide a credentials path, so the notebook
# also runs on machines that configure credentials externally.
# NOTE(review): keep this key file out of version control.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "sunny-footing-321404-513454d4be38.json")

In [68]:
client = bigquery.Client()

# Keep only trips whose rider gender is recorded as 'Male' or 'Female'.
# The WHERE clause previously ended in a dangling `AND`, which is a SQL syntax
# error; it has been removed so the query matches the saved output below.
# TODO: restrict to trips from 2017 onwards once the NY query gets the same filter.
query = """
    SELECT member_gender FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
    WHERE (member_gender = 'Female' OR member_gender = 'Male')
"""

query_job = client.query(query)

In [69]:
# Materialise the SF gender query as a DataFrame, then count rows per distinct value.
sf_genders = query_job.to_dataframe()
sf_genders.value_counts()

member_gender
Male             651771
Female           199202
dtype: int64

In [70]:
client = bigquery.Client()

# Keep only NY trips whose rider gender is recorded as 'male' or 'female'
# (lower-case here, unlike the capitalised labels used in the SF table).
# The backslash ending the FROM line is a Python line continuation inside the
# string literal: FROM and WHERE end up on a single line of SQL, which is harmless.
query = """
    SELECT gender FROM `bigquery-public-data.new_york_citibike.citibike_trips`\
    WHERE (gender = 'male' OR gender = 'female')
"""

# Submit the query; the rows are read into pandas in the next cell.
query_job = client.query(query) 

In [71]:
# Materialise the NY gender query as a DataFrame, then count rows per distinct value.
ny_genders = query_job.to_dataframe()
ny_genders.value_counts()

gender
male      35611787
female    11376412
dtype: int64

In [74]:
# Trips per gender label in each city. The two tables spell the labels
# differently ('Male'/'Female' vs 'male'/'female'), so normalise the case and
# align rows by label rather than trusting value_counts() to return the same
# order for both cities.
sf_gender_counts = sf_genders['member_gender'].str.lower().value_counts()
ny_gender_counts = ny_genders['gender'].str.lower().value_counts()

data = (
    pd.DataFrame({'SF': sf_gender_counts, 'NY': ny_gender_counts})
    .reindex(['male', 'female'])  # fixed row order, matched by label
    .rename_axis(None)            # keep the index unnamed, as before
)
data

Unnamed: 0,SF,NY
male,651771,35611787
female,199202,11376412


In [79]:
import numpy as np

# Difference in population proportions: female share of trips in SF minus NY.
# TODO: both gender queries should be restricted to trips from 2017 onwards.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_female_SF = data.at['female', 'SF'] / n_SF
p_female_NY = data.at['female', 'NY'] / n_NY

p_hat = p_female_SF - p_female_NY
# Unpooled standard error of the difference between the two proportions.
s_e = np.sqrt(
    p_female_SF * (1 - p_female_SF) / n_SF
    + p_female_NY * (1 - p_female_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [80]:
CI
# The whole interval is below zero in the saved output, so the female share of
# trips (among trips recorded as male or female) is lower in SF than in NY.

(-0.008932732772209101, -0.0071168210741792415)

In [84]:
# map year to generation
def map_year_to_generation(year, missing_label='Before Silent'):
    """Map a birth year to a generation label.

    Parameters
    ----------
    year : number or missing value
        Birth year. ``None``, ``NaN`` and ``pd.NA`` are treated as missing.
    missing_label : str, default 'Before Silent'
        Label returned for a missing birth year. The default keeps the
        notebook's existing behaviour, where missing years fall into the
        catch-all 'Before Silent' bucket; pass e.g. ``'Unknown'`` to keep
        missing data separate from riders genuinely born before 1928.

    Returns
    -------
    str
        One of 'Silent', 'Baby Boomers', 'Generation X', 'Millennials',
        'Generation Z', 'Before Silent', or ``missing_label``.
    """
    # Handle missing values explicitly: comparisons against NaN are all False
    # (silently reaching the final branch) and comparisons against None raise.
    if pd.isna(year):
        return missing_label

    if 1928 <= year <= 1945:
        generation = 'Silent'
    elif 1946 <= year <= 1964:
        generation = 'Baby Boomers'
    elif 1965 <= year <= 1980:
        generation = 'Generation X'
    elif 1981 <= year <= 1996:
        generation = 'Millennials'
    elif 1997 <= year:
        generation = 'Generation Z'
    else:
        # Anything earlier than 1928 (or between the integer ranges above).
        generation = 'Before Silent'
    return generation

In [85]:
client = bigquery.Client()

# Fetch the rider birth year of every SF trip; there is no WHERE clause, so
# rows with a missing birth year are included.
query = """
    SELECT member_birth_year FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
"""

# Submit the query; the rows are read into pandas in the next cell.
query_job = client.query(query) 

In [86]:
# Read the SF birth years into pandas and tag every trip with a generation label.
sf_generations = query_job.to_dataframe().assign(
    generation=lambda trips: trips['member_birth_year'].map(map_year_to_generation)
)
sf_generations

Unnamed: 0,member_birth_year,generation
0,1998.0,Generation Z
1,1997.0,Generation Z
2,1966.0,Generation X
3,1947.0,Baby Boomers
4,1962.0,Baby Boomers
...,...,...
1947414,1995.0,Millennials
1947415,1995.0,Millennials
1947416,1995.0,Millennials
1947417,1995.0,Millennials


In [97]:
# SF trips per generation label, ordered alphabetically by label.
sf_generations['generation'].value_counts().sort_index()

Baby Boomers       68635
Before Silent    1085554
Generation X      259685
Generation Z       12827
Millennials       518330
Silent              2388
Name: generation, dtype: int64

In [89]:
client = bigquery.Client()

# Fetch the rider birth year of every NY trip; there is no WHERE clause, so
# rows with a missing birth year are included.
query = """
    SELECT birth_year FROM `bigquery-public-data.new_york_citibike.citibike_trips`
"""

# Submit the query; the rows are read into pandas in the next cell.
query_job = client.query(query) 

In [91]:
# Read the NY birth years into pandas and tag every trip with a generation label.
ny_generations = query_job.to_dataframe().assign(
    generation=lambda trips: trips['birth_year'].map(map_year_to_generation)
)
ny_generations

Unnamed: 0,birth_year,generation
0,1979.0,Generation X
1,1979.0,Generation X
2,1979.0,Generation X
3,1979.0,Generation X
4,1979.0,Generation X
...,...,...
58937710,,Before Silent
58937711,,Before Silent
58937712,,Before Silent
58937713,,Before Silent


In [96]:
# NY trips per generation label, ordered alphabetically by label.
ny_generations['generation'].value_counts().sort_index()

Baby Boomers      7246214
Before Silent    11430023
Generation X     17087299
Generation Z       426128
Millennials      22515467
Silent             232584
Name: generation, dtype: int64

In [100]:
# Trips per generation in each city, aligned by generation label. Building the
# table from the labelled counts (instead of positional lists plus a hard-coded
# index) keeps rows correct even if a generation is absent in one city.
sf_generation_counts = sf_generations['generation'].value_counts()
ny_generation_counts = ny_generations['generation'].value_counts()

data = (
    pd.DataFrame({'SF': sf_generation_counts, 'NY': ny_generation_counts})
    .fillna(0)          # a generation missing from one city has zero trips there
    .astype('int64')
    # 'Before Silent' also holds trips with a missing birth year, so drop it.
    .drop(index=['Before Silent'], errors='ignore')
    .sort_index()       # alphabetical row order, as before
    .rename_axis(None)  # keep the index unnamed, as before
)
data

Unnamed: 0,SF,NY
Baby Boomers,68635,7246214
Generation X,259685,17087299
Generation Z,12827,426128
Millennials,518330,22515467
Silent,2388,232584


In [101]:
# Baby Boomer share of trips in each city, among the generations kept in `data`.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_boomers_SF = data.at['Baby Boomers', 'SF'] / n_SF
p_boomers_NY = data.at['Baby Boomers', 'NY'] / n_NY

# Point estimate (SF - NY) and unpooled standard error of the difference.
p_hat = p_boomers_SF - p_boomers_NY
s_e = np.sqrt(
    p_boomers_SF * (1 - p_boomers_SF) / n_SF
    + p_boomers_NY * (1 - p_boomers_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [102]:
# Show the interval for the Baby Boomer share difference (SF - NY) from the cell above.
CI

(-0.07347237408639716, -0.07231109025323175)

In [103]:
# Generation X share of trips in each city, among the generations kept in `data`.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_X_SF = data.at['Generation X', 'SF'] / n_SF
p_X_NY = data.at['Generation X', 'NY'] / n_NY

# Point estimate (SF - NY) and unpooled standard error of the difference.
p_hat = p_X_SF - p_X_NY
s_e = np.sqrt(
    p_X_SF * (1 - p_X_SF) / n_SF
    + p_X_NY * (1 - p_X_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [104]:
# Show the interval for the Generation X share difference (SF - NY) from the cell above.
CI

(-0.05934673021477546, -0.057390223829162895)

In [105]:
# Generation Z share of trips in each city, among the generations kept in `data`.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_Z_SF = data.at['Generation Z', 'SF'] / n_SF
p_Z_NY = data.at['Generation Z', 'NY'] / n_NY

# Point estimate (SF - NY) and unpooled standard error of the difference.
p_hat = p_Z_SF - p_Z_NY
s_e = np.sqrt(
    p_Z_SF * (1 - p_Z_SF) / n_SF
    + p_Z_NY * (1 - p_Z_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [106]:
# Show the interval for the Generation Z share difference (SF - NY) from the cell above.
CI

(0.005656139176355604, 0.006170216843080394)

In [107]:
# Millennial share of trips in each city, among the generations kept in `data`.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_millennial_SF = data.at['Millennials', 'SF'] / n_SF
p_millennial_NY = data.at['Millennials', 'NY'] / n_NY

# Point estimate (SF - NY) and unpooled standard error of the difference.
p_hat = p_millennial_SF - p_millennial_NY
s_e = np.sqrt(
    p_millennial_SF * (1 - p_millennial_SF) / n_SF
    + p_millennial_NY * (1 - p_millennial_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [108]:
# Show the interval for the Millennial share difference (SF - NY) from the cell above.
CI

(0.12642862231291668, 0.12851539387303185)

In [109]:
# Silent Generation share of trips in each city, among the generations kept in `data`.
n_SF = data['SF'].sum()
n_NY = data['NY'].sum()
p_silent_SF = data.at['Silent', 'SF'] / n_SF
p_silent_NY = data.at['Silent', 'NY'] / n_NY

# Point estimate (SF - NY) and unpooled standard error of the difference.
p_hat = p_silent_SF - p_silent_NY
s_e = np.sqrt(
    p_silent_SF * (1 - p_silent_SF) / n_SF
    + p_silent_NY * (1 - p_silent_NY) / n_NY
)
# 1.96 is the normal critical value for a two-sided 95% interval.
CI = (p_hat - 1.96 * s_e, p_hat + 1.96 * s_e)

In [110]:
# Show the interval for the Silent Generation share difference (SF - NY) from the cell above.
CI

(-0.00223771444984091, -0.0020122393719762613)