# Big Query

In [1]:
# Importamos
from google.cloud import bigquery

In [2]:
from google.oauth2 import service_account

In [3]:
import pandas as pd

In [4]:
# DB
from sqlalchemy import create_engine

# Data analysis and wrangling
import numpy as np
import random as rnd

# Visualization
import seaborn as sns
from scipy.stats import norm, skew
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

# Warnings and other tools
import itertools
import warnings
warnings.filterwarnings("ignore")

In [5]:
!ls ~/.creds

trim-heaven-301811-7b40bce8b0f8.json


In [6]:
# creamos las variables de entorno
creds = service_account.Credentials.from_service_account_file("/Users/victoriasofia93/.creds/trim-heaven-301811-7b40bce8b0f8.json")

In [7]:
proj_id = "trim-heaven-301811"

In [8]:
# Llamamos la función Client
client = bigquery.Client(project=proj_id, credentials=creds)

In [15]:
# Testing
query = (
    'SELECT * FROM `bigquery-public-data.samples.natality` '
    'LIMIT 50'
        )

In [49]:
query_nacimientos = ("""
SELECT 
    state AS estado,
    SUM(CASE WHEN year >= 1970 AND year < 1980 THEN plurality ELSE 0 END) AS B70,
    SUM(CASE WHEN year >= 1980 AND year < 1990 THEN plurality ELSE 0 END) AS B80,
    SUM(CASE WHEN year >= 1990 AND year < 2000 THEN plurality ELSE 0 END) AS B90,
    SUM(CASE WHEN year >= 2000 AND year <=2010 THEN plurality  ELSE 0 END ) AS B00,
    SUM(IF(is_male = TRUE, 1, 0)) AS Male,
    SUM(IF(is_male = FALSE, 1, 0)) AS Female,
    ROUND(AVG(weight_pounds * 0.453592),3) AS Weight
    

FROM `bigquery-public-data.samples.natality`
WHERE 
    NOT IS_NAN(plurality) AND plurality >1 AND state is NOT NULL 
GROUP BY 
    1
ORDER BY 
    1

""")

In [50]:
query_job_nacimientos = client.query(query_nacimientos)

In [51]:
query_job_nacimientos.state

'RUNNING'

In [80]:
df_nacimientos = query_job_nacimientos.to_dataframe()
df_nacimientos

Unnamed: 0,estado,B70,B80,B90,B00,Male,Female,Weight
0,AK,1529,4529,4971,2714,3391,3396,2.464
1,AL,14866,26254,33540,21033,23479,23407,2.277
2,AR,5703,14453,16504,10678,11598,11751,2.318
3,AZ,6523,17201,36109,25765,20775,20761,2.346
4,CA,56624,147762,274371,159740,156282,157163,2.405
5,CO,12704,23079,32601,22479,22324,22024,2.292
6,CT,6887,19684,30100,18537,18463,18173,2.384
7,DC,4105,7863,12071,6088,7373,7232,2.258
8,DE,1517,3398,7047,4769,4082,4052,2.302
9,FL,35721,67570,101555,65804,66529,66131,2.324


In [16]:
query

'SELECT * FROM `bigquery-public-data.samples.natality` LIMIT 50'

In [17]:
query_job = client.query(query)

In [18]:
# para visualizar el estado de ejecución: query_job.state
query_job.state

'RUNNING'

In [19]:
rows = query_job.result()
for row in rows:
    print(row)

Row((2005, 2005, 7, None, 3, None, False, None, 8.62889293468, 1, None, 9, None, 78, 34, 41, '09262004', True, None, False, None, False, None, 57, 9, 0, 0, 10, 78, 38, 1), {'source_year': 0, 'year': 1, 'month': 2, 'day': 3, 'wday': 4, 'state': 5, 'is_male': 6, 'child_race': 7, 'weight_pounds': 8, 'plurality': 9, 'apgar_1min': 10, 'apgar_5min': 11, 'mother_residence_state': 12, 'mother_race': 13, 'mother_age': 14, 'gestation_weeks': 15, 'lmp': 16, 'mother_married': 17, 'mother_birth_state': 18, 'cigarette_use': 19, 'cigarettes_per_day': 20, 'alcohol_use': 21, 'drinks_per_week': 22, 'weight_gain_pounds': 23, 'born_alive_alive': 24, 'born_alive_dead': 25, 'born_dead': 26, 'ever_born': 27, 'father_race': 28, 'father_age': 29, 'record_weight': 30})
Row((2005, 2005, 4, None, 6, None, True, None, 2.6786164833, 1, None, 6, None, 78, 36, 34, '09012004', True, None, False, None, False, None, 23, 7, 0, 0, 8, 78, 39, 1), {'source_year': 0, 'year': 1, 'month': 2, 'day': 3, 'wday': 4, 'state': 5, 'i

In [20]:
# Convertimos los datos de filas a dataframe
df = query_job.to_dataframe()
df

Unnamed: 0,source_year,year,month,day,wday,state,is_male,child_race,weight_pounds,plurality,...,alcohol_use,drinks_per_week,weight_gain_pounds,born_alive_alive,born_alive_dead,born_dead,ever_born,father_race,father_age,record_weight
0,2005,2005,7,,3.0,,False,,8.628893,1.0,...,False,,57.0,9.0,0.0,0.0,10,78,38,1
1,2005,2005,4,,6.0,,True,,2.678616,1.0,...,False,,23.0,7.0,0.0,0.0,8,78,39,1
2,2006,2006,5,,1.0,,True,,11.062796,1.0,...,False,,11.0,,,,8,68,41,1
3,2007,2007,3,,2.0,,False,,5.436599,2.0,...,False,,10.0,,,,8,78,42,1
4,2007,2007,4,,7.0,,False,,3.560466,1.0,...,False,,18.0,,,,8,78,43,1
5,2007,2007,4,,1.0,,True,,5.998778,1.0,...,False,,20.0,,,,8,78,42,1
6,2008,2008,1,,4.0,,True,,9.186662,1.0,...,True,,37.0,,,,8,78,36,1
7,1998,1998,1,,7.0,AK,False,9.0,4.874421,1.0,...,True,99.0,17.0,5.0,2.0,1.0,8,7,27,1
8,2003,2003,12,,6.0,AR,False,,8.000575,1.0,...,False,,13.0,10.0,0.0,0.0,11,78,39,1
9,1969,1969,6,23.0,,CA,True,7.0,7.438397,,...,,,,8.0,1.0,2.0,12,7,47,2


In [23]:
df.dtypes

source_year                 int64
year                        int64
month                       int64
day                       float64
wday                      float64
state                      object
is_male                      bool
child_race                float64
weight_pounds             float64
plurality                 float64
apgar_1min                float64
apgar_5min                float64
mother_residence_state     object
mother_race                 int64
mother_age                  int64
gestation_weeks           float64
lmp                        object
mother_married             object
mother_birth_state         object
cigarette_use              object
cigarettes_per_day        float64
alcohol_use                object
drinks_per_week           float64
weight_gain_pounds        float64
born_alive_alive          float64
born_alive_dead           float64
born_dead                 float64
ever_born                   int64
father_race                 int64
father_age    

In [24]:
df.describe(include=[np.number])

Unnamed: 0,source_year,year,month,day,wday,child_race,weight_pounds,plurality,apgar_1min,apgar_5min,...,cigarettes_per_day,drinks_per_week,weight_gain_pounds,born_alive_alive,born_alive_dead,born_dead,ever_born,father_race,father_age,record_weight
count,50.0,50.0,50.0,23.0,27.0,35.0,50.0,47.0,27.0,38.0,...,1.0,1.0,26.0,45.0,45.0,45.0,50.0,50.0,50.0,50.0
mean,1991.28,1991.28,6.52,19.565217,4.703704,6.857143,7.218287,1.106383,61.925926,37.210526,...,5.0,99.0,50.576923,7.088889,1.555556,2.044444,9.08,26.54,38.52,1.24
std,12.361956,12.361956,3.369975,8.156571,1.917828,1.647509,1.708543,0.311661,45.567301,42.54483,...,,,38.543662,2.466339,8.178563,8.218481,1.639313,30.331643,11.65182,0.431419
min,1969.0,1969.0,1.0,4.0,1.0,4.0,2.678616,1.0,6.0,6.0,...,5.0,99.0,10.0,1.0,0.0,0.0,8.0,4.0,23.0,1.0
25%,1983.25,1983.25,4.0,14.0,4.0,6.0,6.413247,1.0,8.0,9.0,...,5.0,99.0,16.25,6.0,0.0,0.0,8.0,6.0,31.0,1.0
50%,1994.0,1994.0,6.0,22.0,5.0,7.0,7.275255,1.0,99.0,9.0,...,5.0,99.0,34.5,7.0,0.0,0.0,8.0,7.0,39.0,1.0
75%,2003.0,2003.0,9.0,26.0,6.0,9.0,8.359929,1.0,99.0,99.0,...,5.0,99.0,99.0,9.0,0.0,1.0,10.0,63.0,41.75,1.0
max,2008.0,2008.0,12.0,30.0,7.0,9.0,11.062796,2.0,99.0,99.0,...,5.0,99.0,99.0,14.0,55.0,55.0,15.0,78.0,99.0,2.0


In [25]:
df.columns

Index(['source_year', 'year', 'month', 'day', 'wday', 'state', 'is_male',
       'child_race', 'weight_pounds', 'plurality', 'apgar_1min', 'apgar_5min',
       'mother_residence_state', 'mother_race', 'mother_age',
       'gestation_weeks', 'lmp', 'mother_married', 'mother_birth_state',
       'cigarette_use', 'cigarettes_per_day', 'alcohol_use', 'drinks_per_week',
       'weight_gain_pounds', 'born_alive_alive', 'born_alive_dead',
       'born_dead', 'ever_born', 'father_race', 'father_age', 'record_weight'],
      dtype='object')

In [32]:
df.groupby(["year","born_alive_alive"])["year"].count()
#df[(df.year == 1970) & (df.born_alive_alive == '')]

print(df.source_year.unique())

[2005 2006 2007 2008 1998 2003 1969 1971 1974 1981 1984 1988 1989 1995
 2004 1993 1978 1983 1986 1987 2002 1999 2001]


In [53]:
query_2 = """

SELECT
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM `bigquery-public-data.samples.natality`
WHERE
    state IS NOT NULL AND
    child_race IS NOT NULL AND
    year >= 1970 AND year < 1980
GROUP BY
    estado, raza
ORDER BY
    estado



"""

In [54]:
query_job_2 = client.query(query_2)

In [55]:
df2 = query_job_2.to_dataframe()
df2

Unnamed: 0,estado,counter,raza
0,AK,69,Hawaiian
1,AK,11614,American Indian
2,AK,70,Chinese
3,AK,36396,White
4,AK,384,Filipino
...,...,...,...
403,WY,11,Hawaiian
404,WY,36996,White
405,WY,463,Black
406,WY,1247,American Indian


In [56]:
query_3 = """
WITH Race70 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1970 AND year < 1980
GROUP BY 
    estado, raza
)
SELECT 
    r70.estado, 
    r70.raza
FROM (
    SELECT 
    ARRAY_AGG(Race70 ORDER BY Race70.counter DESC LIMIT 1)[OFFSET(0)] AS r70
    FROM Race70
    GROUP BY Race70.estado 
    ORDER BY Race70.estado
)
"""

In [57]:
query_job_3 = client.query(query_3)

In [58]:
df2 = query_job_3.to_dataframe()
df2

Unnamed: 0,estado,raza
0,AK,White
1,AL,White
2,AR,White
3,AZ,White
4,CA,White
5,CO,White
6,CT,White
7,DC,Black
8,DE,White
9,FL,White


In [59]:
query_u = """
WITH Race70 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1970 AND year < 1980
GROUP BY 
    estado, raza
),
Race80 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1980 AND year < 1990
GROUP BY 
    estado, raza
),
Race90 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1990 AND year < 2000
GROUP BY 
    estado, raza
),
Race00 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 2000 AND year < 2010
GROUP BY 
    estado, raza
)
SELECT 
    r70.estado, 
    r70.raza,
    r80.raza,
    r90.raza,
    r00.raza
FROM (
    SELECT 
    ARRAY_AGG(Race70 ORDER BY Race70.counter DESC LIMIT 1)[OFFSET(0)] AS r70,
    ARRAY_AGG(Race80 ORDER BY Race80.counter DESC LIMIT 1)[OFFSET(0)] AS r80,
    ARRAY_AGG(Race90 ORDER BY Race90.counter DESC LIMIT 1)[OFFSET(0)] AS r90,
    ARRAY_AGG(Race00 ORDER BY Race00.counter DESC LIMIT 1)[OFFSET(0)] AS r00
    FROM Race70,Race80,Race90,Race00
    WHERE Race70.estado = Race80.estado
    GROUP BY Race70.estado 
    ORDER BY Race70.estado)

"""

In [60]:
query_job_u = client.query(query_u)

In [61]:
dfu = query_job_u.to_dataframe()
dfu

Unnamed: 0,estado,raza,raza_1,raza_2,raza_3
0,AK,White,White,Italiano/other...,Italiano/other...
1,AL,White,White,Italiano/other...,Italiano/other...
2,AR,White,White,Italiano/other...,Italiano/other...
3,AZ,White,White,Italiano/other...,Italiano/other...
4,CA,White,White,Italiano/other...,Italiano/other...
5,CO,White,White,Italiano/other...,Italiano/other...
6,CT,White,White,Italiano/other...,Italiano/other...
7,DC,Black,Black,Italiano/other...,Italiano/other...
8,DE,White,White,Italiano/other...,Italiano/other...
9,FL,White,White,Italiano/other...,Italiano/other...


In [62]:
query_razas2 = """
    WITH Race70 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1970 AND year < 1980
GROUP BY 
    estado, raza
),
Race80 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1980 AND year < 1990
GROUP BY 
    estado, raza
),
Race90 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 1990 AND year < 2000
GROUP BY 
    estado, raza
),
Race00 AS (
SELECT 
    state AS  estado,
    COUNT(*) AS counter,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE 
    state IS NOT NULL AND 
    child_race IS NOT NULL AND 
    year >= 2000
GROUP BY 
    estado, raza
)
SELECT 
    r70.estado, 
    r70.raza,
    r80.raza,
    r90.raza,
    r00.raza
FROM (
    SELECT 
    ARRAY_AGG(Race70 ORDER BY Race70.counter DESC LIMIT 1)[OFFSET(0)] AS r70,
    ARRAY_AGG(Race80 ORDER BY Race80.counter DESC LIMIT 1)[OFFSET(0)] AS r80,
    ARRAY_AGG(Race90 ORDER BY Race90.counter DESC LIMIT 1)[OFFSET(0)] AS r90,
    ARRAY_AGG(Race00 ORDER BY Race00.counter DESC LIMIT 1)[OFFSET(0)] AS r00
    FROM Race70,Race80,Race90,Race00
    WHERE Race70.estado = Race80.estado
    GROUP BY Race70.estado 
    ORDER BY Race70.estado
)
"""

In [78]:
query_act = """
WITH Race70 AS (
SELECT
    state AS  estado70,
    COUNT(*) AS counter70,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza70
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE
    state IS NOT NULL AND
    child_race IS NOT NULL AND
    year >= 1970 AND year < 1980
GROUP BY
    estado70, raza70
),
Race80 AS(
SELECT
    state AS  estado80,
    COUNT(*) AS counter80,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza80
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE
    state IS NOT NULL AND
    child_race IS NOT NULL AND
    year >= 1980 AND year < 1990
GROUP BY
    estado80, raza80
),
Race90 AS (
SELECT
    state AS  estado90,
    COUNT(*) AS counter90,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza90
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE
    state IS NOT NULL AND
    child_race IS NOT NULL AND
    year >= 1990 AND year < 2000
GROUP BY
    estado90, raza90
),
Race00 AS (
SELECT
    state AS  estado00,
    COUNT(*) AS counter00,
    (CASE
        WHEN child_race = 1 THEN "White"
        WHEN child_race = 2 THEN "Black"
        WHEN child_race = 3 THEN "American Indian"
        WHEN child_race = 4 THEN "Chinese"
        WHEN child_race = 5 THEN "Japanese"
        WHEN child_race = 6 THEN "Hawaiian"
        WHEN child_race = 7 THEN "Filipino"
        WHEN child_race = 18 THEN "Asian Indian"
        WHEN child_race = 28 THEN "Korean"
        WHEN child_race = 39 THEN "Samoan"
        WHEN child_race = 48 THEN "Vietnamese"
        ELSE "Italiano/other..."
        END) as raza00
FROM (
    SELECT year, state, child_race
    FROM `bigquery-public-data.samples.natality`
    )
WHERE
    state IS NOT NULL AND
    child_race IS NOT NULL AND
    year >= 2000 AND year < 2010
GROUP BY
    estado00, raza00
)
SELECT
    r70.estado70,
    r70.raza70,
    r80.raza80,
    r90.raza90,
    r00.raza00
FROM (
    SELECT
    ARRAY_AGG(Race70 ORDER BY Race70.counter70 DESC LIMIT 1)[OFFSET(0)] AS r70,
    ARRAY_AGG(Race80 ORDER BY Race80.counter80 DESC LIMIT 1)[OFFSET(0)] AS r80,
    ARRAY_AGG(Race90 ORDER BY Race90.counter90 DESC LIMIT 1)[OFFSET(0)] AS r90,
    ARRAY_AGG(Race00 ORDER BY Race00.counter00 DESC LIMIT 1)[OFFSET(0)] AS r00
    FROM Race70, Race80, Race90, Race00
    WHERE Race70.estado70 = Race80.estado80
    GROUP BY Race70.estado70
    ORDER BY Race70.estado70
)
"""

In [63]:
query_job_razas = client.query(query_razas2)

In [64]:
dfrazas = query_job_razas.to_dataframe()
dfrazas

Unnamed: 0,estado,raza,raza_1,raza_2,raza_3
0,AK,White,White,Italiano/other...,Italiano/other...
1,AL,White,White,Italiano/other...,Italiano/other...
2,AR,White,White,Italiano/other...,Italiano/other...
3,AZ,White,White,Italiano/other...,Italiano/other...
4,CA,White,White,Italiano/other...,Italiano/other...
5,CO,White,White,Italiano/other...,Italiano/other...
6,CT,White,White,Italiano/other...,Italiano/other...
7,DC,Black,Black,Italiano/other...,Italiano/other...
8,DE,White,White,Italiano/other...,Italiano/other...
9,FL,White,White,Italiano/other...,Italiano/other...


In [68]:
df3 = pd.merge(left=df_nacimientos, right=dfu, left_on='estado', right_on='estado')
df3

Unnamed: 0,estado,B70,B80,B90,B00,Male,Female,Weight,raza,raza_1,raza_2,raza_3
0,AK,1529,4529,4971,2714,3391,3396,2.464,White,White,Italiano/other...,Italiano/other...
1,AL,14866,26254,33540,21033,23479,23407,2.277,White,White,Italiano/other...,Italiano/other...
2,AR,5703,14453,16504,10678,11598,11751,2.318,White,White,Italiano/other...,Italiano/other...
3,AZ,6523,17201,36109,25765,20775,20761,2.346,White,White,Italiano/other...,Italiano/other...
4,CA,56624,147762,274371,159740,156282,157163,2.405,White,White,Italiano/other...,Italiano/other...
5,CO,12704,23079,32601,22479,22324,22024,2.292,White,White,Italiano/other...,Italiano/other...
6,CT,6887,19684,30100,18537,18463,18173,2.384,White,White,Italiano/other...,Italiano/other...
7,DC,4105,7863,12071,6088,7373,7232,2.258,Black,Black,Italiano/other...,Italiano/other...
8,DE,1517,3398,7047,4769,4082,4052,2.302,White,White,Italiano/other...,Italiano/other...
9,FL,35721,67570,101555,65804,66529,66131,2.324,White,White,Italiano/other...,Italiano/other...


In [76]:
%%time
pruebamerge = pd.merge(df_nacimientos, dfu)
pruebamerge

CPU times: user 6.25 ms, sys: 1.09 ms, total: 7.34 ms
Wall time: 6.63 ms


Unnamed: 0,estado,B70,B80,B90,B00,Male,Female,Weight,raza,raza_1,raza_2,raza_3
0,AK,1529,4529,4971,2714,3391,3396,2.464,White,White,Italiano/other...,Italiano/other...
1,AL,14866,26254,33540,21033,23479,23407,2.277,White,White,Italiano/other...,Italiano/other...
2,AR,5703,14453,16504,10678,11598,11751,2.318,White,White,Italiano/other...,Italiano/other...
3,AZ,6523,17201,36109,25765,20775,20761,2.346,White,White,Italiano/other...,Italiano/other...
4,CA,56624,147762,274371,159740,156282,157163,2.405,White,White,Italiano/other...,Italiano/other...
5,CO,12704,23079,32601,22479,22324,22024,2.292,White,White,Italiano/other...,Italiano/other...
6,CT,6887,19684,30100,18537,18463,18173,2.384,White,White,Italiano/other...,Italiano/other...
7,DC,4105,7863,12071,6088,7373,7232,2.258,Black,Black,Italiano/other...,Italiano/other...
8,DE,1517,3398,7047,4769,4082,4052,2.302,White,White,Italiano/other...,Italiano/other...
9,FL,35721,67570,101555,65804,66529,66131,2.324,White,White,Italiano/other...,Italiano/other...


In [77]:
%%time
pruebajoin = df_nacimientos.join(dfu.set_index('estado'), on='estado')
pruebajoin

CPU times: user 3.87 ms, sys: 401 µs, total: 4.27 ms
Wall time: 4.14 ms


Unnamed: 0,estado,B70,B80,B90,B00,Male,Female,Weight,raza,raza_1,raza_2,raza_3
0,AK,1529,4529,4971,2714,3391,3396,2.464,White,White,Italiano/other...,Italiano/other...
1,AL,14866,26254,33540,21033,23479,23407,2.277,White,White,Italiano/other...,Italiano/other...
2,AR,5703,14453,16504,10678,11598,11751,2.318,White,White,Italiano/other...,Italiano/other...
3,AZ,6523,17201,36109,25765,20775,20761,2.346,White,White,Italiano/other...,Italiano/other...
4,CA,56624,147762,274371,159740,156282,157163,2.405,White,White,Italiano/other...,Italiano/other...
5,CO,12704,23079,32601,22479,22324,22024,2.292,White,White,Italiano/other...,Italiano/other...
6,CT,6887,19684,30100,18537,18463,18173,2.384,White,White,Italiano/other...,Italiano/other...
7,DC,4105,7863,12071,6088,7373,7232,2.258,Black,Black,Italiano/other...,Italiano/other...
8,DE,1517,3398,7047,4769,4082,4052,2.302,White,White,Italiano/other...,Italiano/other...
9,FL,35721,67570,101555,65804,66529,66131,2.324,White,White,Italiano/other...,Italiano/other...


In [75]:
%%time
pruebaconcat = pd.concat([df_nacimientos, dfu], axis=1)
pruebaconcat

CPU times: user 1.15 ms, sys: 160 µs, total: 1.31 ms
Wall time: 1.22 ms


Unnamed: 0,estado,B70,B80,B90,B00,Male,Female,Weight,estado.1,raza,raza_1,raza_2,raza_3
0,AK,1529,4529,4971,2714,3391,3396,2.464,AK,White,White,Italiano/other...,Italiano/other...
1,AL,14866,26254,33540,21033,23479,23407,2.277,AL,White,White,Italiano/other...,Italiano/other...
2,AR,5703,14453,16504,10678,11598,11751,2.318,AR,White,White,Italiano/other...,Italiano/other...
3,AZ,6523,17201,36109,25765,20775,20761,2.346,AZ,White,White,Italiano/other...,Italiano/other...
4,CA,56624,147762,274371,159740,156282,157163,2.405,CA,White,White,Italiano/other...,Italiano/other...
5,CO,12704,23079,32601,22479,22324,22024,2.292,CO,White,White,Italiano/other...,Italiano/other...
6,CT,6887,19684,30100,18537,18463,18173,2.384,CT,White,White,Italiano/other...,Italiano/other...
7,DC,4105,7863,12071,6088,7373,7232,2.258,DC,Black,Black,Italiano/other...,Italiano/other...
8,DE,1517,3398,7047,4769,4082,4052,2.302,DE,White,White,Italiano/other...,Italiano/other...
9,FL,35721,67570,101555,65804,66529,66131,2.324,FL,White,White,Italiano/other...,Italiano/other...


In [81]:
!pip install gcsfs

Collecting gcsfs
  Downloading gcsfs-0.7.1-py2.py3-none-any.whl (20 kB)
Collecting fsspec>=0.8.0
  Downloading fsspec-0.8.5-py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 2.0 MB/s eta 0:00:01
[?25hCollecting aiohttp
  Downloading aiohttp-3.7.3.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 3.1 MB/s eta 0:00:01     |█████████▊                      | 337 kB 3.1 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-0.4.2-py2.py3-none-any.whl (18 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.1.0.tar.gz (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[