In [2]:
from google.cloud import bigquery as bq
import os
import pandas as pd
import pandas_gbq as pgbq
import util_functions as util
import matplotlib.pyplot as plt
import pprint

# Shared pretty-printer instance for ad-hoc inspection of nested structures.
pp = pprint.PrettyPrinter()

# Render matplotlib figures inline and load the BigQuery IPython extension,
# which provides the %%bigquery cell magic used further down.
%matplotlib inline
%load_ext google.cloud.bigquery

# Path to the service-account key file, exported through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable before any client is built.
# NOTE(review): the path is hard-coded and relative to the working directory —
# confirm the notebook is always launched from the repository root.
service_credentials = 'Service_Credentials/big-query-horse-play-f37757d450b8.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_credentials

# Client for the personal sandbox; no project is passed explicitly.
# NOTE(review): presumably the project comes from the credentials file — confirm.
horseplay_client = bq.Client()

# Client bound to the project that hosts the BigQuery public datasets.
bq_pub_client = bq.Client(project='bigquery-public-data')

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [None]:
%%bigquery birth_df

-- Query the public natality sample; the magic's argument names the variable
-- (`birth_df`) that the next cell reads.
-- One row per (year, mom_age) group, newest years first, capped at 5 rows.
-- NOTE(review): COUNT(cigarettes_per_day) counts the non-NULL records in each
-- group, so `cigs_per_day` holds a record count rather than a per-day amount —
-- confirm whether AVG(cigarettes_per_day) was intended.
SELECT
    source_year AS year,
    mother_age AS mom_age,
    COUNT(cigarettes_per_day) AS cigs_per_day
FROM `bigquery-public-data.samples.natality`
GROUP BY year, mom_age
ORDER BY year DESC
LIMIT 5

In [None]:
# Bar chart of the mean `cigs_per_day` value for each mother's age.
(
    birth_df[['mom_age', 'cigs_per_day']]
    .groupby('mom_age')
    .mean()
    .plot.bar()
)

In [None]:
# The same query as the %%bigquery cell, this time issued through the client library.
sql_query = """
SELECT
    source_year AS year,
    mother_age AS mom_age,
    COUNT(cigarettes_per_day) AS cigs_per_day
FROM `bigquery-public-data.samples.natality`
GROUP BY year, mom_age
ORDER BY year DESC
LIMIT 5
"""

# Submit the query as a job, then fetch that job's result.
query_job = horseplay_client.query(sql_query)
query_results = query_job.result()

# The result is not a pandas DataFrame yet; it still has to be converted.
# NOTE(review): check whether the installed client offers a direct DataFrame
# conversion on the job/result before hand-rolling one.

In [None]:
# Best of both worlds: pandas-gbq runs the SQL string and hands back a DataFrame
# directly, without needing the %%bigquery cell magic.
pgbq.read_gbq(sql_query, dialect="standard")

In [8]:
# Google Analytics public sample: total pageviews across the 2017-01-01 sessions table.
ga_query = """
SELECT SUM(totals.pageviews) as TotalPageviews
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170101`
"""

# Run it through pandas-gbq and display the resulting one-row frame.
pgbq.read_gbq(ga_query, dialect="standard")

Unnamed: 0,TotalPageviews
0,5362


In [None]:
# List the table IDs in the public Google Analytics sample dataset.
# The dataset reference is resolved inside this cell so that it also works on a
# fresh kernel (Restart & Run All); previously it relied on `ga_sample_db_ref`
# being left over from a cell that is defined further down the notebook.
ga_sample_db_ref = util.get_dataset(bq_pub_client, 'google_analytics_sample')
util.get_table_ids(bq_pub_client, ga_sample_db_ref)

In [3]:
# Resolve the public GA sample dataset with the public-data client.
ga_sample_db_ref = util.get_dataset(bq_pub_client, 'google_analytics_sample')
# Collect the IDs of the tables in that dataset.
ga_sample_table_ids = util.get_table_ids(bq_pub_client, ga_sample_db_ref)
# Fetch one table by position. In the recorded output index 365 resolved to
# ga_sessions_20170801.
# NOTE(review): the hard-coded index depends on how many tables the dataset
# holds and on listing order — confirm this is the intended table.
ga_sample_table = util.get_table(bq_pub_client, ga_sample_db_ref, ga_sample_table_ids[365])

---------------------------------------
Dataset ID: google_analytics_sample
Friendly Name: None
Full ID: bigquery-public-data:google_analytics_sample
Labels: {}
Project: bigquery-public-data
Ref: DatasetReference('bigquery-public-data', 'google_analytics_sample')
---------------------------------------
---------------------------------------
Table ID: ga_sessions_20170801
Friendly Name: None
Full ID: bigquery-public-data:google_analytics_sample.ga_sessions_20170801
Type: TABLE
Rows: 2556


In [13]:
# Display the table's schema (a list of SchemaField entries, including nested RECORD fields).
ga_sample_table.schema

[SchemaField('visitorId', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('visitNumber', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('visitId', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('visitStartTime', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('date', 'STRING', 'NULLABLE', None, ()),
 SchemaField('totals', 'RECORD', 'NULLABLE', None, (SchemaField('visits', 'INTEGER', 'NULLABLE', None, ()), SchemaField('hits', 'INTEGER', 'NULLABLE', None, ()), SchemaField('pageviews', 'INTEGER', 'NULLABLE', None, ()), SchemaField('timeOnSite', 'INTEGER', 'NULLABLE', None, ()), SchemaField('bounces', 'INTEGER', 'NULLABLE', None, ()), SchemaField('transactions', 'INTEGER', 'NULLABLE', None, ()), SchemaField('transactionRevenue', 'INTEGER', 'NULLABLE', None, ()), SchemaField('newVisits', 'INTEGER', 'NULLABLE', None, ()), SchemaField('screenviews', 'INTEGER', 'NULLABLE', None, ()), SchemaField('uniqueScreenviews', 'INTEGER', 'NULLABLE', None, ()), SchemaField('timeOnScreen', 'INTEGER', 'NULLAB

In [4]:
# Toss it over to BQ: create a dataset in the sandbox project to hold the GA query results.
# Note that 'ga_sample_table' is the name of a dataset here, not of a table.
# NOTE(review): behaviour when the dataset already exists depends on
# util.create_dataset — confirm this cell can be re-run safely.
util.create_dataset(horseplay_client, 'ga_sample_table')

'ga_sample_table created in project - big-query-horse-play'

In [6]:
# Collect the dataset IDs of the sandbox project; with log=True the helper also
# prints them (see the recorded output).
horseplay_db_ids = util.get_dataset_ids(horseplay_client, log=True)

Datasets in project - big-query-horse-play:
ga_sample_table
names_dataset
second_test


In [7]:
# Display the returned value itself: a plain list of dataset ID strings.
horseplay_db_ids

['ga_sample_table', 'names_dataset', 'second_test']

In [10]:
# Run the total-pageviews query (ga_query) again, this time keeping the
# DataFrame so that it can be uploaded.
test_df = pgbq.read_gbq(ga_query, dialect="standard")

In [12]:
# Upload the frame to <dataset>.<table> in the sandbox project.
# if_exists="replace" keeps this cell re-runnable: the default ("fail") raises
# as soon as the destination table exists, i.e. on every run after the first.
pgbq.to_gbq(test_df, "ga_sample_table.test_bq_ga_query", if_exists="replace")

In [24]:
# Per-device-category totals (visits, pageviews, new visits) for the
# 2017-01-01 Google Analytics sample sessions table.
test_bq_query = """
SELECT device.deviceCategory
, SUM(totals.visits) AS Visits
, SUM(totals.pageviews) AS PVs
, SUM(totals.newVisits) AS NewVisits
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170101`
GROUP BY device.deviceCategory
"""

In [26]:
# Run the per-device-category aggregate (test_bq_query) and keep the result
# as a DataFrame for the upload that follows.
test_df2 = pgbq.read_gbq(test_bq_query, dialect="standard")

In [27]:
# Upload the per-device aggregate to <dataset>.<table> in the sandbox project.
# if_exists="replace" keeps this cell re-runnable: the default ("fail") raises
# as soon as the destination table exists, i.e. on every run after the first.
pgbq.to_gbq(test_df2, "ga_sample_table.test_bq_ga_query2", if_exists="replace")