## Google BigData Showcase
BigQuery ML Tutorial 01  
Edited by 김하제

In [1]:
# Import the BigQuery client library and create the client object that the
# following cells use.
# NOTE(review): Client() is called without arguments, so project and
# credentials are presumably resolved from the environment — confirm.
from google.cloud import bigquery
client = bigquery.Client()

In [2]:
# Create the "bqml_tutorial" dataset that the model is stored in.
# The reference is built explicitly from the client's project rather than
# through the deprecated client.dataset() helper.
dataset_ref = bigquery.DatasetReference(client.project, 'bqml_tutorial')
dataset = bigquery.Dataset(dataset_ref)
dataset.location = 'US'
# exists_ok=True keeps this cell re-runnable: if the dataset already exists
# it is returned instead of a Conflict error being raised.
client.create_dataset(dataset, exists_ok=True)

Dataset(DatasetReference('zinc-fold-201301', 'bqml_tutorial'))

In [3]:
# Load the BigQuery client library as an IPython extension so that the
# %%bigquery cell magic used by the query cells is available.
%load_ext google.cloud.bigquery

In [4]:
%%bigquery
-- Create the ML model.
-- The %%bigquery cell magic must be the first line of the cell, so the
-- description is written as SQL comments below it.
-- Trains a logistic-regression model named bqml_tutorial.sample_model.
-- `label` is 1 when totals.transactions is not NULL and 0 otherwise; the
-- remaining columns are the input features, with NULLs replaced by "" or 0.
CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
OPTIONS(model_type='logistic_reg') AS
SELECT
  IF(totals.transactions IS NULL, 0, 1) AS label,
  IFNULL(device.operatingSystem, "") AS os,
  device.isMobile AS is_mobile,
  IFNULL(geoNetwork.country, "") AS country,
  IFNULL(totals.pageviews, 0) AS pageviews
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  -- Training window: table suffixes '20160801' through '20170630'
  -- (BETWEEN is inclusive on both ends).
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'

In [5]:
%%bigquery
-- ML training information.
-- The %%bigquery cell magic must be the first line of the cell, so the
-- description is written as SQL comments below it.
-- Returns every column that ML.TRAINING_INFO reports for the model; `*` is
-- kept so the result matches whatever columns the function exposes.
SELECT
  *
FROM
  ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)

Unnamed: 0,training_run,iteration,loss,eval_loss,duration_ms,learning_rate
0,0,8,0.043878,0.045448,33914,25.6
1,0,7,0.044654,0.045502,32857,25.6
2,0,6,0.047345,0.048277,31685,12.8
3,0,5,0.053888,0.05334,30331,6.4
4,0,4,0.067776,0.066409,29636,3.2
5,0,3,0.097545,0.096206,32295,1.6
6,0,2,0.169802,0.168851,29472,0.8
7,0,1,0.320692,0.320175,32634,0.4
8,0,0,0.521573,0.52138,27074,0.2


In [6]:
%%bigquery
-- Evaluate the ML model.
-- The %%bigquery cell magic must be the first line of the cell, so the
-- description is written as SQL comments below it.
-- The inner query produces the same label and feature columns that were
-- used to train the model, taken from a later, non-overlapping suffix range.
SELECT
  *
FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
  SELECT
    IF(totals.transactions IS NULL, 0, 1) AS label,
    IFNULL(device.operatingSystem, "") AS os,
    device.isMobile AS is_mobile,
    IFNULL(geoNetwork.country, "") AS country,
    IFNULL(totals.pageviews, 0) AS pageviews
  FROM
    `bigquery-public-data.google_analytics_sample.ga_sessions_*`
  WHERE
    -- Evaluation window: table suffixes '20170701' through '20170801'
    -- (BETWEEN is inclusive on both ends).
    _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))

Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.468504,0.110801,0.985343,0.179217,0.046242,0.98273


In [7]:
%%bigquery
-- Predict purchases and total them per country.
-- The %%bigquery cell magic must be the first line of the cell, so the
-- description is written as SQL comments below it.
-- The inner query supplies the feature columns only (no label); the outer
-- query sums predicted_label per country and keeps the ten largest totals.
-- Rows with equal totals have no further ordering specified.
SELECT
  country,
  SUM(predicted_label) AS total_predicted_purchases
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
  SELECT
    IFNULL(device.operatingSystem, "") AS os,
    device.isMobile AS is_mobile,
    IFNULL(totals.pageviews, 0) AS pageviews,
    IFNULL(geoNetwork.country, "") AS country
  FROM
    `bigquery-public-data.google_analytics_sample.ga_sessions_*`
  WHERE
    -- Prediction window: table suffixes '20170701' through '20170801'
    -- (BETWEEN is inclusive on both ends).
    _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
GROUP BY country
ORDER BY total_predicted_purchases DESC
LIMIT 10

Unnamed: 0,country,total_predicted_purchases
0,United States,220
1,Taiwan,8
2,Canada,7
3,Turkey,2
4,India,2
5,Japan,2
6,St. Lucia,1
7,Guyana,1
8,Australia,1
9,Thailand,1


In [8]:
%%bigquery
-- Predict purchases and total them per visitor.
-- The %%bigquery cell magic must be the first line of the cell, so the
-- description is written as SQL comments below it.
-- fullVisitorId is selected alongside the feature columns so that it is
-- available for grouping in the outer query, which sums predicted_label per
-- visitor and keeps the ten largest totals.
-- Rows with equal totals have no further ordering specified.
SELECT
  fullVisitorId,
  SUM(predicted_label) AS total_predicted_purchases
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
  SELECT
    IFNULL(device.operatingSystem, "") AS os,
    device.isMobile AS is_mobile,
    IFNULL(totals.pageviews, 0) AS pageviews,
    IFNULL(geoNetwork.country, "") AS country,
    fullVisitorId
  FROM
    `bigquery-public-data.google_analytics_sample.ga_sessions_*`
  WHERE
    -- Prediction window: table suffixes '20170701' through '20170801'
    -- (BETWEEN is inclusive on both ends).
    _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
GROUP BY fullVisitorId
ORDER BY total_predicted_purchases DESC
LIMIT 10

Unnamed: 0,fullVisitorId,total_predicted_purchases
0,9417857471295131045,4
1,57693500927581077,2
2,456807427403774085,2
3,489038402765684003,2
4,806992249032686650,2
5,2158257269735455737,2
6,112288330928895942,2
7,7420300501523012460,2
8,5073919761051630191,2
9,2105122376016897629,2
