# GCPUG発表用資料

In [1]:
from google.cloud.bigquery import magics
from google.oauth2 import service_account
credentials = (service_account.Credentials.from_service_account_file('../scalable-data-science-8a106588da7e.json'))
magics.context.credentials = credentials

In [2]:
%load_ext google.cloud.bigquery

In [16]:
PROJECT ='scalable-data-science'
from google.cloud.bigquery import magics
magics.context.project = PROJECT

In [29]:
from google.cloud.bigquery import magics
# 指定したバイト以上のクエリを実行すると、失敗する
# https://cloud.google.com/bigquery/docs/best-practices-costs?hl=ja
magics.context.default_query_job_config.maximum_bytes_billed = 100000000

# データの概観を眺める

In [12]:
%%bigquery --project $PROJECT 
SELECT 
    species,
    COUNT(*) AS count,
    AVG(sepal_length) AS avg,
    STDDEV(sepal_length) AS std,
    MAX(sepal_length) AS max,
    MIN(sepal_length) AS min,
    CORR(sepal_length, sepal_width) AS corr
FROM `bigquery-public-data.ml_datasets.iris` GROUP BY species

Unnamed: 0,species,count,avg,std,max,min,corr
0,versicolor,50,5.936,0.516171,7.0,4.9,0.525911
1,virginica,50,6.588,0.63588,7.9,4.9,0.457228
2,setosa,50,5.006,0.35249,5.8,4.3,0.74678


In [30]:
%%bigquery data --project $PROJECT  
-- 変数名dataにBQの出力結果がDataFrameとして格納される
SELECT 
    species,
    AVG(sepal_length) AS avg
FROM `bigquery-public-data.ml_datasets.iris` GROUP BY species

In [31]:
# matplotlibで可視化
data.plot(x='species', y='avg', kind='bar', title='sepal_length by species', rot= 45)

<matplotlib.axes._subplots.AxesSubplot at 0x12a6b4fd0>

# 特徴量を作成するための関数

In [40]:
%%bigquery
SELECT
  ML.POLYNOMIAL_EXPAND(
      STRUCT(2 AS f1, 3 AS f2)
  ) 
AS output;

Unnamed: 0,output
0,"{'f1': 2.0, 'f1_f1': 4.0, 'f1_f2': 6.0, 'f2': ..."


In [36]:
%%bigquery

SELECT
  ML.FEATURE_CROSS(
      STRUCT('a' AS f1, 'b' AS f2, 'c' AS f3)
  )
AS output;

Unnamed: 0,output
0,"{'f1_f2': 'a_b', 'f1_f3': 'a_c', 'f2_f3': 'b_c'}"


In [37]:
%%bigquery

SELECT
  ML.NGRAMS(['a', 'b', 'c'], [2,3], '#') AS output;

Unnamed: 0,output
0,"[a#b, a#b#c, b#c]"


## 特徴量を変換するための関数

In [32]:
%%bigquery
SELECT
  f, ML.QUANTILE_BUCKETIZE(f, 3) OVER() AS bucket
FROM
  UNNEST([1,2,3,4,5]) AS f;

Unnamed: 0,f,bucket
0,1,bin_1
1,2,bin_2
2,4,bin_3
3,3,bin_2
4,5,bin_3


In [43]:
%%bigquery

SELECT
    f,
    IF(f>2, 1, 0) AS output
FROM
  UNNEST([1,2,3]) AS f;

Unnamed: 0,f,output
0,1,0
1,2,0
2,3,1


In [42]:
%%bigquery

SELECT
    f, 
    ML.MIN_MAX_SCALER(f) OVER() AS min_max,
    ML.STANDARD_SCALER(f) OVER() AS std
FROM
  UNNEST([1,2,3,4,5]) AS f;

Unnamed: 0,f,min_max,std
0,1,0.0,-1.264911
1,5,1.0,1.264911
2,2,0.25,-0.632456
3,4,0.75,0.632456
4,3,0.5,0.0


## ST_GEOHASH

In [60]:
%%bigquery

SELECT
    country,
    port_name,
    ST_GeoHash(port_geom, 2) AS hash_2,
    ST_GeoHash(port_geom, 4) AS hash_4,
    ST_GeoHash(port_geom, 8) AS hash_8
FROM
  `bigquery-public-data.geo_international_ports.world_port_index`
WHERE country="JP"
ORDER BY hash_8
LIMIT 5

Unnamed: 0,country,port_name,hash_2,hash_4,hash_8
0,JP,ISHIGAKI,wu,wu26,wu263mxr
1,JP,HIRARA KO,wu,wu3j,wu3j4fsw
2,JP,NAHA KO,wu,wudv,wudv1s85
3,JP,NISHIHARA,wu,wudv,wudvhht5
4,JP,NAKAGUSUKU,wu,wudv,wudvqvdn


# 前処理したデータをモデルに流し込む

In [None]:
CREATE MODEL `transform_tutorial.natality_model` OPTIONS (
  model_type = 'linear_reg',
  input_label_cols = ['weight_pounds']
) AS
SELECT
  weight_pounds,
  is_male,
  IF(plurality > 1, 1, 0) AS plurality,
  ML.BUCKETIZE(gestation_weeks, [37, 42]) AS gestation_weeks,
  ML.FEATURE_CROSS(
    STRUCT(
      CAST(alcohol_use AS STRING) AS alcohol_use,
      CAST(cigarette_use AS STRING) AS cigarette_use
    )
  ) AS alcohol_cigarette_use
FROM
  `bigquery-public-data.samples.natality`

In [None]:
SELECT
  predicted_weight_pounds
FROM
  ML.PREDICT(
    MODEL `transform_tutorial.natality_model`,
    (
      SELECT
        is_male,
        -- イチイチ同じ前処理を実行しなければいけない
        IF(plurality > 1, 1, 0) AS plurality,
        ML.BUCKETIZE(gestation_weeks, [37, 42]) AS gestation_weeks,
        ML.FEATURE_CROSS(
          STRUCT(
            CAST(alcohol_use AS STRING) AS alcohol_use,
            CAST(cigarette_use AS STRING) AS cigarette_use
          )
        ) AS alcohol_cigarette_use
      FROM
        `bigquery-public-data.samples.natality`
      LIMIT
        5
    )
  )

In [None]:
CREATE MODEL `transform_tutorial.natality_model_with_trans` TRANSFORM(
  -- 前処理の関数を定義
  weight_pounds,
  is_male,
  IF(plurality > 1, 1, 0) AS plurality,
  ML.BUCKETIZE(gestation_weeks, [37, 42]) AS gestation_weeks,
  ML.FEATURE_CROSS(
    STRUCT(
      CAST(alcohol_use AS STRING) AS alcohol_use,
      CAST(cigarette_use AS STRING) AS cigarette_use
    )
  ) AS alcohol_cigarette_use
) OPTIONS (
  model_type = 'linear_reg',
  input_label_cols = ['weight_pounds']
) AS
SELECT
  *
FROM
  `bigquery-public-data.samples.natality`
WHERE
  weight_pounds IS NOT NULL -- 適当にサンプリング
  AND RAND() < 0.001

In [None]:
SELECT
  predicted_weight_pounds
FROM
  ML.PREDICT(
    MODEL `transform_tutorial.natality_model_with_trans`,
    (
      SELECT *
      FROM
        `bigquery-public-data.samples.natality`
      LIMIT
        5
    )
  )