<a href="https://colab.research.google.com/github/kootr/ml-study-session/blob/0730_mf/bq_recommendation_mf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/kootr/ml-study-session/blob/main/20220730_reccomendation_mf/bq_recommendation_mf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [1]:
from google.cloud import bigquery
import pandas as pd

In [2]:
client = bigquery.Client(location="US", project="ml-session")

In [3]:
# GCP認証　(VertexAI workbench：必要なし、Googole collabo:必要、ローカル: 必要（Jsonクレデンシャルファイルを配置する）　)
import os
import sys

IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks（おそらくVertexAI Workbenchのこと）, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # どれも当てはまらない場合はローカル実行として、クレデンシャルファイルを指定
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS '/hoge/fuga/key_file.json'

In [4]:
# テストクエリ
query = """
SELECT
  vendor_id,
  passenger_count,
  trip_distance,
  rate_code,
  payment_type,
  total_amount,
  tip_amount
FROM
  `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE tip_amount >= 0
LIMIT 100
"""
query_job = client.query(
    query,
     location="US",
)

df = query_job.to_dataframe()
df.head(5)

Unnamed: 0,vendor_id,passenger_count,trip_distance,rate_code,payment_type,total_amount,tip_amount
0,2,1,8.47,1,1,27.09,1.29
1,1,1,9.9,1,1,43.26,7.2
2,1,2,12.3,1,1,50.0,4.94
3,1,1,6.6,1,1,27.95,4.65
4,2,1,7.2,1,1,28.56,4.76


### 参考にしたTutorial 
https://cloud.google.com/bigquery-ml/docs/bigqueryml-mf-explicit-tutorial#find_all_the_item_ratings_for_a_set_of_users

In [5]:
# まずはデータセットを作成

dataset_id = "session13"
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"

dataset = client.create_dataset(dataset)  # Make an API request.
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Created dataset ml-session.session13


In [6]:
# チュートリアル通り、movielensのレーティングデータ、タイトルデータをダウンロード、区切り文字加工
!curl -O 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
!unzip ml-1m.zip

!sed 's/::/,/g' ml-1m/ratings.dat > ratings.csv
!sed 's/::/@/g' ml-1m/movies.dat > movie_titles.csv # 区切り文字 @ とする

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5778k  100 5778k    0     0  11.5M      0 --:--:-- --:--:-- --:--:-- 11.5M
Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [9]:
!head ratings.csv

1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
1,1197,3,978302268
1,1287,5,978302039
1,2804,5,978300719
1,594,4,978302268
1,919,4,978301368


In [10]:
!head movie_titles.csv

1@Toy Story (1995)@Animation|Children's|Comedy
2@Jumanji (1995)@Adventure|Children's|Fantasy
3@Grumpier Old Men (1995)@Comedy|Romance
4@Waiting to Exhale (1995)@Comedy|Drama
5@Father of the Bride Part II (1995)@Comedy
6@Heat (1995)@Action|Crime|Thriller
7@Sabrina (1995)@Comedy|Romance
8@Tom and Huck (1995)@Adventure|Children's
9@Sudden Death (1995)@Action
10@GoldenEye (1995)@Action|Adventure|Thriller


In [11]:
# BigQuery clientを使ってテーブル作成、CSVインポート
def load_local_csv_to_bigquery(csv_file_path, table_id, schema, field_delimiter=','):
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        schema=schema,
        field_delimiter=field_delimiter
    )

    with open(csv_file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)
    job.result()

In [12]:
# Create and load table for ratings.csv
rating_table ="ratings" 
table_id = f"{dataset_id}.{rating_table}"
csv_file_path = "./ratings.csv"
schema=[
        bigquery.SchemaField("user_id", "INT64"),
        bigquery.SchemaField("item_id", "INT64"),
        bigquery.SchemaField("rating", "FLOAT64"),
        bigquery.SchemaField("timestamp", "TIMESTAMP"),
    ]
load_local_csv_to_bigquery(csv_file_path, table_id, schema)

In [13]:
# Create and load table for movie_titles.csv
title_table ="movie_titles" 
table_id = f"{dataset_id}.{title_table}"
csv_file_path = "./movie_titles.csv"
schema=[
        bigquery.SchemaField("movie_id", "INT64"),
        bigquery.SchemaField("movie_title", "STRING"),
        bigquery.SchemaField("genre", "STRING"),
    ]
# 区切り文字 @ とする
load_local_csv_to_bigquery(csv_file_path, table_id, schema,'@')

In [None]:
# （本筋とは別）レーティングの基本情報を調べる

# rating_table
query = f"""
#standardSQL
SELECT
  user_id, item_id, rating
FROM
  {dataset_id}.{rating_table}
"""
query_job = client.query(
    query,
    location="US",
)
df_rating = query_job.to_dataframe()
df_rating.head()

# タイトルごとのレーティング取得
query = f"""
#standardSQL
SELECT
  {title_table}.movie_title,
  {rating_table}.rating,
FROM
  {dataset_id}.{rating_table}
  LEFT JOIN {dataset_id}.{title_table} ON {rating_table}.item_id = {title_table}.movie_id
"""
query_job = client.query(
    query,
    location="US",
)
df_movie_ratings = query_job.to_dataframe()
df_movie_ratings.head()
test = df_movie_ratings.groupby('movie_title').agg(['count', 'mean'])
#うまくマルチインデックスの集計ができなかった
df_movie_ratings.groupby('movie_title').agg(['count'])

In [None]:
# 以下のクエリを実行するにはスロットの購入が必要。
# https://cloud.google.com/bigquery/docs/reservations-intro?hl=ja
# スロット購入の際はFlexプランの選択が妥当。実行後、スロットを削除する。
# BQスロットコミットメント購入→予約作成→予約割り当て→クエリ実行→予約割り当て削除→予約削除→スロットコミットメント削除

In [18]:
# モデル作成　13分かかります。　途中経過を表示できるように修正したい。
model_name = "my_explicit_mf_model"
query = f"""
#standardSQL
CREATE OR REPLACE MODEL {dataset_id}.{model_name}
OPTIONS
  (model_type='matrix_factorization',
   user_col='user_id',
   item_col='item_id',
   l2_reg=9.83,
   num_factors=34,
   model_registry='vertex_ai',
   vertex_ai_model_id='movie_recommend',
   vertex_ai_model_version_aliases=['experimental']) AS
SELECT
  user_id,
  item_id,
  rating
FROM {dataset_id}.{rating_table}
"""

query_job = client.query(
    query,
    location="US",
)

In [23]:
# Evaluate the model
query = f"""
#standardSQL
SELECT
  *
FROM
  ML.EVALUATE(MODEL {dataset_id}.{model_name},
    (
    SELECT
      user_id,
      item_id,
      rating
     FROM
      {dataset_id}.{rating_table})
      )
"""

query_job = client.query(
    query,
    location="US",
)

df = query_job.to_dataframe()

In [25]:
df = query_job.to_dataframe()
df

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,0.48596,0.396012,0.025547,0.390904,0.682661,0.682661


In [30]:
# 5ユーザーに対して、全ての映画のレーティングを予測する
query = f"""
#standardSQL
SELECT
  *
FROM
  ML.RECOMMEND(MODEL {dataset_id}.{model_name},
    (
    SELECT
      DISTINCT(user_id)
    FROM
      {dataset_id}.{rating_table}
    LIMIT 5))
"""
query_job = client.query(
    query,
    location="US",
)
df = query_job.to_dataframe()
df

Unnamed: 0,predicted_rating,user_id,item_id
0,1.804184,2,2561
1,3.902917,2,3329
2,2.791661,2,258
3,2.755314,2,3842
4,3.199996,2,1284
...,...,...,...
18525,4.548385,5,246
18526,3.774170,5,1785
18527,3.981500,5,506
18528,2.993428,5,3322


In [None]:
# 全ユーザーに対して、全ての映画のレーティングを予測しテーブルに保存する（全ユーザー数×全映画数なので行数が膨大になります）
prediction_result_table = "all_prediction_result"
query = f"""
#standardSQL
CREATE OR REPLACE TABLE {dataset_id}.{prediction_result_table}
OPTIONS() AS
SELECT
  *
FROM
  ML.RECOMMEND(MODEL {dataset_id}.{model_name})
"""
query_job = client.query(
    query,
    location="US",
)

In [90]:
# ユーザーごとに予測レートが高い作品上位５つを提示
query = f"""
#standardSQL
SELECT
  user_id,
  ARRAY_AGG(STRUCT(movie_title, genre, predicted_rating)
ORDER BY predicted_rating DESC LIMIT 5)
FROM (
SELECT
  user_id,
  item_id,
  predicted_rating,
  movie_title,
  genre
FROM
  {dataset_id}.{prediction_result_table}
JOIN
  {dataset_id}.{title_table}
ON
  item_id = movie_id)
GROUP BY
  user_id
"""
query_job = client.query(
    query,
    location="US",
)
df_movie_recommend = query_job.to_dataframe()
df_movie_recommend.head()

Unnamed: 0,user_id,f0_
0,1386,"[{'movie_title': 'Big Carnival, The (1951)', '..."
1,1771,"[{'movie_title': 'Song of Freedom (1936)', 'ge..."
2,5096,"[{'movie_title': 'Cup, The (Phörpa) (1999)', '..."
3,1449,"[{'movie_title': 'Song of Freedom (1936)', 'ge..."
4,453,"[{'movie_title': 'Two Family House (2000)', 'g..."


In [94]:
# ジャンルがレーティングに与える影響の評価
query = f"""
#standardSQL
SELECT
  factor,
  ARRAY_AGG(STRUCT(feature, genre,
      weight)
  ORDER BY
    weight DESC
  LIMIT
    10) AS weights
FROM (
  SELECT
    * EXCEPT(factor_weights)
  FROM (
    SELECT
      *
    FROM (
      SELECT
        factor_weights,
        CAST(feature AS INT64) as feature
      FROM
        ML.WEIGHTS(model {dataset_id}.{model_name})
      WHERE
        processed_input= 'item_id')
    JOIN
      {dataset_id}.{title_table}
    ON
      feature = movie_id) weights
  CROSS JOIN
    UNNEST(weights.factor_weights)
  ORDER BY
    feature,
    weight DESC)
GROUP BY
  factor
"""
query_job = client.query(
    query,
    location="US",
)
df_genre_factor = query_job.to_dataframe()
df_genre_factor.head()

Unnamed: 0,factor,weights
0,34,"[{'feature': 806, 'genre': 'Drama', 'weight': ..."
1,33,"[{'feature': 2893, 'genre': 'Action|Drama', 'w..."
2,32,"[{'feature': 3544, 'genre': 'Comedy', 'weight'..."
3,31,"[{'feature': 2711, 'genre': 'Drama', 'weight':..."
4,30,"[{'feature': 3577, 'genre': 'Drama', 'weight':..."


Unnamed: 0_level_0,rating
Unnamed: 0_level_1,count
movie_title,Unnamed: 1_level_2
"$1,000,000 Duck (1971)",37
'Night Mother (1986),70
'Til There Was You (1997),52
"'burbs, The (1989)",303
...And Justice for All (1979),199
...,...
"Zed & Two Noughts, A (1985)",29
Zero Effect (1998),301
Zero Kelvin (Kjærlighetens kjøtere) (1995),2
Zeus and Roxanne (1997),23


# TMDB 5000 Movie Dataset
https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata

In [None]:
# Reffered from the following URLs
# https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system
# https://www.kaggle.com/code/rafffael/geting-started-movie-recommendation-with-eda-demo

# Load data from Cloud Strage

In [None]:
from google.cloud import storage
gcs_client = storage.Client(project=' ml-session')

In [None]:
import pandas as pd 
import numpy as np
# Get the uri from the left pane
df_credits=pd.read_csv('gs://ml_session9/TMDB/tmdb_5000_credits.csv')
df_movies=pd.read_csv("gs://ml_session9/TMDB/tmdb_5000_movies.csv")

In [None]:
df_credits.head()

In [None]:
df_movies.head(1)

The first dataset contains the following features:-

- movie_id - A unique identifier for each movie.
- cast - The name of lead and supporting actors.
- crew - The name of Director, Editor, Composer, Writer etc.

The second dataset has the following features:-
- budget - The budget in which the movie was made.
- genre - The genre of the movie, Action, Comedy ,Thriller etc.
- homepage - A link to the homepage of the movie.
- id - This is infact the movie_id as in the first dataset.
- keywords - The keywords or tags related to the movie.
- original_language - The language in which the movie was made.
- original_title - The title of the movie before translation or adaptation.
- overview - A brief description of the movie.
- popularity - A numeric quantity specifying the movie popularity.
- production_companies - The production house of the movie.
- production_countries - The country in which it was produced.
- release_date - The date on which it was released.
- revenue - The worldwide revenue generated by the movie.
- runtime - The running time of the movie in minutes.
- status - "Released" or "Rumored".
- tagline - Movie's tagline.
- title - Title of the movie.
- vote_average - average ratings the movie recieved.
- vote_count - the count of votes recieved.
Let's join the two dataset on the 'id' column

In [None]:
df_credits.columns = ['id','title2','cast','crew']

In [None]:
df_movies= df_movies.merge(df_credits,on='id')

In [None]:
df_movies.head(1)

In [None]:
!pip install ipywidgets --user

In [None]:
from pandas_profiling import ProfileReport

In [None]:
report = ProfileReport(df_movies)

In [None]:
# convert date appropriate format
df_movies["release_date"] = pd.to_datetime(df_movies['release_date'])
df_movies['release_year'] = df_movies['release_date'].dt.year
df_movies['release_month'] = df_movies['release_date'].dt.month_name()
del df_movies["release_date"]

In [None]:
import json
# transform all columns which had json string into json format
# several columns has id and name
# leave only name for them beacause id is not so important in this case
json_columns = {'cast', 'crew', 'genres', 'keywords', 'production_countries', 
                'production_companies', 'spoken_languages'}

for c in json_columns:
    df_movies[c] = df_movies[c].apply(json.loads)
    if c != "crew": # We need other information besides the name
        df_movies[c] = df_movies[c].apply(lambda row: [x["name"] for x in row])

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
df_movies.head(1)

In [None]:
from functools import partial

In [None]:
# create director writer and producer columns 
def get_job(job, row):
    person_name = [x['name'] for x in row if x['job']==job] 
    return person_name[0] if len(person_name) else np.nan

df_movies["director"] = df_movies["crew"].apply(partial(get_job, "Director"))
df_movies["writer"]   = df_movies["crew"].apply(partial(get_job, "Writer"))
df_movies["producer"] = df_movies["crew"].apply(partial(get_job, "Producer"))
del df_movies["crew"]

# create profit column
df_movies["profit"] = df_movies["revenue"] - df_movies["budget"]

In [None]:
# fill some missing values by most frequent one
for col in ["runtime", "release_year", "release_month"]:
    df_movies[col] = df_movies[col].fillna(df_movies[col].mode().iloc[0])

In [None]:
df_movies.head(1)

In [None]:
df_movies.head(1).keywords.sum()[:5]

In [None]:
# import numpy as np
# import os
# import pickle
# from pywaffle import Waffle
# import scipy
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# pd.set_option("display.max_columns", None)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
top_movie_num = 10
top_movies_based_on = ["budget", "revenue", "profit", "vote_average", "vote_count", "popularity"]

fig, ax = plt.subplots(len(top_movies_based_on)//3, 3, figsize=(30,14))
colors = plt.cm.get_cmap('viridis', top_movie_num)

for i, col in enumerate(top_movies_based_on):
    top_movies_by = df_movies.sort_values(by=[col], ascending=False).head(top_movie_num)
    r, c = i//3, i%3
    ax[r][c].barh(top_movies_by["title"], top_movies_by[col], color=colors.colors)
    ax[r][c].set_title(f"top movies based on {col}", fontsize=24, pad=20)
    ax[r][c].tick_params(axis='both', which='major', labelsize=16)
    ax[r][c].invert_yaxis()    
    
plt.suptitle('Top movies based on some features',fontsize=30, y=1)
fig.tight_layout()

In [None]:
!pip install squarify --user

In [None]:
from collections import defaultdict, Counter
import squarify

In [None]:
df_movies["keywords"][0][:5]

In [None]:
df_movies["genres"][0]

In [None]:
df_movies["original_language"][0]

In [None]:
# plot genre frequncy with treemap 
genres_flatten = sum(df_movies["genres"].values, []) # genres are list of lists
genres_info = Counter(genres_flatten)

plt.figure(figsize=(15, 10))
plt.title("Genres", fontsize=25, pad=20)
squarify.plot(genres_info.values(), label=genres_info.keys(), text_kwargs={'fontsize':12}, bar_kwargs={'alpha':.7}, pad=True)
plt.axis("off");

In [None]:
top_genres_num = 15
genres_flatten = sum(df_movies["genres"].values, []) # genres are list of lists
genres_info = Counter(genres_flatten)
top_genres = dict(genres_info.most_common(top_genres_num))

plt.figure(figsize=(20, 6))
plt.xticks(rotation=85, fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Nb. of occurences", fontsize=18, labelpad=10)
plt.bar(top_genres.keys(), top_genres.values(), align='center', color=plt.cm.get_cmap('plasma', top_genres_num).colors)
# plt.bar(top_keywords.keys(), top_keywords.values(), align='center', color=plt.cm.get_cmap('plasma', top_keywords_num).colors)
plt.title("Genres", fontsize=25, pad=20)
plt.show()

In [None]:
top_keywords_num = 30
keywords_flatten = sum(df_movies["keywords"].values, []) # keywords are list of lists
keywords_info = Counter(keywords_flatten)
top_keywords = dict(keywords_info.most_common(top_keywords_num))

plt.figure(figsize=(20, 6))
plt.xticks(rotation=85, fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Nb. of occurences", fontsize=18, labelpad=10)
plt.bar(top_keywords.keys(), top_keywords.values(), align='center', color=plt.cm.get_cmap('plasma', top_keywords_num).colors)
plt.title("Keywords popularity", fontsize=25, pad=20)
plt.show()

In [None]:
df_movies["original_language"].value_counts()[:5]

In [None]:
plt.figure(figsize=(20, 6))
plt.xticks(rotation=85, fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Nb. of occurences", fontsize=18, labelpad=10)

plt.bar(df_movies["original_language"].value_counts()[:10].index,
        df_movies["original_language"].value_counts()[:10].values,
        align='center', color=plt.cm.get_cmap('plasma', top_keywords_num).colors)

plt.title("N of original language", fontsize=25, pad=20)
plt.show()

In [None]:
df_movies["production_countries"].values

In [None]:
top_contries_num = 10
production_contries_flatten = sum(df_movies["production_countries"].values, []) # production_countries are list of lists
production_contries_info = Counter(production_contries_flatten)
top_production_contries = production_contries_info.most_common(top_contries_num)

plt.figure(figsize=(30, 8))
plt.xticks(rotation=65, fontsize=15)
plt.ylabel("Nb. of occurences", fontsize=18)
plt.title("Production countries", fontsize=25, pad=20)
plt.bar(dict(top_production_contries).keys(),
        dict(top_production_contries).values(),
        color=plt.cm.get_cmap('cividis', top_contries_num).colors);

In [None]:
top_companies_num = 15
production_companies_flatten = sum(df_movies["production_companies"].values, []) # production_companies are list of lists
production_companies_info = Counter(production_companies_flatten)
top_production_companies = production_companies_info.most_common(10)


plt.figure(figsize=(30, 8))
plt.xticks(rotation=65, fontsize=15)
plt.ylabel("Nb. of occurences", fontsize=18)
plt.title("Production countries", fontsize=25, pad=20)
plt.bar(dict(top_production_companies).keys(),
        dict(top_production_companies).values(),
        color=plt.cm.get_cmap('cividis', top_companies_num).colors);

# fig = plt.figure(
#     FigureClass=Waffle, 
#     rows=30, 
#     values=dict(top_production_companies), 
#     legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1)},
#     icons='child', icon_size=14, 
#     icon_legend=True,
#     figsize=(24,8)
# )

In [None]:
!pip install wordcloud --user
from wordcloud import WordCloud

In [None]:
df_movies["cast"].values[0][:5]

In [None]:
# plot unique cast frequency
cast_flatten = sum(df_movies["cast"].values, []) # cast are list of lists
cast_info = Counter(cast_flatten)

wordcloud = WordCloud(background_color='white')
wordcloud.generate_from_frequencies(dict(cast_info.most_common(50)))

plt.figure(figsize=(15,10))
plt.title("Cast", fontsize=25, pad=20)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off");

In [None]:

plt.xticks(fontsize=12)
plt.ylabel("Nb. of occurences", fontsize=14)
plt.title("Movie release year", fontsize=25, pad=20)
df_movies["release_year"].plot.hist(figsize=(15,5), bins=100);

In [None]:
matrix = df_movies[["budget", "popularity", "revenue", "runtime", "vote_average", "vote_count"]].corr()

f, ax = plt.subplots(figsize=(12, 10))
plt.title("correlation between features", fontsize=20, pad=20)
plt.xticks(range(len(matrix.index)), matrix.index, fontsize=12)
plt.yticks(range(len(matrix.index)), matrix.index, fontsize=12)

# adding values
for i in range(len(matrix.index)):
    for j in range(len(matrix.index)):
        text = ax.text(j, i, round(matrix.iloc[i, j],3), ha="center", va="center", color="w", fontsize=12)
        
plt.imshow(matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar();

In [None]:
bq_table = df_movies[["budget", "popularity", "revenue", "runtime", "vote_average", "vote_count"]]

In [None]:
bq_table.info()

# Load the dataframe to BigQuery

In [None]:
bq_table.head(1)

In [None]:
test = client.get_dataset(dataset_ref='session9')

In [None]:
test

In [None]:
dataset_id = 'session9'

# dataset = client.create_dataset(dataset_id)  
dataset = client.get_dataset(dataset_ref=dataset_id)

In [None]:
dataset

In [None]:
# Optionally set explicit indices.
# If indices are not specified, a column will be created for the default
# indices created by pandas.
# df = pandas.DataFrame(records, index=pandas.Index(index, name="wikidata_id"))

table_ref = dataset.table("movie_database2")
job = client.load_table_from_dataframe(bq_table, table_ref, location="US")

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

In [None]:
query = """
SELECT
  *
FROM
  `ml-session.session9.movie_database2`
WHERE budget > 0
LIMIT 100
"""
query_job = client.query(
    query,
     location="US",
)

df = query_job.to_dataframe()
df.head(5)

In [None]:
# revenue を予測するモデルを作成
ml_query = """
CREATE OR REPLACE MODEL `session9.movie_revenue_pred`
OPTIONS (model_type='boosted_tree_regressor',
         input_label_cols=['revenue'],
         max_iterations = 10,
         tree_method = 'HIST',
         subsample = 0.85,
         enable_global_explain = TRUE
) AS
SELECT
  *
FROM
  `ml-session.session9.movie_database2`
WHERE budget > 0
LIMIT 10000
"""
query_job = client.query(
    query,
     location="US",
)
df = query_job.to_dataframe()

In [None]:
df

In [None]:
training_info = """
SELECT
  training_run,
  iteration,
  loss,
  eval_loss,
  duration_ms,
  learning_rate
FROM
  ML.TRAINING_INFO(MODEL `session9.movie_revenue_pred`)
ORDER BY iteration ASC
"""
client.query(training_info).to_dataframe()

In [None]:
global_explain = """
SELECT
  *
FROM
  ML.GLOBAL_EXPLAIN(MODEL `session9.movie_revenue_pred`)
"""

client.query(global_explain).to_dataframe()

In [None]:
explain_predict = """
SELECT *
FROM
ML.EXPLAIN_PREDICT(MODEL `session9.movie_revenue_pred`,
 (
 SELECT
   25000000 AS budget,
   14.613152 AS popularity,
   137.0 AS runtime,
   5.5 AS vote_average,
   301 AS vote_count
),
STRUCT(6 AS top_k_features))
"""
client.query(explain_predict).to_dataframe()


In [None]:
# Delete the dataset and its contents
client.delete_dataset(dataset, delete_contents=True)

print('Deleted dataset: {}'.format(dataset.path))