In [1]:
import google.auth
from google.cloud import bigquery

## Get the project_id
# google.auth.default() returns (credentials, project_id); project_id is None
# when it cannot be determined from the environment.
CREDENTIALS, PROJECT_ID = google.auth.default()
if not PROJECT_ID:
    # Fail fast: otherwise "None" would be interpolated into every
    # dataset/table/model identifier built from PROJECT_ID below.
    raise RuntimeError(
        "Could not determine a Google Cloud project ID from the environment; "
        "set GOOGLE_CLOUD_PROJECT or run `gcloud config set project <id>`."
    )
print(f"Detected Project ID: {PROJECT_ID}")

## Set various names
DATASET_ID = "bootcamp_challenge2"      # BigQuery dataset that holds the table and model
TRAINING_DATA = "emergency_calls_data"  # table that receives the raw CSV rows

# Source CSV in Cloud Storage that is loaded into the training table.
GCS_URI = "gs://labs.roitraining.com/data-to-ai-workshop/emergency_calls_response_times.csv"

## Connect BigQuery Client and create dataset

client = bigquery.Client(project=PROJECT_ID)

dataset_ref = bigquery.Dataset(f"{PROJECT_ID}.{DATASET_ID}")
dataset_ref.location = "US"

# exists_ok=True turns "dataset already exists" into a no-op so the cell can be
# re-run, while genuine failures (permissions, quota, network) still raise
# instead of being misreported as "already exists".
client.create_dataset(dataset_ref, exists_ok=True)
print(f"Dataset {DATASET_ID} is ready.")

## Ingest data
table_id = f"{PROJECT_ID}.{DATASET_ID}.{TRAINING_DATA}"

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # the first CSV row is the header
    autodetect=True,      # let BigQuery infer the schema from the file
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Truncating makes the load idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

load_job = client.load_table_from_uri(
    GCS_URI,
    table_id,
    job_config=job_config,
)

# Block until the load job finishes; raises if the job failed.
load_job.result()
print("Raw data loaded into BigQuery.")

Detected Project ID: qwiklabs-gcp-00-c2e92c8fc9eb
Dataset already exists.
Raw data loaded into BigQuery.


In [2]:
## Load data from BigQuery

import pandas as pd

# Reuse table_id (defined with the load job) instead of rebuilding the
# fully-qualified table path by hand, so the table name lives in one place.
query = f"""
SELECT *
FROM `{table_id}`
"""

# Run the query and materialise the result as a pandas DataFrame.
df = client.query(query).to_dataframe()
df.head()

Unnamed: 0,call_id,call_timestamp,call_type,location,weather_condition,day_of_week,time_of_day,traffic_level,distance_to_station,units_available,response_time
0,35957,2023-01-01 00:05:53+00:00,Fire,Highland,Rainy,Sunday,0,High,21.45,3,23.41
1,20832,2023-01-01 00:20:47+00:00,Fire,Oakmont,Rainy,Sunday,0,High,22.29,6,20.11
2,27949,2023-01-01 00:33:27+00:00,Fire,Riverside,Windy,Sunday,0,High,17.19,14,19.75
3,20199,2023-01-01 00:48:29+00:00,Fire,Riverside,Windy,Sunday,0,High,17.39,14,20.76
4,46938,2023-01-01 00:50:44+00:00,Rescue,Brookfield,Sunny,Sunday,0,High,22.5,14,22.37


In [3]:
# List the column names of the DataFrame loaded from BigQuery.
df.columns

Index(['call_id', 'call_timestamp', 'call_type', 'location',
       'weather_condition', 'day_of_week', 'time_of_day', 'traffic_level',
       'distance_to_station', 'units_available', 'response_time'],
      dtype='object')

In [4]:
## BigQuery ML Create Model Statement
# Fully-qualified name of the model that the statement below creates or replaces.
model_id = f"{PROJECT_ID}.{DATASET_ID}.response_time_model"

# Trains a linear regression with `response_time` as the label. The SELECT
# lists the feature columns explicitly (call_id and call_timestamp are left
# out) and skips rows whose label is NULL.
sql_query = f"""
CREATE OR REPLACE MODEL `{model_id}`
OPTIONS (
  model_type = 'linear_reg',
  input_label_cols = ['response_time'],
  data_split_method = 'AUTO_SPLIT'
) AS
SELECT call_type, location,
       weather_condition, day_of_week, time_of_day, traffic_level,
       distance_to_station, units_available, response_time
        FROM `{table_id}`
WHERE response_time IS NOT NULL;
"""

print(f"Training model {model_id}...")
query_job = client.query(sql_query)
# Wait for the training job to finish before reporting completion.
query_job.result()

print("Process Complete!")

Training model qwiklabs-gcp-00-c2e92c8fc9eb.bootcamp_challenge2.response_time_model...
Process Complete!


In [5]:
## Evaluate the trained model with BigQuery ML.EVALUATE
eval_query = f"SELECT * FROM ML.EVALUATE(MODEL `{model_id}`)"

# Submit the evaluation query, then pull the metrics row into pandas.
eval_job = client.query(eval_query)
df_metrics = eval_job.to_dataframe()

print(df_metrics)

   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0             1.761934            4.827846                0.015117   

   median_absolute_error  r2_score  explained_variance  
0                1.50111  0.831417             0.83146  


In [6]:
## BigQuery ML.PREDICT Statement

# Reference the model through model_id rather than rebuilding its path, so the
# model name is defined in exactly one place. The inner ORDER BY makes
# "first 10" a well-defined, reproducible sample (LIMIT alone returns arbitrary
# rows); the outer ORDER BY keeps the printed rows in the same order on re-runs.
predict_query = f"""
SELECT
  *
FROM
  ML.PREDICT(MODEL `{model_id}`, (
    SELECT
      *
    FROM
      `{table_id}`
    ORDER BY call_timestamp, call_id
    LIMIT 10 -- Just looking at the first 10 for a quick check
  ))
ORDER BY call_timestamp, call_id
"""

predictions_df = client.query(predict_query).to_dataframe()

# Compare the actual label with the model's prediction side by side.
print(predictions_df[['response_time', 'predicted_response_time']].head())

   response_time  predicted_response_time
0          23.41                24.384363
1          20.11                23.862711
2          19.75                19.531299
3          20.76                19.621486
4          22.37                20.435925
