In [209]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
%load_ext bigquery_magics

# Dataset name; combined with `project_name` below to form the default dataset
# for queries.
data_set = "testing_set"
# Google Cloud project passed to the BigQuery client and used to qualify `data_set`.
project_name = "emerald-entity-468916-f9"
# Not referenced in the cells shown here — presumably the public source dataset
# the books were taken from; TODO confirm where it is used.
library_path = "gdelt-bq.internetarchivebooks"

# Default query configuration: sets `<project_name>.<data_set>` as the default
# dataset for queries that use this config.
job_config = bigquery.QueryJobConfig(default_dataset = f"{project_name}.{data_set}")
# Client used by the Python cells; every query it issues inherits `job_config`.
client = bigquery.Client(project = project_name, default_query_job_config = job_config)
# Give the cell magics' shared context the same default job config as `client`.
magics.context.default_query_job_config = job_config

The bigquery_magics extension is already loaded. To reload it, use:
  %reload_ext bigquery_magics


In [210]:
%%bigquery
-- Remove every row from the chunk table. BigQuery's DELETE requires a WHERE
-- clause, so an always-true predicate is used to match all rows.
-- NOTE(review): TRUNCATE TABLE may be a cheaper equivalent here — confirm.
DELETE FROM tmp_correction_chunks WHERE TRUE;

Query is running:   0%|          |

In [211]:
from utility import chunk_text_with_overlaps
from concurrent.futures import ThreadPoolExecutor, as_completed

# Parameterized INSERT executed once per chunk. Values are bound through query
# parameters (@id, @idx, ...) instead of being interpolated into the SQL text.
insert_sql = (
    "INSERT INTO tmp_correction_chunks"
    "(book_id, chunk_number, prefix, original_txt, suffix) "
    "VALUES(@id, @idx, @prefix, @txt, @suffix)"
)

# One QueryJobConfig per chunk, collected here and executed later.
job_configs = []

# Only the three columns read below are fetched; books that already have a
# corrected text are skipped.
select_query_job = client.query(
    query = "SELECT book_id, title, original_txt FROM books WHERE corrected_txt IS NULL"
)
for row in select_query_job.result():
    # Single quotes inside the f-string keep this line valid on Python < 3.12.
    print(f"\nChunking book: {row['title']}")
    text_to_split = row["original_txt"]
    book_id = row["book_id"]
    chunks = chunk_text_with_overlaps(text_to_split, max_chunk_tokens = 5000, max_overlap_tokens = 1000)
    print(f"  Number of chunks: {len(chunks)}")
    print("  Processed: ", end = "")
    for idx, chunk in enumerate(chunks):
        print(f"{idx} ", end = "")
        # Deliberately NOT named `job_config`: that notebook-global holds the
        # default-dataset configuration created in the setup cell and must not
        # be overwritten by a per-chunk parameter set.
        insert_job_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ScalarQueryParameter("id", "STRING", book_id),
            bigquery.ScalarQueryParameter("idx", "INTEGER", idx),
            bigquery.ScalarQueryParameter("prefix", "STRING", chunk.before),
            bigquery.ScalarQueryParameter("txt", "STRING", chunk.main),
            bigquery.ScalarQueryParameter("suffix", "STRING", chunk.after)])
        job_configs.append(insert_job_config)

print(f"\n\nExecuting {len(job_configs)} INSERT jobs...")
print("Jobs Completed: ", end = "")

def execute_insert_job(job_config):
    """Run the chunk INSERT statement with one set of bound parameters.

    Uses the notebook-level `client` and `insert_sql` and blocks until the
    query has finished.

    Args:
        job_config: QueryJobConfig carrying the query parameters for one chunk.

    Raises:
        Whatever `client.query_and_wait` raises when the job fails.
    """
    client.query_and_wait(insert_sql, job_config = job_config)

completed_jobs = 0
# (job config, exception) for every INSERT that failed, so that one bad chunk
# does not hide the outcome of the others.
failed_jobs = []
with ThreadPoolExecutor(max_workers = 10) as executor:
    future_to_config = {executor.submit(execute_insert_job, config): config for config in job_configs}
    for future in as_completed(future_to_config):
        try:
            future.result()
        except Exception as exc:
            # Recorded and re-raised after the pool drains; not swallowed.
            failed_jobs.append((future_to_config[future], exc))
            continue
        completed_jobs += 1
        print(f"{completed_jobs} ", end = "")

if failed_jobs:
    print(f"\n{len(failed_jobs)} of {len(job_configs)} INSERT jobs failed:")
    for failed_config, exc in failed_jobs:
        # Recover the identifying parameters so the failing chunk can be retried.
        params = {param.name: param.value for param in failed_config.query_parameters}
        print(f"  book_id={params.get('id')} chunk_number={params.get('idx')}: {exc!r}")
    # Re-raise the first error so the cell still fails with the original
    # exception type, as it did before.
    raise failed_jobs[0][1]

print("\nAll INSERT jobs completed.")


Chunking book: The Book of the Months: A Gift for the Young
  Number of chunks: 8
  Processed: 0 1 2 3 4 5 6 7 
Chunking book: Theistic Problems: Being Essays on the Existence of God and His Relationship to Man
  Number of chunks: 11
  Processed: 0 1 2 3 4 5 6 7 8 9 10 
Chunking book: Little Comedies
  Number of chunks: 7
  Processed: 0 1 2 3 4 5 6 

Executing 26 INSERT jobs...
Jobs Completed: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
All INSERT jobs completed.


In [212]:
%%bigquery
-- Invoke the stored procedure `phase2_correction_correct` with no arguments.
-- The name is unqualified, so it is presumably resolved against the default
-- dataset configured for the cell magics — TODO confirm.
-- NOTE(review): presumably this processes the rows staged in
-- tmp_correction_chunks; the procedure body is not visible here.
CALL phase2_correction_correct();

Query is running:   0%|          |