In [89]:
%%script false --no-raise-error
# NOTE: the `%%script false --no-raise-error` magic on this cell's first line hands
# the cell body to an external `false` program instead of the Python kernel, so the
# code below is disabled as saved. Remove that magic line to actually run it.
#
# Purpose: read a service-account key (JSON text) from the Colab secret named
# BIGQUERY_CREDENTIALS and register the resulting credentials with the BigQuery
# cell magics.
import json
from google.colab import userdata
from google.oauth2 import service_account
from google.cloud.bigquery import magics

# JSON text of the service-account key, as stored in the Colab secret.
credentials_json = userdata.get('BIGQUERY_CREDENTIALS')
# `credentials` is left as a notebook global; the client-setup cell looks it up
# with globals().get('credentials', None).
credentials = service_account.Credentials.from_service_account_info(json.loads(credentials_json))
magics.context.credentials = credentials

Couldn't find program: 'false'


In [90]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
%load_ext bigquery_magics

# Target location in BigQuery; combined below into the default dataset used
# for every query issued from this notebook.
data_set = "testing_set8"
project_name = "emerald-entity-468916-f9"

# Shared query defaults: the default dataset and a job timeout of
# 3,600,000 ms (one hour).
job_config = bigquery.QueryJobConfig(
    default_dataset=f"{project_name}.{data_set}",
    job_timeout_ms=3600000,
)

# Use the `credentials` global when an earlier cell defined it; otherwise
# None is passed to the client.
client = bigquery.Client(
    project=project_name,
    default_query_job_config=job_config,
    credentials=globals().get('credentials'),
)

# Give the %%bigquery cell magics the same defaults as the Python client.
magics.context.default_query_job_config = job_config
magics.context.project = project_name

The bigquery_magics extension is already loaded. To reload it, use:
  %reload_ext bigquery_magics


#### Phase 2 - Correcting Books
This phase corrects OCR scanning errors in the books. Such errors are quite common in books from GDELT Processes Internet Archive.

In [91]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from pydantic import BaseModel
from typing import List

nltk.download('punkt')
nltk.download('punkt_tab')

# One piece of a long text together with the neighbouring context that is kept
# outside of the piece itself (populated by add_overlaps).
class Chunk(BaseModel):
    before: str  # trailing sentences of the previous chunk; "" for the first chunk
    main: str  # the chunk's own sentences joined with single spaces
    after: str  # leading sentences of the next chunk; "" for the last chunk

def chunk_text(sentences: list, sentences_lenghts: list, max_tokens) -> tuple:
    """Group consecutive sentences into chunks of at most ``max_tokens`` tokens.

    Sentences are never split. A new chunk is started when adding the next
    sentence would push the running token total above ``max_tokens``, so a
    single sentence longer than ``max_tokens`` still forms a chunk of its own.

    Args:
        sentences: Sentences in document order.
        sentences_lenghts: Token count of each sentence, parallel to
            ``sentences`` (parameter name kept as-is for existing callers).
        max_tokens: Token budget of one chunk.

    Returns:
        A ``(chunks, lenghts)`` tuple: ``chunks`` is a list of sentence lists
        and ``lenghts`` the matching list of per-sentence token counts.
    """
    chunks = []
    lenghts = []

    current_sentences = []
    current_lenghts = []
    current_length = 0

    for length, sentence in zip(sentences_lenghts, sentences):
        # Close the running chunk only if it already holds tokens; otherwise an
        # over-long sentence would produce an empty chunk in front of it.
        if current_length + length > max_tokens and current_length > 0:
            chunks.append(current_sentences)
            lenghts.append(current_lenghts)
            current_sentences = []
            current_lenghts = []
            current_length = 0

        current_sentences.append(sentence)
        current_lenghts.append(length)
        current_length += length

    # Flush whatever is still pending. Test the sentence list rather than the
    # token total: a trailing group made only of zero-token sentences has a
    # total of 0 and would otherwise be silently dropped.
    if current_sentences:
        chunks.append(current_sentences)
        lenghts.append(current_lenghts)

    return (chunks, lenghts)

def get_prefix(sentences: list, lenghts: list, max_tokens):
    """Return the leading sentences whose token counts fit into ``max_tokens``.

    Sentences are taken in order until the next one would push the running
    total above ``max_tokens``. While the running total is still 0 a sentence
    is always accepted, even if it alone exceeds the budget.

    Args:
        sentences: Sentences in order.
        lenghts: Token count of each sentence, parallel to ``sentences``.
        max_tokens: Token budget of the prefix.

    Returns:
        List of the selected sentences.
    """
    selected = []
    total = 0

    for size, text in zip(lenghts, sentences):
        over_budget = total + size > max_tokens
        if over_budget and total > 0:
            break
        selected.append(text)
        total += size

    return selected

def get_prefixes(chunks: list, lengths: list, max_tokens) -> list:
    """Build, for every chunk, the space-joined text of its leading sentences.

    Args:
        chunks: List of sentence lists, one per chunk.
        lengths: Per-chunk lists of sentence token counts, parallel to ``chunks``.
        max_tokens: Token budget handed to ``get_prefix`` for each chunk.

    Returns:
        One prefix string per chunk.
    """
    prefixes = []
    for chunk_sentences, chunk_lengths in zip(chunks, lengths):
        leading = get_prefix(chunk_sentences, chunk_lengths, max_tokens)
        prefixes.append(" ".join(leading))
    return prefixes

def get_suffixes(chunks: list, lengths: list, max_tokens) -> list:
    """Build, for every chunk, the space-joined text of its trailing sentences.

    Args:
        chunks: List of sentence lists, one per chunk.
        lengths: Per-chunk lists of sentence token counts, parallel to ``chunks``.
        max_tokens: Token budget handed to ``get_prefix`` for each chunk.

    Returns:
        One suffix string per chunk, with sentences in their original order.
    """
    suffixes = []
    for chunk_sentences, chunk_lengths in zip(chunks, lengths):
        # Feed the chunk back-to-front so get_prefix selects from its end,
        # then flip the selection back into reading order.
        picked_backwards = get_prefix(chunk_sentences[::-1], chunk_lengths[::-1], max_tokens)
        suffixes.append(" ".join(picked_backwards[::-1]))
    return suffixes

def add_overlaps(chunks: list, prefixes: list, suffixes: list):
    """Wrap each chunk text in a ``Chunk`` carrying its neighbouring context.

    Args:
        chunks: Chunk texts in document order.
        prefixes: Leading text of every chunk, parallel to ``chunks``.
        suffixes: Trailing text of every chunk, parallel to ``chunks``.

    Returns:
        List of ``Chunk`` objects. ``before`` is the previous chunk's suffix
        ("" for the first chunk) and ``after`` is the next chunk's prefix
        ("" for the last chunk).
    """
    last_idx = len(chunks) - 1
    result = []
    for idx, chunk in enumerate(chunks):
        if idx == 0:
            before_text = ""
        else:
            before_text = suffixes[idx - 1]

        if idx == last_idx:
            after_text = ""
        else:
            after_text = prefixes[idx + 1]

        result.append(Chunk(before=before_text, main=chunk, after=after_text))
    return result

def chunk_text_with_overlaps(long_text, max_chunk_tokens, max_overlap_tokens) -> List[Chunk]:
    """Split a long text into sentence-aligned chunks with separate overlaps.

    Unlike common chunking implementations, the overlaps are not merged into
    the chunk text; they are returned next to it in ``Chunk.before`` and
    ``Chunk.after``.

    Args:
        long_text: Text to split; sentences are found with ``sent_tokenize``.
        max_chunk_tokens: Token budget of a chunk's main text, counted with
            ``word_tokenize``.
        max_overlap_tokens: Token budget of each overlap.

    Returns:
        List of ``Chunk`` objects in document order.
    """
    sentences = sent_tokenize(long_text)
    token_counts = [len(word_tokenize(sentence)) for sentence in sentences]

    grouped_sentences, grouped_counts = chunk_text(sentences, token_counts, max_chunk_tokens)

    return add_overlaps(
        [" ".join(group) for group in grouped_sentences],
        get_prefixes(grouped_sentences, grouped_counts, max_overlap_tokens),
        get_suffixes(grouped_sentences, grouped_counts, max_overlap_tokens),
    )

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jurow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jurow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [92]:
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

# Stage every not-yet-corrected book as overlap-aware chunks in
# tmp_correction_chunks: recreate the table, chunk each book locally, then
# insert one row per chunk with parallel parameterised INSERT jobs.

# Tunables for this cell.
MAX_CHUNK_TOKENS = 3000    # token budget of a chunk's main text
MAX_OVERLAP_TOKENS = 1000  # token budget of each prefix / suffix overlap
INSERT_WORKERS = 10        # number of INSERT jobs running concurrently

clean_sql = "CREATE OR REPLACE TABLE tmp_correction_chunks(book_id STRING, chunk_number INTEGER, prefix STRING, original_txt STRING, suffix STRING, corrected_txt STRING)"
insert_sql = "INSERT INTO tmp_correction_chunks(book_id, chunk_number, prefix, original_txt, suffix) VALUES(@id, @idx, @prefix, @txt, @suffix)"

# One QueryJobConfig (i.e. one set of query parameters) per row to insert.
job_configs = []

client.query_and_wait(query = clean_sql)
select_query_job = client.query(query = "select * from books WHERE corrected_txt IS NULL")
any_row = False

for row in select_query_job.result():
    any_row = True
    # Single quotes inside the f-string: reusing the outer double quote is only
    # valid syntax from Python 3.12 on.
    print(f"\nChunking book: {row['title']}")
    text_to_split = row["original_txt"]
    book_id = row["book_id"]
    chunks = chunk_text_with_overlaps(text_to_split, max_chunk_tokens = MAX_CHUNK_TOKENS, max_overlap_tokens = MAX_OVERLAP_TOKENS)
    print(f"  Number of chunks: {len(chunks)}")
    print("  Processed: ", end = "")
    for idx, chunk in enumerate(chunks):
        print(f"{idx} ", end = "")
        # Deliberately not named `job_config`: that global holds the
        # notebook-wide default query configuration and must not be overwritten.
        insert_job_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ScalarQueryParameter("id", "STRING", book_id),
            bigquery.ScalarQueryParameter("idx", "INTEGER", idx),
            bigquery.ScalarQueryParameter("prefix", "STRING", chunk.before),
            bigquery.ScalarQueryParameter("txt", "STRING", chunk.main),
            bigquery.ScalarQueryParameter("suffix", "STRING", chunk.after)])
        job_configs.append(insert_job_config)

if any_row:
    print(f"\n\nExecuting {len(job_configs)} INSERT jobs...")
    print("Jobs Completed: ", end = "")

    def execute_insert_job(job_config):
        """Run one INSERT with the given parameters and wait for it to finish."""
        client.query_and_wait(insert_sql, job_config = job_config)

    completed_jobs = 0
    with ThreadPoolExecutor(max_workers = INSERT_WORKERS) as executor:
        future_to_config = {executor.submit(execute_insert_job, config): config for config in job_configs}
        for future in as_completed(future_to_config):
            # result() re-raises any exception from the worker, so a failed
            # INSERT stops the cell instead of passing unnoticed.
            future.result()
            completed_jobs += 1
            print(f"{completed_jobs} ", end = "")

    print("\nAll INSERT jobs completed.")
else:
    print("No books to correct.")


Chunking book: The theatre of ideas, a burlesque allegory, and three one-act plays: The goal, Her tongue, Grace Mary
  Number of chunks: 16
  Processed: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Chunking book: Liberal Judaism and Social Service
  Number of chunks: 17
  Processed: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 

Executing 33 INSERT jobs...
Jobs Completed: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 
All INSERT jobs completed.


In [93]:
%%bigquery
-- Runs the phase2_correction_correct stored procedure (defined outside this notebook).
-- Presumably it corrects the chunks staged in tmp_correction_chunks by the previous cell - TODO confirm.
CALL phase2_correction_correct();

Query is running:   0%|          |

### Phase 3 - Chunking books
Simply divides each book into overlapping fragments.

In [94]:
%%bigquery

-- Runs the phase3_prepare_chunks stored procedure (defined outside this notebook).
call phase3_prepare_chunks();

Query is running:   0%|          |

### Phase 4 - Summarizing
Prepares a concise summary of each book. It is used as supplementary information in several later inference operations.

In [95]:
%%bigquery

-- Two stored procedures (defined outside this notebook), run in this order.
-- Judging by their names: first summarise each fragment, then summarise those
-- summaries - TODO confirm against the procedure definitions.
call phase4_fragments_summarization();
call phase4_summarize_fragments_summaries();

Query is running:   0%|          |

### Phase 5 - Character identification
By far the most difficult part of the whole project.
It can take a lot of time, so it is divided into multiple small cells.
Note that BigQuery AI usage limits may block execution for some time; nevertheless, it should eventually finish correctly.

In [96]:
%%bigquery

-- Two stored procedures (defined outside this notebook), run in this order.
-- Judging by their names: collect character identifiers from the fragments,
-- then compute their initial embeddings - TODO confirm.
call phase5_prepare_character_ids_from_fragments();
call phase5_prepare_character_ids_initial_embeddings();

Query is running:   0%|          |

In [97]:
%%bigquery

-- First duplicate-merge pass, without pruning (procedure defined outside this notebook).
call phase5_merge_characters_duplicates();

Query is running:   0%|          |

In [98]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 1,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 1;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [99]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 2,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 2;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [100]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 3,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 3;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [101]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 4,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 4;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [102]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 5,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 5;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [103]:
%%bigquery

-- Merge pass followed by pruning: delete identifiers with importance <= 6,
-- then delete embeddings whose id no longer appears in identifiers.
-- NOTE(review): NOT IN matches nothing if identifiers.id can contain NULL,
-- and the comparison uses id only - confirm ids are non-null and unique.
call phase5_merge_characters_duplicates();
delete from identifiers where importance <= 6;
delete from tmp_characters_id_embeddings where id not in (select id from identifiers);

Query is running:   0%|          |

In [None]:
%%bigquery

-- Repeat the merge procedure until it reports merged = 0 through its argument
-- (presumably an OUT/INOUT count of merges done in that pass - TODO confirm),
-- then rebuild the identifier JSONs.
-- NOTE(review): the loop has no iteration cap; it ends only once merged reaches 0.
BEGIN
  DECLARE merged INT64 DEFAULT 0;
  REPEAT
    call phase5_merge_characters_duplicates_with_return_param(merged);
  UNTIL merged = 0
  END REPEAT;
END;

call phase5_rebuild_indentifier_jsons();

Executing query with job ID: d298c25a-07e7-4d09-881f-cd9c7ee7aba4
Query executing: 1203.00s

### Phase 6 - Extracting information
This stage finally gathers the data we want. As an example, this project extracts gender, financial status, social class and moral values.
In real-life applications, any other traits can be chosen by adapting the prompts and the parameter values in the clustering stage.

In [None]:
%%bigquery

-- Four stored procedures (defined outside this notebook), run in this order.
-- Judging by their names: gather per-chunk character data, group the data that
-- belongs to the same character, merge it, then copy the final result - TODO confirm.
CALL phase6_gather_characters_full_data_from_chunks();

CALL phase6_group_the_same_characters_data_for_merging();
CALL phase6_merge_data();

CALL phase6_copy_final_data();

Query is running:   0%|          |

### Phase 7 - Clustering
This phase takes the raw data gathered for each trait of interest and clusters it into consistent groups.
The number of clusters is chosen arbitrarily and can be changed in the last three cells (first line of each cell).
Running the clustering again regenerates the clusters without damaging the core data.

In [None]:
%%bigquery

-- Reset earlier clustering results: clear the three cluster-id columns on
-- characters and empty the clusters and character_cluster_details tables.
-- WHERE 1 = 1 is present because BigQuery UPDATE/DELETE statements require a WHERE clause.
UPDATE characters SET social_class_cluster_id = NULL, wealth_cluster_id = NULL, values_cluster_id = NULL WHERE 1 = 1;
DELETE FROM clusters WHERE 1 = 1;
DELETE FROM character_cluster_details WHERE 1 = 1;

Query is running:   0%|          |

In [None]:
%%bigquery --params {"clusters": 10, "trait_type": "values", "trait_desc": 'values: "Core principles, moral compass, priorities (can include both positive and negative values)"'}

-- Cluster the "values" trait. @clusters, @trait_type and @trait_desc come from
-- the --params dictionary on this cell's %%bigquery line.
-- 1. Stage the non-null characters.values column as tmp_copied_traits.traits.
-- 2. Run the phase7_* procedures (defined outside this notebook) in order.
-- 3. Copy cluster_id from tmp_combined_traits into characters.values_cluster_id,
--    matching on book_id, id and cluster_type = @trait_type.
create or replace table tmp_copied_traits as SELECT c.book_id, c.id, c.values as traits from characters c WHERE c.values is not null;
CALL phase7_split_traits(@trait_desc);
CALL phase7_identify_clusters(@clusters);
CALL phase7_name_clusters(@trait_type, @trait_desc);
CALL phase7_assign_clusters(@trait_type);
UPDATE characters c SET values_cluster_id = ct.cluster_id FROM tmp_combined_traits ct WHERE c.book_id = ct.book_id AND c.id = ct.id AND ct.cluster_type = @trait_type;

Query is running:   0%|          |

In [None]:
%%bigquery --params {"clusters": 7, "trait_type": "wealth", "trait_desc": 'wealth: "Economic position, assets, property, financial struggles or abundance with information how wealth/income is obtained (inheritance, labor, trade, crime, patronage, etc.)"'}

-- Cluster the "wealth" trait. @clusters, @trait_type and @trait_desc come from
-- the --params dictionary on this cell's %%bigquery line.
-- 1. Stage the non-null characters.wealth column as tmp_copied_traits.traits.
-- 2. Run the phase7_* procedures (defined outside this notebook) in order.
-- 3. Copy cluster_id from tmp_combined_traits into characters.wealth_cluster_id,
--    matching on book_id, id and cluster_type = @trait_type.
create or replace table tmp_copied_traits as SELECT c.book_id, c.id, c.wealth as traits from characters c WHERE c.wealth is not null;
CALL phase7_split_traits(@trait_desc);
CALL phase7_identify_clusters(@clusters);
CALL phase7_name_clusters(@trait_type, @trait_desc);
CALL phase7_assign_clusters(@trait_type);
UPDATE characters c SET wealth_cluster_id = ct.cluster_id FROM tmp_combined_traits ct WHERE c.book_id = ct.book_id AND c.id = ct.id AND ct.cluster_type = @trait_type;

Query is running:   0%|          |

In [None]:
%%bigquery --params {"clusters": 7, "trait_type": "social_class", "trait_desc": 'social_class: "Economic and social standing (e.g., nobility, working class, merchant class)"'}

-- Cluster the "social_class" trait. @clusters, @trait_type and @trait_desc come
-- from the --params dictionary on this cell's %%bigquery line.
-- 1. Stage the non-null characters.social_class column as tmp_copied_traits.traits.
-- 2. Run the phase7_* procedures (defined outside this notebook) in order.
-- 3. Copy cluster_id from tmp_combined_traits into characters.social_class_cluster_id,
--    matching on book_id, id and cluster_type = @trait_type.
create or replace table tmp_copied_traits as SELECT c.book_id, c.id, c.social_class as traits from characters c WHERE c.social_class is not null;
CALL phase7_split_traits(@trait_desc);
CALL phase7_identify_clusters(@clusters);
CALL phase7_name_clusters(@trait_type, @trait_desc);
CALL phase7_assign_clusters(@trait_type);
UPDATE characters c SET social_class_cluster_id = ct.cluster_id FROM tmp_combined_traits ct WHERE c.book_id = ct.book_id AND c.id = ct.id AND ct.cluster_type = @trait_type;

Query is running:   0%|          |