In [293]:
%%script false --no-raise-error
# Optional authentication step: builds service-account credentials from a JSON
# key returned by google.colab's userdata.get('BIGQUERY_CREDENTIALS') and
# installs them on the BigQuery magics context.
# NOTE(review): the `%%script false --no-raise-error` line above hands this cell
# to an external `false` program instead of the Python kernel, so nothing below
# runs as Python while that line is present. The recorded output shows that the
# program could not be found on the machine that produced this export.
import json
from google.colab import userdata
from google.oauth2 import service_account
from google.cloud.bigquery import magics

# The stored value is parsed with json.loads, so it must be a JSON document.
credentials_json = userdata.get('BIGQUERY_CREDENTIALS')
# `credentials` is looked up by name (globals().get) when the client is created
# in the next cell; when this cell is skipped that lookup yields None.
credentials = service_account.Credentials.from_service_account_info(json.loads(credentials_json))
magics.context.credentials = credentials

Couldn't find program: 'false'


In [294]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
%load_ext bigquery_magics

data_set = "testing_set8"
project_name = "emerald-entity-468916-f9"
endpoint = "gemini-2.5-flash-lite"
connection_id = "us.ai_connection"

job_config = bigquery.QueryJobConfig(default_dataset = f"{project_name}.{data_set}")
client = bigquery.Client(project = project_name, default_query_job_config = job_config, credentials = globals().get('credentials', None))
magics.context.default_query_job_config = job_config
magics.context.project = project_name

The bigquery_magics extension is already loaded. To reload it, use:
  %reload_ext bigquery_magics


### Tables creation
Initial table creation, required so that the SQL procedures defined later pass validation when they are created.

In [295]:
%%bigquery

-- Core pipeline tables. IF NOT EXISTS leaves tables (and their rows) that
-- already exist untouched, so this cell is safe to re-run.

CREATE TABLE IF NOT EXISTS prompts (
  code STRING,
  prompt STRING
);

CREATE TABLE IF NOT EXISTS books (
  book_id STRING,
  date INTEGER,
  title STRING,
  original_txt STRING,
  corrected_txt STRING,
  summary STRING,
  processed BOOL
);

CREATE TABLE IF NOT EXISTS chunks (
  book_id STRING,
  chunk_number INTEGER,
  txt STRING,
  summary STRING,
  fragment_number STRING,
  characters_id_data STRING,
  characters_full_data STRING
);

CREATE TABLE IF NOT EXISTS identifiers (
  id INTEGER,
  book_id STRING,
  full_name STRING,
  importance INTEGER,
  chunk_number INTEGER,
  fragment_number STRING,
  characters_id_data STRING,
  information STRING
);

CREATE TABLE IF NOT EXISTS characters (
  book_id STRING,
  id INT64,
  full_name STRING,
  sex STRING,
  importance INT64,
  social_class STRING,
  wealth STRING,
  values STRING,
  information STRING,
  social_class_cluster_id INT64,
  wealth_cluster_id INT64,
  values_cluster_id INT64
);

CREATE TABLE IF NOT EXISTS clusters (
  cluster_type STRING,
  cluster_id INT64,
  name STRING,
  description STRING
);

CREATE TABLE IF NOT EXISTS character_cluster_details (
  book_id STRING,
  id INT64,
  trait STRING,
  trait_id INT64,
  cluster_type STRING,
  cluster_id INT64
);

Query is running:   0%|          |

In [None]:
%%bigquery

-- Scratch tables. CREATE OR REPLACE drops any existing table of the same name,
-- so every run of this cell starts with empty tmp_* tables.

CREATE OR REPLACE TABLE tmp_correction_chunks (
  book_id STRING,
  chunk_number INTEGER,
  prefix STRING,
  original_txt STRING,
  suffix STRING,
  corrected_txt STRING
);

CREATE OR REPLACE TABLE tmp_grouped_full_data (
  id INTEGER,
  book_id STRING,
  parts INTEGER,
  full_data_array STRING,
  character_full_data STRING,
  information STRING
);

CREATE OR REPLACE TABLE tmp_characters_id_embeddings (
  ml_generate_embedding_result ARRAY<FLOAT64>,
  ml_generate_embedding_statistics JSON,
  ml_generate_embedding_status STRING,
  content STRING,
  title STRING,
  id INT64,
  book_id STRING,
  chunk_number INT64,
  is_query BOOL
);

CREATE OR REPLACE TABLE tmp_copied_traits (
  id INTEGER,
  book_id STRING,
  traits STRING
);

CREATE OR REPLACE TABLE tmp_clustered_traits (
  CENTROID_ID INT64,
  NEAREST_CENTROIDS_DISTANCE ARRAY<STRUCT<CENTROID_ID INT64, DISTANCE FLOAT64>>,
  ml_generate_embedding_result ARRAY<FLOAT64>,
  content STRING,
  trait_id INT64
);

CREATE OR REPLACE TABLE tmp_split_traits_final (
  book_id STRING,
  id INT64,
  trait STRING,
  trait_id INT64,
  cluster_id INT64
);

CREATE OR REPLACE TABLE tmp_combined_traits (
  book_id STRING,
  id INT64,
  cluster_type STRING,
  traits STRING,
  cluster_id INT64
);

CREATE OR REPLACE TABLE tmp_combined_traits_clustered (
  CENTROID_ID INT64,
  NEAREST_CENTROIDS_DISTANCE ARRAY<STRUCT<CENTROID_ID INT64, DISTANCE FLOAT64>>,
  ml_generate_embedding_result ARRAY<FLOAT64>,
  content STRING,
  id INT64,
  book_id STRING
);

CREATE OR REPLACE TABLE tmp_chunks_character_full_data (
  book_id STRING,
  chunk_number INT64,
  characters_full_data STRING
);

Query is running:   0%|          |

### Inference prompts

Simple json format correction prompt

In [None]:
%%bigquery
-- Upsert the 'json' prompt template. It carries two %s placeholders: one in the
-- opening sentence and one between the <json> tags.
MERGE prompts AS dst
USING (
  SELECT
    'json' AS code,
'''You are an expert in JSON format .
Please analyse given JSON %s and correct format errors, if any.

The JSON to be analysed is below between <json> tags.
<json>
%s
</json>

Return only the corrected JSON as a response, without any comments.
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Correction of scanned books OCR results

In [None]:
%%bigquery
-- Upsert the 'correct' prompt template (OCR correction). Its three %s
-- placeholders are, in order: text before, fragment to fix, text after.
MERGE prompts AS dst
USING (
  SELECT
    'correct' AS code,
'''Your task is to fix scanning or optical character recognition (OCR) errors in given book fragment.
- Example of scanned text with errors: "which* Jterhapf". Correct text: "which perhaps" or "which, perhaps"
Please also change forms of old english (if any) to modern ones, if it doesn't change the meaning of the text. 
- For example, please change "long s" (ſ or f) to normal "s".

There can be many different types of errors, so please analyze the text carefully to establish the correct content.

Below you will receive a fragment of some book, and your job is to correct all mistakes and output valid, corrected text.
The fragment is between the tags <fragment_to_fix> and </fragment_to_fix>.

To better understand the context and meaning of the fragment, we also provide some portions of text before and after it.
These are placed between the tags <fragment_before> and </fragment_before> for the portion before the main text, and <fragment_after> and </fragment_after> for the portion after the main text.
The portions before and after may be empty if the main fragment is from the very beginning or the very end of the book.
Please do not try to correct the portions before and after — they are only for you to better understand the context and meaning of the main fragment.

Take into consideration that some books are over 200 years old, so the language may be slightly different from used currently.
After preparing the corrected text, read it again to double-check its correctness.

<fragment_before>
%s
</fragment_before>

<fragment_to_fix>
%s
</fragment_to_fix>

<fragment_after>
%s
</fragment_after>

Return corrected text only, without any additional comments.
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Book fragment summarization

In [None]:
%%bigquery
-- Upsert the 'summarize' prompt template. Placeholders: %d (character limit of
-- the summary) followed by %s (the book fragment).
MERGE prompts AS dst
USING (
  SELECT
    'summarize' AS code,
'''You are an expert in summarizing books.
Your goal is to summarize a book fragment. Summary should contain no more then %d characters.

The fragment is provided between <book_fragment> tags below:
<book_fragment>
%s
</book_fragment>

Return just a summary wihout any additional comments.
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Whole book summarization

In [None]:
%%bigquery
-- Upsert the 'reduce_summary' prompt template. Its single %s placeholder
-- receives the concatenated fragment summaries.
MERGE prompts AS dst
USING (
  SELECT
    'reduce_summary' AS code,
'''You are an expert in summarizing books.
Your goal is to prepare book summary based on summaries of all book fragments.
The result summary should contain around 20000 characters.
Avoid redundancy while maintaining comprehensiveness when summarizing.

Concatenated summaries of all book fragments are placed below, between <fragment_summaries> tags:
<fragment_summaries>
%s
</fragment_summaries>

Return just a summary without any additional comments.
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Extracting characters identifying information from given book fragment

In [None]:
%%bigquery
-- Upsert the 'characters_id_data' prompt template. Its single %s placeholder
-- receives the book fragment.
-- Fix: the schema shown to the model no longer has a trailing comma after the
-- last property ("importance"). The prompt demands valid JSON output, so the
-- schema it presents should itself be valid JSON.
MERGE prompts p USING (SELECT
'''You are an expert in extracting information from books and manipulating JSON structures.
Your task is to provide identifying information of significant human characters (called later Individuals) from given book fragment.

Provide the output as a JSON array, single Individual should be described in separate JSON object in the array.
The schema definition is below.
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "full_name": {"type": "string", "maxLength": 300, "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned. Include all names if Individual has many names."},
	    "information": {"type": "string", "maxLength": 1500, "description": "Any information that may help uniquely identify given Individual, e.g. sex, age, origin, physical appearance, distinguishing marks or features and other helpful information"},
	    "importance": {"type": "integer", "description": "Number of sentences being related in any way to given Individual in the text"}
    }
  }
}

The book fragment for Individuals analysis is provided between <book_fragment> tags.
<book_fragment>
%s
</book_fragment>

Important Guidelines for choosing Individuals to be included in the JSON array:
1. Omit unnamed crowd members or minor Individuals mentioned only in passing.
2. Add only Individuals who play meaningful roles or are described in details in given book fragment. 

Other important Guidelines:
1. Do not duplicate JSON objects for the same Individual.
2. Strictly respect the maximum character limits for each field, especially max 1500 characters for `information`. Summarize to reduce size, if necessary.

Before returning, fix all format errors in JSON array, if any.
Return only corrected JSON array as a response, without any additional comments.   
''' prompt, 'characters_id_data' code) ip ON p.code = ip.code
WHEN MATCHED THEN UPDATE SET prompt = ip.prompt
WHEN NOT MATCHED THEN INSERT (code, prompt) VALUES(code, prompt);

Query is running:   0%|          |

Identifying the same characters in duplicate candidates pairs

In [None]:
%%bigquery
-- Upsert the 'find_the_same_characters' prompt template. Placeholders: %s
-- (JSON array of candidate pairs) followed by %s (whole-book summary).
MERGE prompts AS dst
USING (
  SELECT
    'find_the_same_characters' AS code,
'''You are an expert in comparing human characters (called later "Individuals") based on descriptions from different fragments of the same book.
Your task is to analyse each pair of Individuals information and names, and provide judgement whether both Individuals are in fact the same person, or not.

The input data are provided as a JSON array. Each Individuals pair is described in separate JSON object with a flat structure.
The input schema definition is below.
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
	"id": {"type": "integer", "description": "A unique identifier of a pair"},
		"first_individual_full_name": {"type": "string", "description": "Full name of the first Individual"},
		"first_individual_information": {"type": "string", "description": "Additional information describing first Individual"},
		"second_individual_full_name": {"type": "string", "description": "Full name of the second Individual"},
		"second_individual_information": {"type": "string", "description": "Additional information describing second Individual"}
    }
  }
}

The input data for analysis is provided between <individual_pairs> tags.
<individual_pairs>
%s
</individual_pairs>

As a supplementary information you can use the summary of the whole book. It is placed between <summary> tags.
<summary>
%s
</summary>

As the result please return JSON array containing only pairs where first and second Individual are in fact the same person. If unsure, assume they are different persons and do not return.
If there are no pairs containing the same Individual, then return empty array.
Return only JSON array, without any additional comments.   
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Identifying different characters in duplicate candidates pairs - double check of previous prompt result

In [None]:
%%bigquery
-- Upsert the 'find_different_characters' prompt template. Placeholders: %s
-- (JSON array of candidate pairs) followed by %s (whole-book summary).
MERGE prompts AS dst
USING (
  SELECT
    'find_different_characters' AS code,
'''You are an expert in comparing human characters (called later "Individuals") based on descriptions from different fragments of the same book.
Your task is to analyse each pair of Individuals information and names, and provide judgement whether both Individuals are in fact the same person, or not.

The input data are provided as a JSON array. Each Individuals pair is described in separate JSON object with a flat structure.
The input schema definition is below.
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
	"id": {"type": "integer", "description": "A unique identifier of a pair"},
		"first_individual_full_name": {"type": "string", "description": "Full name of the first Individual"},
		"first_individual_information": {"type": "string", "description": "Additional information describing first Individual"},
		"second_individual_full_name": {"type": "string", "description": "Full name of the second Individual"},
		"second_individual_information": {"type": "string", "description": "Additional information describing second Individual"}
    }
  }
}

The input data for analysis is provided between <individual_pairs> tags.
<individual_pairs>
%s
</individual_pairs>

As a supplementary information you can use the summary of the whole book. It is placed between <summary> tags.
<summary>
%s
</summary>

As the result please return JSON array containing only pairs where first and second Individual are different persons. If unsure, assume they are different.
Before returning, check and fix all format errors in JSON array, if any.
Return only JSON array, without any additional comments.   
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Another check of ready to merge array with identifying information of (most probably) the same character

In [None]:
%%bigquery
-- Upsert the 'merge_character_ids_double_check' prompt template. Placeholders:
-- %s (JSON array of Individuals) followed by %s (whole-book summary).
MERGE prompts AS dst
USING (
  SELECT
    'merge_character_ids_double_check' AS code,
'''You are an expert in comparing human characters (called later "Individuals") based on descriptions from different fragments of the same book.
As an input data to analysis you have an array of JSON objects representing Individuals. Previous analysis indicated that each Individual in the array is in fact the same person.
Your task is to analyse all JSON objects in the array and double check if all the objects indeed represent the same person.

Please see input JSON array schema definition:
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "full_name": {"type": "string", "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned. Include all names if Individual has many names."},
	    "information": {"type": "string", "description": "Any information that may help uniquely identify given Individual, e.g. sex, age, origin, physical appearance, distinguishing marks or features and other helpful information"}
    }
  }
}

JSON array with Individuals suspected to be the same person is below, between <character> tags:
<character>
%s
</character>

As a supplementary information you can use the summary of the whole book. It is placed between <summary> tags below.
<summary>
%s
</summary>

Please return "true" if you think that all Individuals indeed represent the same person, return "false" otherwise. If in doubt return "false"
Return only one word as a response, without any additional comments.   
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Merging array with identifying information of the same character to single character data

In [None]:
%%bigquery
-- Upsert the 'merge_character_ids' prompt template. Placeholders: %s (JSON
-- array describing one Individual) followed by %s (whole-book summary).
-- Fixes to the instruction text:
--   * guideline 4 had no verb ("Strictly maximum character limits"); it now
--     reads "Strictly respect the maximum character limits";
--   * the final format check referred to a "JSON array" although the required
--     result is a single JSON object;
--   * "comming" -> "coming".
MERGE prompts p USING (SELECT
'''You are an expert in merging human characters names and descriptions, coming from different fragments of the same book. (human characters are called later "Individuals") 
Your task is to merge single Individual identifying information based on a book.

The input data is an array of JSON objects each representing the same Individual but based on different fragment of the same book.
The order of JSON objects is the same as order of fragments in the book.
Please see input data schema definition:
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "full_name": {"type": "string", "maxLength": 300, "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned. Include all names if Individual has many names."},
	    "information": {"type": "string", "maxLength": 2500, "description": "Any information that may help uniquely identify given Individual, e.g. sex, age, origin, physical appearance, distinguishing marks or features and other helpful information"}
    }
  }
}

Between <character> tags is input JSON array with Individual data to be merged.
<character>
%s
</character>

As a supplementary information you can use the summary of the whole book. It is placed between <summary> tags below.
<summary>
%s
</summary>

Important Guidelines for merging data:

1. The result should be just one single JSON object (not array) containing summarized full_name and information from all JSON objects in the array.
2. Avoid redundancy while maintaining comprehensiveness when merging.
3. The format of result JSON object should be the same as the format on JSON objects in the array.
4. Strictly respect the maximum character limits for both fields. Summarize if necessary.

Before returning, check and fix all format errors in the JSON object, if any.
Return only merged JSON object as a response, without any additional comments.   

''' prompt, 'merge_character_ids' code) ip ON p.code = ip.code
WHEN MATCHED THEN UPDATE SET prompt = ip.prompt
WHEN NOT MATCHED THEN INSERT (code, prompt) VALUES(code, prompt);

Query is running:   0%|          |

Extract desired characters traits from given book fragment

In [None]:
%%bigquery
-- Upsert the 'extract_data' prompt template. Placeholders: %s (JSON array of
-- Individuals with id, full_name and information) followed by %s (book fragment).
-- Fixes to the instruction text:
--   * the sentence introducing the example was garbled ("is the output example
--     is just to show ... reacher in details") and is now two clear clauses;
--   * "You main task" -> "Your main task".
MERGE prompts p USING (SELECT
'''Your task is to provide given human characters analysis based on given book fragment. (human characters are called later "Individuals")
The analysis serves academic research on understanding human characteristics across different historical periods, geographical regions, and social statuses.

Data format:
Data are stored in a JSON array. Each Individual is described in separate JSON object with a flat structure as specified in schema below.
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "id": {"type": "integer", "description": "A unique identifier of an Individual"},
      "full_name": {"type": "string", "maxLength": 300, "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned, e.g.: 'Victor Frankenstein, M.D.'"},
      "information": {"type": "string", "maxLength": 1500, "description": "Additional information describing given Individual"},
      "sex": {"type": "string", "maxLength": 100, "description": "'male', 'female' or 'non-binary'"},
      "social_class": {"type": "string", "maxLength": 800, "description": "Economic and social standing (e.g., 'nobility', 'working class', 'merchant class')"},
      "wealth": {"type": "string", "maxLength": 800, "description": "Economic position, assets, property, financial struggles or abundance with information how wealth/income is obtained (inheritance, labor, trade, crime, patronage, etc.)"},
      "values": {"type": "string", "maxLength": 1600, "description": "Core principles, moral compass, priorities (can include both positive and negative values)"}
    }
  }
}

Between <example> tags is an output example. It is just to show the output structure; the real analysis results will probably be significantly larger and richer in details.
<example>
[
  {
    "id": 78,
    "full_name": "Victor Frankenstein, M.D.",
    "sex": "male",
    "social_class": "Upper class, Geneva aristocracy",
    "wealth": "Wealthy through family fortune, owns estate in Geneva, sufficient funds for extended travels and education. Wealth obtained thanks family inheritance, father's position as syndic, old Geneva money",
    "values": "Knowledge, scientific progress, family loyalty, later: justice and revenge"
  },
  {
    "id": 3,
    "full_name": "James Johnson, known as 'Old Jim the Miller'",
    "sex": "male",
    "social_class": "Middle class tradesman, respected in village",
    "wealth": "Comfortable middle class, owns mill and cottage, savings of approximately 200 pounds. Wealth sources: milling fees, grain trading profits, small loans to farmers at harvest time",
    "values": "Hard work, family legacy, honest trade, community solidarity, tradition"
  }
]
</example>

Between tags <characters> is input JSON array with Individuals to be analysed. Only `id`, `full_name` and `information` fields are prepopulated.
<characters>
%s
</characters>

The current book fragment for analysis is provided between <book_fragment> tags.
<book_fragment>
%s
</book_fragment>

Your main task is to add missing fields in each Individual JSON object based on book fragment analysis.
Additional guidelines:
1. Fields `full_name` and `information` are already populated and should be used to identify Individuals in given book fragment
2. Please do not modify content of the fields `full_name` and `id`.
3. Please do not add any new Individuals to the JSON array
4. Strictly observe the maximum character count for each field, comparing it with "maxLength" size. Summarize if necessary
5. Leave fields empty rather than speculating
6. Avoid redundancy while maintaining comprehensiveness
7. Please remove `information` fields in output array.

Return only supplemented JSON array as a response, without any additional comments.   
''' prompt, 'extract_data' code) ip ON p.code = ip.code
WHEN MATCHED THEN UPDATE SET prompt = ip.prompt
WHEN NOT MATCHED THEN INSERT (code, prompt) VALUES(code, prompt);

Query is running:   0%|          |

Merge given character traits gathered from all book fragments

In [None]:
%%bigquery
-- Upsert the 'merge_character' prompt template. Placeholders, in order: %s
-- (JSON array of per-fragment analyses), %s (whole-book summary), %s (short
-- overall information about the character).
MERGE prompts AS dst
USING (
  SELECT
    'merge_character' AS code,
'''Your task is to merge human character analyses based on a book, preserving source information, while avoiding redundancy.

The input data is an array of JSON objects each representing the same character but based on different fragment of the same book.
The order of JSON objects is the same as order of fragments in the book.
Please see input data schema definition:
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "id": {"type": "integer", "description": "A unique identifier of an Individual. Can be ignored."},
      "full_name": {"type": "string", "maxLength": 300, "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned, e.g.: 'Victor Frankenstein, M.D.'"},
      "sex": {"type": "string", "maxLength": 100, "description": "'male', 'female' or 'non-binary'"},
      "social_class": {"type": "string", "maxLength": 800, "description": "Economic and social standing (e.g., 'nobility', 'working class', 'merchant class')"},
      "wealth": {"type": "string", "maxLength": 800, "description": "Economic position, assets, property, financial struggles or abundance with information how wealth/income is obtained (inheritance, labor, trade, crime, patronage, etc.)"},
      "values": {"type": "string", "maxLength": 1600, "description": "Core principles, moral compass, priorities (can include both positive and negative values)"}
    }
  }
}

Between <character> tags is input JSON array with character data to be merged.
<character>
%s
</character>

As a supplementary information you can use:
- the summary of the whole book. It is placed between <summary> tags.
- the short overall information about given character. It is placed between <information> tags.
<summary>
%s
</summary>

<information>
%s
</information>


Important Guidelines for merging data:

1. The result should be just one single JSON object (not array) containing summarized information from all JSON objects in the array.
2. For each field please prepare a comprehensive summary of source fields, preserving the informations from each object, but avoiding redundancy.
3. The format of result JSON object should be the same as the format on JSON objects in the array.
4. Leave fields empty if they are empty in each JSON object, rather than speculating.

After preparing the JSON object, please again check each field against redundancy. Summarize it again if necessary to reduce redundant information.

Return only merged JSON object as a response, without any additional comments.   
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Correct potential errors in final array with character traits

In [None]:
%%bigquery
-- Upsert the 'json_final_check' prompt template. Its single %s placeholder
-- receives the JSON object to be checked.
MERGE prompts AS dst
USING (
  SELECT
    'json_final_check' AS code,
'''Your task is to check and correct errors in input JSON object, if any.
Please see schema definition:
{
  "type": "object",
  "properties": {
    "id": {"type": "integer", "description": "A unique identifier of an Individual. Can be ignored."},
    "full_name": {"type": "string", "description": "Full name including titles, nicknames, aliases, maiden names, or pseudonyms if mentioned, e.g.: 'Victor Frankenstein, M.D.'"},
    "sex": {"type": "string", "description": "'male', 'female' or 'non-binary'"},
    "social_class": {"type": "string", "description": "Economic and social standing (e.g., 'nobility', 'working class', 'merchant class')"},
    "wealth": {"type": "string", "description": "Economic position, assets, property, financial struggles or abundance with information how wealth/income is obtained (inheritance, labor, trade, crime, patronage, etc.)"},
    "values": {"type": "string", "description": "Core principles, moral compass, priorities (can include both positive and negative values)"}
  }
}

The JSON object to be checked is between <character> tags below:
<character>
%s
</character>

Guidelines:
1. Please check if all fields (except `id`) are of text type, if not, then please convert them to text, preserving all the information. For example: if value is an array, please convert it to semicolon separated text containing all array elements.
2. If the field doesn't contain any real information, please remove the field altogether. For example, the field value may be: "Unknown", "not specified", "null", "None mentioned", empty text, etc.
3. Please correct JSON format errors, if any.

Return only corrected JSON object as a response, without any additional comments.   
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);


Query is running:   0%|          |

Rank books based on the inclusion of human characters - used in phase 1 when randomly searching for books suitable for processing

In [None]:
%%bigquery
-- Upsert the 'rank_books' prompt template. Its single %s placeholder receives
-- the book data in JSON format.
MERGE prompts AS dst
USING (
  SELECT
    'rank_books' AS code,
'''You are the assistant analysing book data.
Your task is to predict if given book contains meaningful human characters information, no matter if fictitious or real.
You need to do the prediction based on first fragment of the book, its title and metadata containing themes from the book.

The task is part of the project aimed at understanding human characteristics across different historical periods, geographical regions, and social statuses, based on human descriptions from books (both fictional and real).
Many books, like scientific ones, financial reports, etc., do not contain human characters at all or only vaguely mentions humans. We need to exclude such books from further analysis.
We are interested in book containing rich descriptions of individual humans. For example: it can be biographies, all kind of novels with lively human characters, investigative journalism or reporting focused on humans, and others.

The book data is provided below between <book> tags, in a JSON format.
note: the book was scanned and may contain many optical character recognition (OCR) errors.
<book>
%s
</book>

As an answer please return number from 1 to 3 where:
  - "1" means: the book is not about humans
  - "2" means: human characters occur in book, but scarcely described.
  - "3" means: human characters occur and are depicted in details

Please return just a number without additional comments.
''' AS prompt
) AS src
ON dst.code = src.code
WHEN MATCHED THEN
  UPDATE SET prompt = src.prompt
WHEN NOT MATCHED THEN
  INSERT (code, prompt) VALUES (src.code, src.prompt);

Query is running:   0%|          |

Split traits to semantically different parts and sanitize them by removing irrelevant information

In [None]:
%%bigquery
-- Upsert the 'split_traits' prompt template. Placeholders, in order: %s (kind
-- of trait the text describes), %s (the trait text), %s (character description).
-- Fixes to the prompt text:
--   * the second few-shot example OUTPUT contained the misspelling "wealty",
--     which the model may copy into generated traits; now "wealthy";
--   * "suplementary" -> "supplementary", "Esure" -> "Ensure".
MERGE prompts p USING (SELECT
'''You are an expert in splitting information based on semantic meaning.

Your task is to divide the given text into semantically distinct informations.

The text describes human characters %s.

Rules:
1. Split ONLY if the text contains semantically different parts
2. If the text describes a single coherent information, return it as one item
3. Preserve context - don't split related concepts that belong together
4. Remove redundant words but keep meaning intact

Examples for three different types of traits; values, social class and wealth:
- Input: "Kingdom's strength, victory, leadership; Righteousness, religious observance, victory through God's help."
- Output: ["Kingdom's strength", "victory and leadership", "righteousness and religious observance", "God's help"]

- Input: "Lieutenant, Officer in Kmita's company. Son of the Kokosinski family who use the seal of Pypka. Former outlaw."  
- Output: ["Lieutenant, Officer", "Former outlaw", "Probably belongs to wealthy family"]

- Input: "Described as a poor exile without a roof over his head, implying no significant wealth. Also noted as being part of Kmita's company and having implied noble standing."
- Output: ["poor, no significant wealth", "homeless"]

Provide the output as a JSON array of strings. The schema definition is below:
{"type": "array", "items": {"type": "string"}}

The text for analysis is provided between <traits> tags below:
<traits>
%s
</traits>

As a supplementary information you can use character description provided between <information> tags below:
<information>
%s
</information>

Ensure that the JSON array is correctly formatted.
Return only JSON array as a response, without any additional comments.
''' prompt, 'split_traits' code) ip ON p.code = ip.code
WHEN MATCHED THEN UPDATE SET prompt = ip.prompt
WHEN NOT MATCHED THEN INSERT (code, prompt) VALUES(code, prompt);

Query is running:   0%|          |

Analyze traits sample from single cluster to name cluster and provide short description

In [None]:
%%bigquery
-- Upsert the 'cluster_traits' prompt template. Placeholders: %s (kind of trait
-- the clusters describe) followed by %s (JSON array of clusters with examples).
-- Fix: the output schema was missing the comma between the "cluster_name" and
-- "cluster_description" properties, so the schema given to the model was not
-- valid JSON even though the prompt asks for a correctly formatted array.
MERGE prompts p USING (SELECT
'''You are an expert in text clusters classification.
Your task is to name and describe each cluster extracted from large amount of examples describing human characters %s.

As an input you have a JSON array containing cluster id (field `cluster_id`) and examples of traits fitting into this cluster (field: `examples`).
The input is between <clusters> tag.
<clusters>
%s
</clusters>

For each cluster please: 
  - invent a short name, using one or few words, describing it most adequately
  - provide, a one or few sentences long, adequate description
Please output the result as a JSON array with following schema:
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "cluster_id": {"type": "integer"},
      "cluster_name": {"type": "string", "maxLength": 50},
      "cluster_description": {"type": "string", "maxLength": 1000}
    }
  }
}

Ensure that the JSON array is correctly formatted.
Return only JSON array as a response, without any additional comments.
''' prompt, 'cluster_traits' code) ip ON p.code = ip.code
WHEN MATCHED THEN UPDATE SET prompt = ip.prompt
WHEN NOT MATCHED THEN INSERT (code, prompt) VALUES(code, prompt);

Query is running:   0%|          |

In [312]:
%%bigquery --pyformat

-- Registers (or re-registers) the remote embedding model in the default dataset.
-- The procedures below reference it as MODEL <data_set>.gemini-embedding-001.
CREATE OR REPLACE MODEL `gemini-embedding-001`
  REMOTE WITH CONNECTION `{connection_id}`
  OPTIONS (endpoint = 'gemini-embedding-001');

Query is running:   0%|          |

### Phase 2 - Correcting Books
Book correction performs correction of OCR book scanning errors. Such errors are quite common in books from GDELT Processes Internet Archive.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase2_correction_correct()
BEGIN
    -- Phase 2: fixes OCR errors fragment by fragment and then stores the
    -- re-assembled corrected text on the owning book.

    -- System instruction passed to every correction request.
    DECLARE system_instruction_params JSON DEFAULT JSON '''
    {{"systemInstruction": {{"parts": [{{"text": "You are an assistant helping to fix the content of scanned books."}}]}}}}
    ''';

    -- Step 1: correct every fragment that has no corrected text yet. The
    -- 'correct' prompt receives the fragment's prefix, its text and its suffix.
    UPDATE {data_set}.tmp_correction_chunks AS fragment
    SET fragment.corrected_txt = AI.GENERATE(
            FORMAT(tpl.prompt, fragment.prefix, fragment.original_txt, fragment.suffix),
            connection_id => '{connection_id}',
            endpoint => '{endpoint}',
            model_params => system_instruction_params
        ).result
    FROM {data_set}.prompts AS tpl
    WHERE tpl.code = 'correct'
        AND fragment.corrected_txt IS NULL;

    -- Step 2: glue the corrected fragments together in chunk order (separated
    -- by a single space) and save the result in the 'books' table, but only
    -- for books that do not have a corrected text yet.
    MERGE {data_set}.books AS book
    USING (
        SELECT
            book_id,
            STRING_AGG(corrected_txt, ' ' ORDER BY chunk_number) AS aggregated_txt
        FROM {data_set}.tmp_correction_chunks
        GROUP BY book_id
    ) AS assembled
    ON book.book_id = assembled.book_id AND book.corrected_txt IS NULL
    WHEN MATCHED THEN
        UPDATE SET corrected_txt = assembled.aggregated_txt;
END;

Query is running:   0%|          |

### Phase 3 - Chunking books
Simple division of each book into overlapping fragments.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase3_prepare_chunks()
BEGIN
    -- Phase 3: cuts the corrected text of every not-yet-processed book into
    -- fixed-size fragments and stores them, each prefixed with the tail of the
    -- preceding fragment, in the 'chunks' table.

    -- length of a single fragment (characters), without the overlap
    DECLARE chunk_size INT64 DEFAULT 40000;
    -- number of trailing characters of the previous fragment repeated in front of the next one
    DECLARE overlap_size INT64 DEFAULT 4000;

    -- divides book to chunk_size long fragments (chunk_number is 0-based)
    CREATE OR REPLACE TABLE {data_set}.tmp_overlapped_chunks AS
    SELECT book_id, chunk_num - 1 as chunk_number, SUBSTR(corrected_txt, (chunk_num - 1) * chunk_size + 1, chunk_size) as txt, CAST(null AS STRING) text_with_overlap
    FROM {data_set}.books, UNNEST(GENERATE_ARRAY(1, CAST(CEIL(LENGTH(corrected_txt) / chunk_size) AS INT64))) as chunk_num
    WHERE processed = False or processed is null;

    -- adds the preceding overlap to each fragment: the tail of the previous fragment goes IN FRONT of
    -- the current one. The fragments are contiguous slices of the same text, so no separator is
    -- inserted (a separator could land in the middle of a word).
    UPDATE {data_set}.tmp_overlapped_chunks curr set curr.text_with_overlap = RIGHT(prev.txt, overlap_size) || curr.txt
    FROM {data_set}.tmp_overlapped_chunks prev WHERE curr.book_id = prev.book_id AND curr.chunk_number = prev.chunk_number + 1;
    -- the very first fragment has no overlap
    UPDATE {data_set}.tmp_overlapped_chunks set text_with_overlap = txt where text_with_overlap is null;

    -- stores fragments in 'chunks' table; the table is recreated empty here, so it only ever holds
    -- the fragments of the books selected above
    CREATE OR REPLACE TABLE {data_set}.chunks(book_id STRING, chunk_number INTEGER, txt STRING, summary STRING,
        fragment_number STRING, characters_id_data STRING, characters_full_data STRING);

    -- NOTE: because 'chunks' was just recreated, the EXCEPT DISTINCT part removes nothing
    INSERT INTO {data_set}.chunks (book_id, chunk_number, txt)
    (SELECT book_id, chunk_number, text_with_overlap FROM {data_set}.tmp_overlapped_chunks
    EXCEPT DISTINCT SELECT book_id, chunk_number, txt FROM {data_set}.chunks);
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat

CREATE OR REPLACE PROCEDURE phase4_fragments_summarization()
BEGIN
    -- Summarizes every fragment that has no summary yet. Each fragment of a
    -- book gets the same size budget, 80000 divided by the number of the
    -- book's fragments, which is passed to the 'summarize' prompt together
    -- with the fragment text. A missing model answer is stored as an empty string.
    UPDATE {data_set}.chunks AS fragment
    SET summary = COALESCE(
            AI.GENERATE(
                FORMAT(tpl.prompt, budget.size, fragment.txt),
                connection_id => '{connection_id}',
                endpoint => '{endpoint}'
            ).result,
            '')
    FROM {data_set}.prompts AS tpl,
        (
            SELECT book_id, CAST(80000 / COUNT(*) AS INT64) AS size
            FROM {data_set}.chunks
            GROUP BY book_id
        ) AS budget
    WHERE tpl.code = 'summarize'
        AND fragment.summary IS NULL
        AND fragment.book_id = budget.book_id;
END;

Query is running:   0%|          |

### Phase 4 - Summarizing
Prepare concise summary of each book. It will be supplementary information in several inference operations later.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase4_summarize_fragments_summaries()
BEGIN
    -- Builds the summary of a whole book from the summaries of its fragments.
    -- The fragment summaries are joined in chunk order, one per line, and sent
    -- with the 'reduce_summary' prompt. Only books without a summary are
    -- touched; a missing model answer is stored as an empty string.
    UPDATE {data_set}.books b SET b.summary = COALESCE(AI.GENERATE(FORMAT(p.prompt, s.summary),
            connection_id => '{connection_id}', endpoint => '{endpoint}').result, '')
    FROM {data_set}.prompts p,
        -- the separator is a real newline ('\n'); the previous '/n' put a literal slash-n between summaries
        (SELECT book_id, STRING_AGG(summary, '\n' ORDER BY chunk_number) summary FROM {data_set}.chunks GROUP BY book_id) s
    WHERE p.code = 'reduce_summary' AND b.book_id = s.book_id AND b.summary is null;
END;

Query is running:   0%|          |

### Phase 5 - Character identification
By far the most difficult part of the whole project: identifying consistent characters across the whole book.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase5_prepare_character_ids_from_fragments()
BEGIN
    -- Phase 5, step 1: asks the model to list the characters appearing in each
    -- fragment and rebuilds the 'identifiers' table with one row per
    -- (fragment, character) mention.

    -- sets string fragment number (1-based, zero padded), used later to identify fragments where given character appears
    UPDATE {data_set}.chunks SET fragment_number = FORMAT('%03d', chunk_number + 1) where fragment_number is null;

    -- extract full names and other identification-helping information from book fragments
    UPDATE {data_set}.chunks c SET c.characters_id_data =
        AI.GENERATE(FORMAT(p.prompt, c.txt), connection_id => '{connection_id}', endpoint => '{endpoint}').result
    FROM {data_set}.prompts p, {data_set}.books b
    WHERE p.code = 'characters_id_data' and c.book_id = b.book_id;

    -- split result: each character in separate row
    -- (TRIM strips a possible markdown code fence around the returned JSON array)
    CREATE OR REPLACE TABLE {data_set}.tmp_characters_id_data AS
    (select book_id, chunk_number, fragment_number, x as characters_id_data,
    CAST(NULL AS STRING) full_name, CAST(NULL AS STRING) information, CAST(NULL AS INT64) importance
    FROM {data_set}.chunks, UNNEST(JSON_QUERY_ARRAY(TRIM(characters_id_data, '`json\n'), '$')) x);

    -- add id (unique together with book id)
    -- NOTE: full_name is still NULL at this point, so the order of ids inside one fragment is arbitrary
    CREATE OR REPLACE TABLE {data_set}.identifiers AS
    SELECT row_number() OVER(ORDER BY book_id, chunk_number, full_name) id, * from {data_set}.tmp_characters_id_data;

    -- extract fields from json
    -- SAFE_CAST: the value comes from the model, a non-integer 'importance' must not abort the whole
    -- procedure; it becomes NULL and is normalized to 1 by the statement below
    UPDATE {data_set}.identifiers SET full_name = json_value(characters_id_data, '$.full_name'), information = json_value(characters_id_data, '$.information'),
        importance = SAFE_CAST(JSON_VALUE(characters_id_data, '$.importance') AS INT64) where full_name is null;

    -- correct potential errors in 'importance' count
    UPDATE {data_set}.identifiers SET importance = 1 where importance = 0 or importance is null;
END;
-- author: jj123451

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase5_prepare_character_ids_initial_embeddings()
BEGIN
  -- Embeds the identifying JSON of every row of 'identifiers' (the character's
  -- full name is used as the document title) and stores the result, together
  -- with the id, book and chunk columns, in tmp_characters_id_embeddings.
  CREATE OR REPLACE TABLE {data_set}.tmp_characters_id_embeddings AS
  SELECT *
  FROM ML.GENERATE_EMBEDDING(
    MODEL `{data_set}.gemini-embedding-001`,
    (
      SELECT
        characters_id_data AS content,
        full_name AS title,
        id,
        book_id,
        chunk_number,
        FALSE AS is_query
      FROM {data_set}.identifiers
    ),
    STRUCT(TRUE AS flatten_json_output, 'RETRIEVAL_DOCUMENT' AS task_type)
  );
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase5_merge_characters_duplicates_with_return_param(OUT updated INT64)
BEGIN
  -- One round of character de-duplication inside 'identifiers':
  --   1. vector search proposes, for every character mention, its nearest other mention,
  --   2. the model confirms which candidate pairs are the same person, a second prompt double-checks,
  --   3. confirmed pairs are expanded into connected groups and checked once more as a whole,
  --   4. every group is merged into its lowest id; the other members are deleted,
  --   5. embeddings of the merged survivors are regenerated for the next round.
  -- OUT updated: row count of the last DML statement of this procedure (the refresh of the
  --   survivors' embeddings); 0 means that nothing was merged in this round.

  -- search characters most similar to each other
  CREATE OR REPLACE TABLE {data_set}.tmp_characters_id_distance as
  SELECT query.id query_id, query.title query_full_name, base.id, base.title full_name, base.chunk_number, base.book_id, query.book_id query_book_id, distance FROM
  VECTOR_SEARCH(
    (SELECT id, book_id, chunk_number, title, ml_generate_embedding_result FROM {data_set}.tmp_characters_id_embeddings),
    'ml_generate_embedding_result',
    (SELECT id, title, book_id, ml_generate_embedding_result FROM {data_set}.tmp_characters_id_embeddings),
    'ml_generate_embedding_result',
    top_k => 2);
  -- TODO: change the above query to search within the data of the same book. It very rarely matches characters from different books, but still
  DELETE FROM {data_set}.tmp_characters_id_distance where book_id <> query_book_id;

  -- delete distances to self and duplicate distances (of a pair found in both directions only one row is kept)
  DELETE FROM {data_set}.tmp_characters_id_distance where query_id = id;
  DELETE FROM {data_set}.tmp_characters_id_distance d where d.id < d.query_id and exists (select 1 from {data_set}.tmp_characters_id_distance dd where dd.id = d.query_id and d.id = dd.query_id);

  -- prepare pairs of duplicate candidates; the pairs are grouped per fragment of the query character, at most 50 pairs in a group
  -- (the 'id' field of a pair is the id of the query character)
  CREATE OR REPLACE TABLE {data_set}.tmp_characters_duplicate_candidates as
  select string_agg(pair, ',\n') pairs, book_id, chunk_number from
  (select TO_JSON_STRING(STRUCT(query.id, query.full_name as first_individual_full_name, query.information as first_individual_information,
  base.full_name as second_individual_full_name, base.information as second_individual_information), true) pair, embed.book_id, query.chunk_number, query.fragment_number,
  CEIL(ROW_NUMBER() OVER (PARTITION BY embed.book_id, query.chunk_number ORDER BY query.id) / 50.0) chunk_50_nr
  from {data_set}.tmp_characters_id_distance embed
  JOIN {data_set}.identifiers query ON embed.query_id = query.id
  JOIN {data_set}.identifiers base ON embed.id = base.id)
  group by book_id, chunk_number, chunk_50_nr;

  -- use AI to check the duplicate candidates and assess which ones are in fact the same person
  CREATE OR REPLACE TABLE {data_set}.tmp_the_same_characters as SELECT pairs.book_id, pairs.chunk_number,
  AI.GENERATE(FORMAT(p.prompt, pairs.pairs, cb.summary), connection_id => '{connection_id}', endpoint => '{endpoint}').result pairs
  FROM {data_set}.prompts p, {data_set}.tmp_characters_duplicate_candidates pairs, {data_set}.books cb
  WHERE p.code = 'find_the_same_characters' and pairs.book_id = cb.book_id;
  -- check AI results from above (AI sometimes makes mistakes, rarely, but the consequence is merging two different characters together!)
  CREATE OR REPLACE TABLE {data_set}.tmp_different_characters as SELECT pairs.book_id, pairs.chunk_number,
  AI.GENERATE(FORMAT(p.prompt, pairs.pairs, cb.summary), connection_id => '{connection_id}', endpoint => '{endpoint}').result pairs
  FROM {data_set}.prompts p, {data_set}.tmp_the_same_characters pairs, {data_set}.books cb
  WHERE p.code = 'find_different_characters' and pairs.book_id = cb.book_id;

  -- split both results of AI inference
  -- SAFE_CAST: ids come from the model output; a malformed id becomes NULL instead of aborting the procedure
  CREATE OR REPLACE TABLE {data_set}.tmp_the_same_characters_split AS
  select book_id, chunk_number, pair, CAST(NULL AS INT64) id
  from {data_set}.tmp_the_same_characters, UNNEST(JSON_QUERY_ARRAY(TRIM(pairs, '`json\n'), '$')) pair;
  UPDATE {data_set}.tmp_the_same_characters_split SET id = SAFE_CAST(JSON_VALUE(pair, '$.id') AS INT64) where id is null;

  CREATE OR REPLACE TABLE {data_set}.tmp_different_characters_split AS
  select book_id, chunk_number, pair, CAST(NULL AS INT64) id
  from {data_set}.tmp_different_characters, UNNEST(JSON_QUERY_ARRAY(TRIM(pairs, '`json\n'), '$')) pair;
  UPDATE {data_set}.tmp_different_characters_split SET id = SAFE_CAST(JSON_VALUE(pair, '$.id') AS INT64) where id is null;

  -- remove false positives, detected by second check
  DELETE FROM {data_set}.tmp_the_same_characters_split WHERE ID IN (SELECT id FROM {data_set}.tmp_different_characters_split);
  -- let's leave only real duplicates in vector search resulting table
  -- NULL ids must be filtered out: a single NULL in the list makes NOT IN never TRUE, nothing would be
  -- deleted and EVERY candidate pair would be treated as a confirmed duplicate
  DELETE FROM {data_set}.tmp_characters_id_distance where query_id NOT IN (
    SELECT id from {data_set}.tmp_the_same_characters_split WHERE id IS NOT NULL);

  -- prepare for recursive query by creating table with duplicate pairs in both direction
  CREATE OR REPLACE TABLE {data_set}.tmp_bidirectional_edges as (
  SELECT query_id, id FROM {data_set}.tmp_characters_id_distance UNION DISTINCT SELECT id, query_id FROM {data_set}.tmp_characters_id_distance);

  -- run recursive query to find whole graph of connected duplicates (A may be duplicate of B, then B may be duplicate of C and so on)
  -- every node carries the lowest id seen on the path so far; the walk is limited to 10 steps
  CREATE OR REPLACE TABLE {data_set}.tmp_grouped_duplicates_graph as
  WITH RECURSIVE connected_components AS (
      SELECT query_id AS node, query_id AS root, 0 AS iteration FROM {data_set}.tmp_bidirectional_edges
      UNION ALL
      SELECT edges.id AS node, LEAST(comp.root, edges.id) AS root, comp.iteration + 1
      FROM connected_components comp INNER JOIN {data_set}.tmp_bidirectional_edges edges ON comp.node = edges.query_id
      WHERE comp.iteration < 10
  ),
  unique_graph AS (SELECT node, MIN(root) AS group_id FROM connected_components GROUP BY node)
  SELECT group_id, ARRAY_AGG(DISTINCT node ORDER BY node) AS duplicate_ids_array FROM unique_graph GROUP BY group_id;

  -- prepare json arrays with all duplicated characters: one array per one "real" person
  CREATE OR REPLACE TABLE {data_set}.tmp_grouped_duplicates_graph_enriched as
  WITH duplicates AS (select group_id, id from {data_set}.tmp_grouped_duplicates_graph, UNNEST(duplicate_ids_array) id)
  SELECT d.group_id, "[\n" || STRING_AGG(TO_JSON_STRING(STRUCT(i.full_name, i.information), true), ",\n" ORDER BY i.chunk_number) || "\n]" duplicated_ids
  FROM {data_set}.identifiers i join duplicates d on i.id = d.id group by group_id;

  -- another, final AI check, if they are really all duplicates
  CREATE OR REPLACE TABLE {data_set}.tmp_merged_duplicates_double_check as SELECT x.group_id,
  AI.GENERATE_BOOL(FORMAT(p.prompt, x.duplicated_ids, b.summary), connection_id => '{connection_id}', endpoint => '{endpoint}').result check
  FROM {data_set}.prompts p, {data_set}.tmp_grouped_duplicates_graph_enriched x, {data_set}.identifiers i, {data_set}.books b WHERE p.code = 'merge_character_ids_double_check' and i.id = x.group_id and i.book_id = b.book_id;

  -- get rid of ones which failed the above check
  -- NOTE: only an explicit FALSE removes a group; groups whose check is NULL are kept
  DELETE FROM {data_set}.tmp_grouped_duplicates_graph WHERE group_id IN (
    select group_id from {data_set}.tmp_merged_duplicates_double_check where check = FALSE);
  DELETE FROM {data_set}.tmp_grouped_duplicates_graph_enriched WHERE group_id IN (
    select group_id from {data_set}.tmp_merged_duplicates_double_check where check = FALSE);

  -- let's merge all duplicates into one final character: multiple personality disorder finally cured
  CREATE OR REPLACE TABLE {data_set}.tmp_merged_duplicates as SELECT x.group_id,
  AI.GENERATE(FORMAT(p.prompt, x.duplicated_ids, b.summary), connection_id => '{connection_id}', endpoint => '{endpoint}').result character_id
  FROM {data_set}.prompts p, {data_set}.tmp_grouped_duplicates_graph_enriched x, {data_set}.identifiers i, {data_set}.books b WHERE p.code = 'merge_character_ids' and i.id = x.group_id and i.book_id = b.book_id;

  -- extract some data from json result to columns
  CREATE OR REPLACE TABLE {data_set}.tmp_merged_duplicates_split AS
  select group_id, TRIM(character_id, '`json\n') character_id, CAST(NULL AS STRING) full_name, CAST(NULL AS STRING) information
  from {data_set}.tmp_merged_duplicates;
  UPDATE {data_set}.tmp_merged_duplicates_split SET full_name = json_value(character_id, '$.full_name'), information = json_value(character_id, '$.information') where full_name is null;

  -- update the chosen-to-remain duplicate (it's the one with lowest id) data with merged data
  UPDATE {data_set}.identifiers chid SET chid.full_name = duplicate.full_name, chid.information = duplicate.information,
  chid.characters_id_data = TO_JSON_STRING(STRUCT(duplicate.full_name, duplicate.information), true)
  FROM {data_set}.tmp_merged_duplicates_split duplicate WHERE chid.id = duplicate.group_id;

  -- split the duplicates graph array to multiple rows
  CREATE OR REPLACE TABLE {data_set}.tmp_grouped_ids_split as select group_id, id from {data_set}.tmp_grouped_duplicates_graph, UNNEST(duplicate_ids_array) id;
  -- compute the new 'importance' (sum over all duplicates) and 'fragment_number' (comma-separated list over all duplicates)
  UPDATE {data_set}.identifiers chid SET chid.importance = dp.importance, chid.fragment_number = dp.fragment_number
  FROM (select chid.id, sum(di.importance) importance, string_agg(distinct di.fragment_number, ',') fragment_number
  FROM {data_set}.identifiers chid JOIN {data_set}.tmp_grouped_ids_split d ON d.group_id = chid.id
  JOIN {data_set}.identifiers di ON d.id = di.id group by chid.id) dp
  WHERE chid.id = dp.id;

  -- exterminate all duplicates but the chosen one
  DELETE FROM {data_set}.tmp_grouped_ids_split where group_id = id;
  DELETE FROM {data_set}.identifiers WHERE id IN (SELECT id from {data_set}.tmp_grouped_ids_split);
  DELETE FROM {data_set}.tmp_characters_id_embeddings where id IN (SELECT id from {data_set}.tmp_grouped_ids_split);

  -- updates embedding for merged duplicates
  CREATE OR REPLACE TABLE {data_set}.tmp_characters_id_changed_embeddings as SELECT * FROM ML.GENERATE_EMBEDDING(
    MODEL `{data_set}.gemini-embedding-001`,
    (SELECT chid.characters_id_data as content, chid.full_name as title, chid.id FROM {data_set}.identifiers chid
    JOIN {data_set}.tmp_grouped_duplicates_graph duplicate ON chid.id = duplicate.group_id),
    STRUCT(TRUE AS flatten_json_output, 'RETRIEVAL_DOCUMENT' as task_type));

  UPDATE {data_set}.tmp_characters_id_embeddings t SET ml_generate_embedding_result = tnew.ml_generate_embedding_result, ml_generate_embedding_statistics = tnew.ml_generate_embedding_statistics,
    ml_generate_embedding_status = tnew.ml_generate_embedding_status, title = tnew.title, content = tnew.content
  FROM {data_set}.tmp_characters_id_changed_embeddings tnew WHERE tnew.id = t.id;

  -- if there were no duplicates, it will tell the algorithm that we may stop searching
  -- (must stay directly after the UPDATE above: @@row_count reports the last DML statement only)
  SET updated = @@row_count;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase5_merge_characters_duplicates()
BEGIN
  -- Parameterless wrapper around the merge round for the first few runs,
  -- where the number of merged rows is of no interest and is thrown away.
  DECLARE discarded_count INT64 DEFAULT 0;

  CALL {data_set}.phase5_merge_characters_duplicates_with_return_param(discarded_count);
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase5_rebuild_indentifier_jsons()
BEGIN
    -- Rewrites the JSON column of 'identifiers' into its final shape: a pretty
    -- printed object holding id, full_name and information. Rows without a
    -- full name are left untouched.
    UPDATE {data_set}.identifiers
    SET characters_id_data = TO_JSON_STRING(STRUCT(id, full_name, information), TRUE)
    WHERE full_name IS NOT NULL;
END;

Query is running:   0%|          |

### Phase 6 - Extracting information
This stage finally gathers the data we want. In this project we chose, as examples: gender, financial status, social class and moral values.
In real-life applications, any traits can be chosen by adapting the prompts and the parameter values in the clustering stage.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase6_gather_characters_full_data_from_chunks()
BEGIN
    -- Step 1: for every fragment, collect the identifying JSON of the
    -- characters whose fragment list mentions that fragment, packed into JSON
    -- arrays of at most 15 characters each (smaller batches for the model).
    CREATE OR REPLACE TABLE {data_set}.tmp_chunks_character_id_data AS
    SELECT
        book_id,
        chunk_number,
        CONCAT("[\n", STRING_AGG(characters_id_data, ",\n"), "\n]") AS characters_id_data
    FROM (
        SELECT
            fragment.book_id,
            fragment.chunk_number,
            person.characters_id_data,
            CEIL(ROW_NUMBER() OVER (PARTITION BY fragment.book_id, fragment.chunk_number ORDER BY person.id) / 15.0) AS chunk_15_nr
        FROM {data_set}.chunks AS fragment
        JOIN {data_set}.identifiers AS person
            ON person.book_id = fragment.book_id
            AND person.fragment_number LIKE '%' || fragment.fragment_number || '%'
    )
    GROUP BY book_id, chunk_number, chunk_15_nr;

    -- Step 2: run the 'extract_data' prompt for each batch against the text of
    -- its fragment. A missing model answer is replaced by an empty JSON array.
    CREATE OR REPLACE TABLE {data_set}.tmp_chunks_character_full_data AS
    SELECT
        roster.book_id,
        roster.chunk_number,
        COALESCE(
            AI.GENERATE(
                FORMAT(tpl.prompt, roster.characters_id_data, fragment.txt),
                connection_id => '{connection_id}',
                endpoint => '{endpoint}',
                model_params => JSON '{{"systemInstruction": {{"parts": [{{"text": "You are an expert in extracting information from books."}}]}}}}'
            ).result,
            '[]') AS characters_full_data
    FROM {data_set}.tmp_chunks_character_id_data AS roster
    JOIN {data_set}.chunks AS fragment
        ON fragment.book_id = roster.book_id
        AND fragment.chunk_number = roster.chunk_number
    JOIN {data_set}.prompts AS tpl
        ON tpl.code = 'extract_data';

    -- Step 3: the extraction answer is not always well-formed JSON, so it is
    -- passed through the 'json' repair prompt (expected type: 'array') and a
    -- possible markdown code fence around the answer is trimmed off.
    UPDATE {data_set}.tmp_chunks_character_full_data AS extracted
    SET extracted.characters_full_data = TRIM(
            AI.GENERATE(
                FORMAT(tpl.prompt, 'array', extracted.characters_full_data),
                connection_id => '{connection_id}',
                endpoint => '{endpoint}'
            ).result,
            '`json\n')
    FROM {data_set}.prompts AS tpl
    WHERE tpl.code = 'json'
        AND extracted.characters_full_data IS NOT NULL;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase6_group_the_same_characters_data_for_merging()
BEGIN
    -- Splits the per-fragment extraction results into one row per character and then groups the rows
    -- of the same character (same book and id), so that they can be merged into one record later.

    -- split the book fragment gangs into separate persons
    CREATE OR REPLACE TABLE {data_set}.tmp_split_full_data as
    select book_id, character_full_data, chunk_number, CAST(NULL AS INT64) id, CAST(NULL AS STRING) full_name from {data_set}.tmp_chunks_character_full_data,
    UNNEST(JSON_QUERY_ARRAY(characters_full_data, '$')) character_full_data where characters_full_data is not null;
    -- and extract some data from json
    -- SAFE_CAST: the id comes from the model output, a malformed value becomes NULL instead of aborting the procedure
    UPDATE {data_set}.tmp_split_full_data SET full_name = json_value(character_full_data, '$.full_name'), id = SAFE_CAST(JSON_VALUE(character_full_data, '$.id') AS INT64) WHERE full_name IS NULL;

    -- group together the same person handling our precious information from different fragments
    -- rows without a usable id cannot be attributed to any character: grouped, they would form a
    -- single group with parts = 0 which is never merged and only yields an empty row downstream
    CREATE OR REPLACE TABLE {data_set}.tmp_grouped_full_data AS
    SELECT id, book_id, count(id) as parts, "[\n" || STRING_AGG(character_full_data, ",\n" ORDER BY chunk_number) || "\n]" full_data_array,
    CAST(NULL AS STRING) character_full_data,  CAST(NULL AS STRING) information
    FROM {data_set}.tmp_split_full_data WHERE id IS NOT NULL GROUP BY book_id, id;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase6_merge_data()
BEGIN
    -- Characters seen in more than one fragment: let the model condense the
    -- collected records into one, using the 'merge_character' prompt with the
    -- book summary and the character's identifying information as context.
    UPDATE {data_set}.tmp_grouped_full_data AS grouped
    SET grouped.character_full_data = AI.GENERATE(
            FORMAT(tpl.prompt, grouped.full_data_array, book.summary, ident.information),
            connection_id => '{connection_id}',
            endpoint => '{endpoint}'
        ).result
    FROM {data_set}.prompts AS tpl, {data_set}.books AS book, {data_set}.identifiers AS ident
    WHERE tpl.code = 'merge_character'
        AND grouped.parts > 1
        AND grouped.full_data_array IS NOT NULL
        AND grouped.character_full_data IS NULL
        AND book.book_id = grouped.book_id
        AND ident.book_id = grouped.book_id
        AND ident.id = grouped.id;

    -- Characters seen in exactly one fragment need no merging: the single
    -- record is taken as is, with the surrounding array brackets stripped.
    UPDATE {data_set}.tmp_grouped_full_data
    SET character_full_data = TRIM(full_data_array, '[] \n')
    WHERE parts = 1
        AND full_data_array IS NOT NULL
        AND character_full_data IS NULL;

    -- Every record then goes through the 'json_final_check' prompt, which
    -- catches, among other errors, information returned as an array of strings
    -- instead of a single text field. A possible code fence is trimmed off.
    UPDATE {data_set}.tmp_grouped_full_data AS grouped
    SET grouped.character_full_data = TRIM(
            AI.GENERATE(
                FORMAT(tpl.prompt, grouped.character_full_data),
                connection_id => '{connection_id}',
                endpoint => '{endpoint}'
            ).result,
            '`json\n')
    FROM {data_set}.prompts AS tpl
    WHERE tpl.code = 'json_final_check'
        AND grouped.character_full_data IS NOT NULL;

    -- Finally copy the character description from 'identifiers' into the
    -- 'information' column.
    UPDATE {data_set}.tmp_grouped_full_data AS grouped
    SET grouped.information = ident.information
    FROM {data_set}.identifiers AS ident
    WHERE ident.id = grouped.id;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase6_copy_final_data()
BEGIN
  -- Publishes the merged character records in 'characters', the table with the
  -- final results meant for end users. Characters already present there (same
  -- id and book) are skipped.
  INSERT INTO {data_set}.characters (book_id, id, full_name, sex, social_class, wealth, values, information)
  SELECT
    merged.book_id,
    merged.id,
    JSON_VALUE(merged.character_full_data, '$.full_name'),
    JSON_VALUE(merged.character_full_data, '$.sex'),
    JSON_VALUE(merged.character_full_data, '$.social_class'),
    JSON_VALUE(merged.character_full_data, '$.wealth'),
    JSON_VALUE(merged.character_full_data, '$.values'),
    merged.information
  FROM {data_set}.tmp_grouped_full_data AS merged
  WHERE NOT EXISTS (
    SELECT 1
    FROM {data_set}.characters AS existing
    WHERE existing.id = merged.id
      AND existing.book_id = merged.book_id
  );

  -- Fill in the importance, taken from 'identifiers', where it is still missing.
  UPDATE {data_set}.characters AS person
  SET person.importance = ident.importance
  FROM {data_set}.identifiers AS ident
  WHERE person.importance IS NULL
    AND ident.id = person.id
    AND ident.book_id = person.book_id;

  -- Flag the handled books as processed, so that a restart of the whole
  -- notebook does not process them again.
  UPDATE {data_set}.books
  SET processed = TRUE
  WHERE book_id IN (SELECT book_id FROM {data_set}.tmp_grouped_full_data);
END;

Query is running:   0%|          |

### Phase 7 - Clustering
It takes the raw data gathered for each trait of interest and clusters it into consistent groups.

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase7_split_traits(trait_desc STRING)
BEGIN
    -- trait_desc: description of the analysed trait, passed to the prompt.
    --
    -- Step 1: ask the model ('split_traits' prompt) to break the free-text
    -- trait of every character in tmp_copied_traits into a JSON array of short,
    -- sanitized items; the character's information serves as extra context.
    -- A missing model answer is replaced by an empty JSON array.
    CREATE OR REPLACE TABLE {data_set}.tmp_split_traits AS
    SELECT
        src.book_id,
        src.id,
        src.traits AS original_traits,
        COALESCE(
            AI.GENERATE(
                FORMAT(tpl.prompt, trait_desc, src.traits, person.information),
                connection_id => '{connection_id}',
                endpoint => '{endpoint}'
            ).result,
            '[]') AS traits
    FROM {data_set}.prompts AS tpl, {data_set}.tmp_copied_traits AS src, {data_set}.characters AS person
    WHERE tpl.code = 'split_traits'
        AND person.id = src.id
        AND person.book_id = src.book_id;

    -- Step 2: one row per array element, numbered with a running trait_id;
    -- cluster_id is filled in later.
    CREATE OR REPLACE TABLE {data_set}.tmp_split_traits_final AS
    SELECT
        book_id,
        id,
        trait,
        ROW_NUMBER() OVER (ORDER BY id, book_id) AS trait_id,
        CAST(NULL AS INT64) AS cluster_id
    FROM (
        SELECT book_id, id, trait
        FROM {data_set}.tmp_split_traits,
            UNNEST(JSON_QUERY_ARRAY(TRIM(traits, '`json\n'), '$')) AS trait
    );

    -- Step 3: the elements are still JSON string literals, drop the quotes.
    UPDATE {data_set}.tmp_split_traits_final
    SET trait = TRIM(trait, '"')
    WHERE TRUE;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase7_identify_clusters(clusters INT64)
BEGIN
    -- clusters: number of clusters the k-means model is trained with.

    -- Embed every split trait. The texts are usually very short, so
    -- 128-dimensional embeddings are requested.
    CREATE OR REPLACE TABLE {data_set}.tmp_traits_embeddings AS
    SELECT *
    FROM ML.GENERATE_EMBEDDING(
        MODEL `{data_set}.gemini-embedding-001`,
        (
            SELECT trait AS content, trait_id
            FROM {data_set}.tmp_split_traits_final
        ),
        STRUCT(TRUE AS flatten_json_output, 'SEMANTIC_SIMILARITY' AS task_type, 128 AS OUTPUT_DIMENSIONALITY)
    );

    -- Draw a random training sample: every embedding is kept with probability
    -- 2000 / (number of embeddings), i.e. about 2000 rows for large inputs.
    CREATE OR REPLACE TABLE {data_set}.tmp_traits_embeddings_modeling_sample AS
    WITH total AS (
        SELECT COUNT(*) AS row_count
        FROM {data_set}.tmp_traits_embeddings
    )
    SELECT emb.ml_generate_embedding_result, emb.content, emb.trait_id
    FROM {data_set}.tmp_traits_embeddings AS emb
    CROSS JOIN total
    WHERE RAND() < (2000 / total.row_count);

    -- Train the k-means model on the sample.
    CREATE OR REPLACE MODEL `{data_set}.kmeans_traits_model`
    OPTIONS (
        MODEL_TYPE = 'KMEANS',
        NUM_CLUSTERS = clusters,
        KMEANS_INIT_METHOD = 'KMEANS++',
        DISTANCE_TYPE = 'COSINE'
    ) AS
    SELECT ml_generate_embedding_result
    FROM {data_set}.tmp_traits_embeddings_modeling_sample;

    -- Assign a cluster to every split trait. Dynamic SQL is used so that the
    -- procedure passes verification although the model does not exist yet.
    EXECUTE IMMEDIATE ('''CREATE OR REPLACE TABLE `{data_set}.tmp_clustered_traits` AS
        SELECT * FROM ML.PREDICT(MODEL `{data_set}.kmeans_traits_model`,
        (SELECT ml_generate_embedding_result, content, trait_id FROM `{data_set}.tmp_traits_embeddings`))''');

    -- Copy the predicted centroid back to the split traits.
    UPDATE {data_set}.tmp_split_traits_final AS item
    SET item.cluster_id = predicted.centroid_id
    FROM {data_set}.tmp_clustered_traits AS predicted
    WHERE predicted.trait_id = item.trait_id;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase7_name_clusters(trait_type STRING, trait_desc STRING)
BEGIN
    -- Asks the model to invent a name and a description for every cluster found by k-means and
    -- stores them in the 'clusters' table.
    --   trait_type: value written to clusters.cluster_type; existing rows of that type are replaced
    --   trait_desc: description of the trait, passed to the 'cluster_traits' prompt

    -- for each cluster gather up to 200 representatives (ARRAY_AGG ... LIMIT takes an arbitrary subset, not a random sample)
    CREATE OR REPLACE TABLE {data_set}.tmp_trait_clusters as
    select centroid_id as cluster_id, ARRAY_TO_STRING(ARRAY_AGG(content LIMIT 200), ', ') examples from {data_set}.tmp_clustered_traits group by centroid_id;

    -- throw the representatives at the AI and ask it to give them one common name and description
    -- the ORDER BY lives inside STRING_AGG: ordering a subquery does not define the order of aggregation
    CREATE OR REPLACE TABLE {data_set}.tmp_clusters_analysis as
    SELECT AI.GENERATE(FORMAT(p.prompt, trait_desc, c.json_examples), connection_id => '{connection_id}', endpoint => '{endpoint}').result
    FROM (select '[\n' || string_agg(json_example, ',\n' ORDER BY cluster_id) || '\n]' json_examples
    FROM (select cluster_id, TO_JSON_STRING(STRUCT(cluster_id, examples)) json_example from {data_set}.tmp_trait_clusters)) c
    join {data_set}.prompts p ON p.code = 'cluster_traits';

    -- extract from json and store the invented name and description in 'clusters' table
    CREATE OR REPLACE TABLE {data_set}.tmp_cluster_names as
    select CAST(json_value(clusters, '$.cluster_id') AS INT64) cluster_id, json_value(clusters, '$.cluster_name') name, json_value(clusters, '$.cluster_description') description
    FROM {data_set}.tmp_clusters_analysis, UNNEST(JSON_QUERY_ARRAY(TRIM(result, '`json\n'), '$')) as clusters;
    DELETE FROM {data_set}.clusters where cluster_type = trait_type;
    INSERT INTO {data_set}.clusters (cluster_type, cluster_id, name, description)
    SELECT trait_type, cluster_id, name, description FROM {data_set}.tmp_cluster_names;
END;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat
CREATE OR REPLACE PROCEDURE phase7_assign_clusters(trait_type STRING)
BEGIN
    -- Stores the clustered split traits of one trait type in 'character_cluster_details' and
    -- computes one main cluster per character from all of the character's traits of that type.
    --   trait_type: the cluster type being processed; rows of other types are left untouched

    -- set the cluster_ids in 'character_cluster_details'
    -- only rows of the current trait type are replaced; deleting by book_id alone would wipe the
    -- details stored by earlier calls of this procedure for the other trait types
    DELETE FROM {data_set}.character_cluster_details
    where cluster_type = trait_type
    and book_id IN (SELECT book_id FROM {data_set}.tmp_split_traits_final);
    INSERT INTO {data_set}.character_cluster_details(book_id, id, trait, trait_id, cluster_type, cluster_id)
    SELECT book_id, id, trait, trait_id, trait_type, cluster_id FROM {data_set}.tmp_split_traits_final;

    -- combine traits of each person together (we already have one, but this one is sanitized)
    -- restricted to the current trait type: the k-means model used below was trained for this type,
    -- and there must be at most one row per character for the final UPDATE
    CREATE OR REPLACE TABLE {data_set}.tmp_combined_traits as
    SELECT book_id, id, cluster_type, string_agg(trait, ', ') traits, CAST(NULL AS INT64) cluster_id
    FROM {data_set}.character_cluster_details
    WHERE cluster_type = trait_type
    GROUP BY book_id, id, cluster_type;

    -- prepare embeddings for concatenated traits
    CREATE OR REPLACE TABLE {data_set}.tmp_combined_traits_embeddings as SELECT * FROM ML.GENERATE_EMBEDDING(
    MODEL `{data_set}.gemini-embedding-001`,
    (SELECT traits as content, book_id, id FROM {data_set}.tmp_combined_traits),
    STRUCT(TRUE AS flatten_json_output, 'SEMANTIC_SIMILARITY' as task_type, 128 as OUTPUT_DIMENSIONALITY));

    -- and find the main cluster_id for given person using the previously trained model
    -- (execute immediate to avoid procedure verification error on unknown model)
    EXECUTE IMMEDIATE ('''CREATE OR REPLACE TABLE `{data_set}.tmp_combined_traits_clustered` AS
        SELECT * FROM ML.PREDICT(MODEL `{data_set}.kmeans_traits_model`,
        (SELECT ml_generate_embedding_result, book_id, id FROM `{data_set}.tmp_combined_traits_embeddings`))''');

    UPDATE {data_set}.tmp_combined_traits ct set ct.cluster_id = ctc.centroid_id
    FROM {data_set}.tmp_combined_traits_clustered ctc WHERE ctc.id = ct.id AND ctc.book_id = ct.book_id;
END;

Query is running:   0%|          |

### Statistical views
Views that help to make sense of the data.

Almost the same as the 'characters' table, but with cluster names and a sanitized gender value.

In [None]:
%%bigquery --pyformat

-- One row per character: the three cluster ids, a normalized sex value
-- ('male', 'female' or 'unknown') and, for each trait type, the name and the
-- description of the assigned cluster ('unknown' / '-' when there is none).
CREATE OR REPLACE VIEW v_characters_enriched AS
WITH social_class_lookup AS (
  SELECT cluster_id, name AS social_class_name, description AS social_class_desc
  FROM {data_set}.clusters
  WHERE cluster_type = 'social_class'
),
wealth_lookup AS (
  SELECT cluster_id, name AS wealth_name, description AS wealth_desc
  FROM {data_set}.clusters
  WHERE cluster_type = 'wealth'
),
values_lookup AS (
  SELECT cluster_id, name AS values_name, description AS values_desc
  FROM {data_set}.clusters
  WHERE cluster_type = 'values'
)
SELECT
  person.social_class_cluster_id,
  person.wealth_cluster_id,
  person.values_cluster_id,
  CASE
    WHEN LOWER(TRIM(person.sex)) IN ('male', 'm', 'man') THEN 'male'
    WHEN LOWER(TRIM(person.sex)) IN ('female', 'f', 'woman') THEN 'female'
    ELSE 'unknown'
  END AS sex,
  COALESCE(social.social_class_name, 'unknown') AS social_class_name,
  COALESCE(social.social_class_desc, '-') AS social_class_desc,
  COALESCE(money.wealth_name, 'unknown') AS wealth_name,
  COALESCE(money.wealth_desc, '-') AS wealth_desc,
  COALESCE(moral.values_name, 'unknown') AS values_name,
  COALESCE(moral.values_desc, '-') AS values_desc
FROM {data_set}.characters AS person
LEFT JOIN social_class_lookup AS social
  ON social.cluster_id = person.social_class_cluster_id
LEFT JOIN wealth_lookup AS money
  ON money.cluster_id = person.wealth_cluster_id
LEFT JOIN values_lookup AS moral
  ON moral.cluster_id = person.values_cluster_id;

Query is running:   0%|          |

Shows all the clusters identified by the K-means analysis (plus the gender distribution), with counts, percentages and ranks

In [331]:
%%bigquery --pyformat

-- Distribution of characters along each dimension (sex, social_class, wealth, values).
-- Output columns: dimension, cluster_id, cluster_name, cluster_description, count,
-- percentage (share of all characters, 0-100) and rank_within_dimension.
--
-- The per-dimension counts are collected first; percentage and rank are then
-- computed with window functions partitioned by the real dimension column.
-- This replaces the former PARTITION BY on a string literal (a constant partitions
-- nothing) and the four repeated total-count subqueries. Within one dimension every
-- character falls into exactly one group, so SUM(count) per dimension equals the
-- total number of characters.
CREATE OR REPLACE VIEW v_cluster_statistics AS
WITH counts AS (
  -- sex has no cluster id; the NULL is typed explicitly so the UNION ALL column is INT64
  SELECT 'sex' AS dimension, CAST(NULL AS INT64) AS cluster_id, sex AS cluster_name,
    '-' AS cluster_description, COUNT(*) AS count
  FROM {data_set}.v_characters_enriched
  GROUP BY sex
  UNION ALL
  SELECT 'social_class', social_class_cluster_id, social_class_name, social_class_desc, COUNT(*)
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_cluster_id, social_class_name, social_class_desc
  UNION ALL
  SELECT 'wealth', wealth_cluster_id, wealth_name, wealth_desc, COUNT(*)
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_cluster_id, wealth_name, wealth_desc
  UNION ALL
  SELECT 'values', values_cluster_id, values_name, values_desc, COUNT(*)
  FROM {data_set}.v_characters_enriched
  GROUP BY values_cluster_id, values_name, values_desc
)
SELECT
  dimension,
  cluster_id,
  cluster_name,
  cluster_description,
  count,
  count / SUM(count) OVER (PARTITION BY dimension) * 100 AS percentage,
  RANK() OVER (PARTITION BY dimension ORDER BY count DESC) AS rank_within_dimension
FROM counts
ORDER BY dimension, rank_within_dimension;

Query is running:   0%|          |

Groups the characters by all four dimensions (gender, wealth, values, social class) and keeps only the groups that contain at least 2 characters

In [332]:
%%bigquery --pyformat

-- Character profiles: every combination of sex, social class, wealth and values
-- shared by more than one character, with its share of all characters (0-100)
-- and a popularity rank (1 = most frequent; ties share a rank).
--
-- Grouping uses exactly the displayed columns. The cluster ids were previously part
-- of the GROUP BY without being selected, so clusters sharing a name (or several ids
-- falling back to 'unknown') produced indistinguishable duplicate rows with split
-- counts. This now matches the grouping used by v_gender_analysis.
CREATE OR REPLACE VIEW v_character_profiles AS
WITH profiles AS (
  SELECT sex, social_class_name, wealth_name, values_name, COUNT(*) AS character_count
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name, wealth_name, values_name
)
SELECT
  sex,
  social_class_name,
  wealth_name,
  values_name,
  character_count,
  -- the denominator is the number of all characters, not only those in the kept profiles
  character_count / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS percentage,
  RANK() OVER (ORDER BY character_count DESC) AS popularity_rank
FROM profiles
WHERE character_count > 1
ORDER BY character_count DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_sex_social` (gender by social class). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [333]:
%%bigquery --pyformat

-- Cross-tabulation of sex against social class.
-- For every (sex, social_class_name) pair: the number of characters and that
-- number as a percentage of the sex total, of the social class total and of
-- all characters.
CREATE OR REPLACE VIEW v_crosstab_sex_social AS
WITH cell_counts AS (
  SELECT sex, social_class_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
)
SELECT
  cc.sex,
  cc.social_class_name,
  cc.frequency,
  cc.frequency / ps.sex_total * 100 AS sex_percentage,
  cc.frequency / psc.social_class_total * 100 AS social_class_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_sex AS ps ON ps.sex = cc.sex
INNER JOIN per_social_class AS psc ON psc.social_class_name = cc.social_class_name
ORDER BY cc.sex, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_sex_wealth` (gender by wealth). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [334]:
%%bigquery --pyformat

-- Cross-tabulation of sex against wealth.
-- For every (sex, wealth_name) pair: the number of characters and that number
-- as a percentage of the sex total, of the wealth total and of all characters.
CREATE OR REPLACE VIEW v_crosstab_sex_wealth AS
WITH cell_counts AS (
  SELECT sex, wealth_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, wealth_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cc.sex,
  cc.wealth_name,
  cc.frequency,
  cc.frequency / ps.sex_total * 100 AS sex_percentage,
  cc.frequency / pw.wealth_total * 100 AS wealth_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_sex AS ps ON ps.sex = cc.sex
INNER JOIN per_wealth AS pw ON pw.wealth_name = cc.wealth_name
ORDER BY cc.sex, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_sex_values` (gender by values). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [335]:
%%bigquery --pyformat

-- Cross-tabulation of sex against values.
-- For every (sex, values_name) pair: the number of characters and that number
-- as a percentage of the sex total, of the values total and of all characters.
CREATE OR REPLACE VIEW v_crosstab_sex_values AS
WITH cell_counts AS (
  SELECT sex, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, values_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cc.sex,
  cc.values_name,
  cc.frequency,
  cc.frequency / ps.sex_total * 100 AS sex_percentage,
  cc.frequency / pv.values_total * 100 AS values_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_sex AS ps ON ps.sex = cc.sex
INNER JOIN per_values AS pv ON pv.values_name = cc.values_name
ORDER BY cc.sex, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_social_wealth` (social class by wealth). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [336]:
%%bigquery --pyformat

-- Cross-tabulation of social class against wealth.
-- For every (social_class_name, wealth_name) pair: the number of characters and
-- that number as a percentage of the social class total, of the wealth total
-- and of all characters.
CREATE OR REPLACE VIEW v_crosstab_social_wealth AS
WITH cell_counts AS (
  SELECT social_class_name, wealth_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, wealth_name
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cc.social_class_name,
  cc.wealth_name,
  cc.frequency,
  cc.frequency / psc.social_class_total * 100 AS social_class_percentage,
  cc.frequency / pw.wealth_total * 100 AS wealth_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_social_class AS psc ON psc.social_class_name = cc.social_class_name
INNER JOIN per_wealth AS pw ON pw.wealth_name = cc.wealth_name
ORDER BY cc.social_class_name, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_social_values` (social class by values). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [337]:
%%bigquery --pyformat

-- Cross-tabulation of social class against values.
-- For every (social_class_name, values_name) pair: the number of characters and
-- that number as a percentage of the social class total, of the values total
-- and of all characters.
CREATE OR REPLACE VIEW v_crosstab_social_values AS
WITH cell_counts AS (
  SELECT social_class_name, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, values_name
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cc.social_class_name,
  cc.values_name,
  cc.frequency,
  cc.frequency / psc.social_class_total * 100 AS social_class_percentage,
  cc.frequency / pv.values_total * 100 AS values_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_social_class AS psc ON psc.social_class_name = cc.social_class_name
INNER JOIN per_values AS pv ON pv.values_name = cc.values_name
ORDER BY cc.social_class_name, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_crosstab_wealth_values` (wealth by values). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [338]:
%%bigquery --pyformat

-- Cross-tabulation of wealth against values.
-- For every (wealth_name, values_name) pair: the number of characters and that
-- number as a percentage of the wealth total, of the values total and of all
-- characters.
CREATE OR REPLACE VIEW v_crosstab_wealth_values AS
WITH cell_counts AS (
  SELECT wealth_name, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_name, values_name
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cc.wealth_name,
  cc.values_name,
  cc.frequency,
  cc.frequency / pw.wealth_total * 100 AS wealth_percentage,
  cc.frequency / pv.values_total * 100 AS values_percentage,
  cc.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cc
INNER JOIN per_wealth AS pw ON pw.wealth_name = cc.wealth_name
INNER JOIN per_values AS pv ON pv.values_name = cc.values_name
ORDER BY cc.wealth_name, cc.frequency DESC;

Query is running:   0%|          |

The cell below creates `v_gender_analysis` (character profiles ranked within each gender). Formatted views built on top of the statistical views:

- `v_formatted_cluster_statistics` - shows all the clusters identified by the K-means analysis
- `v_formatted_crosstab_sex_social` - shows relationships between gender and social class
- `v_formatted_crosstab_sex_wealth` - shows relationships between gender and wealth
- `v_formatted_crosstab_sex_values` - shows relationships between gender and values
- `v_formatted_crosstab_social_wealth` - shows relationships between social class and wealth
- `v_formatted_crosstab_social_values` - shows relationships between social class and values
- `v_formatted_crosstab_wealth_values` - shows relationships between wealth and values
- `v_formatted_character_profiles` - groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters
- `v_formatted_gender_analysis` - similar to the above, but with gender-related count and percentage columns

In [339]:
%%bigquery --pyformat

-- Character profiles ranked within each sex.
-- For every (sex, social class, wealth, values) combination held by more than one
-- character: its count, the total number of characters of that sex, the share of
-- that total (0-100) and the rank among the kept profiles of the same sex.
CREATE OR REPLACE VIEW v_gender_analysis AS
WITH profile_counts AS (
  SELECT sex, social_class_name, wealth_name, values_name, COUNT(*) AS count
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name, wealth_name, values_name
),
per_sex AS (
  -- totals include the single-character profiles that are filtered out below
  SELECT sex, SUM(count) AS total_by_gender
  FROM profile_counts
  GROUP BY sex
)
SELECT
  pc.sex,
  pc.social_class_name,
  pc.wealth_name,
  pc.values_name,
  pc.count,
  ps.total_by_gender,
  pc.count / ps.total_by_gender * 100 AS percentage_within_gender,
  RANK() OVER (PARTITION BY pc.sex ORDER BY pc.count DESC) AS rank_within_gender
FROM profile_counts AS pc
INNER JOIN per_sex AS ps ON ps.sex = pc.sex
WHERE pc.count > 1
ORDER BY pc.sex, pc.count DESC;

Query is running:   0%|          |

The same views as above, but with the percentages formatted to two decimal places for human reading

In [340]:
%%bigquery --pyformat
-- v_cluster_statistics with the percentage rendered as a two-decimal string.
CREATE OR REPLACE VIEW v_formatted_cluster_statistics AS
SELECT
  dimension,
  cluster_id,
  cluster_name,
  cluster_description,
  count,
  FORMAT('%0.2f', percentage) AS percentage,
  rank_within_dimension
FROM {data_set}.v_cluster_statistics;

-- v_character_profiles with the percentage rendered as a two-decimal string.
CREATE OR REPLACE VIEW v_formatted_character_profiles AS
SELECT
  sex,
  social_class_name,
  wealth_name,
  values_name,
  character_count,
  FORMAT('%0.2f', percentage) AS percentage,
  popularity_rank
FROM {data_set}.v_character_profiles;

-- v_crosstab_sex_social with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_sex_social AS
SELECT
  sex,
  social_class_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_social;

-- v_crosstab_sex_wealth with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_sex_wealth AS
SELECT
  sex,
  wealth_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_wealth;

-- v_crosstab_sex_values with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_sex_values AS
SELECT
  sex,
  values_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_values;

-- v_crosstab_social_wealth with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_social_wealth AS
SELECT
  social_class_name,
  wealth_name,
  frequency,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_social_wealth;

-- v_crosstab_social_values with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_social_values AS
SELECT
  social_class_name,
  values_name,
  frequency,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_social_values;

-- v_crosstab_wealth_values with all percentages rendered as two-decimal strings.
CREATE OR REPLACE VIEW v_formatted_crosstab_wealth_values AS
SELECT
  wealth_name,
  values_name,
  frequency,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_wealth_values;

-- v_gender_analysis with percentage_within_gender rendered as a two-decimal string.
CREATE OR REPLACE VIEW v_formatted_gender_analysis AS
SELECT
  sex,
  social_class_name,
  wealth_name,
  values_name,
  count,
  total_by_gender,
  FORMAT('%0.2f', percentage_within_gender) AS percentage_within_gender,
  rank_within_gender
FROM {data_set}.v_gender_analysis;

Query is running:   0%|          |