In [2]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
%load_ext bigquery_magics

# Notebook-wide BigQuery settings. `data_set` is also what the
# `%%bigquery --pyformat` cells substitute for their `{data_set}` placeholder.
project_name = "emerald-entity-468916-f9"
data_set = "testing_set"
library_path = "gdelt-bq.internetarchivebooks"  # not referenced by the visible cells

# Default dataset is "<project>.<dataset>", so names written without a
# dataset qualifier are looked up there.
job_config = bigquery.QueryJobConfig(
    default_dataset=".".join((project_name, data_set)),
)

# Share one job config between the Python client and the cell magics.
client = bigquery.Client(
    project=project_name,
    default_query_job_config=job_config,
)
magics.context.default_query_job_config = job_config

In [6]:
%%bigquery --pyformat

-- Characters joined to the name and description of each of their three
-- clusters (social class, wealth, values). Only rows whose sex is 'male' or
-- 'female' and whose three cluster ids all match a cluster row are kept.
CREATE OR REPLACE VIEW v_characters_enriched AS
WITH cluster_lookup AS (
  SELECT
    cluster_type,
    cluster_id,
    name AS cluster_name,
    description
  FROM {data_set}.clusters
),
-- One lookup per cluster type, because cluster_id is matched per type below.
social_class_lookup AS (
  SELECT
    cluster_id,
    cluster_name AS social_class_name,
    description AS social_class_desc
  FROM cluster_lookup
  WHERE cluster_type = 'social_class'
),
wealth_lookup AS (
  SELECT
    cluster_id,
    cluster_name AS wealth_name,
    description AS wealth_desc
  FROM cluster_lookup
  WHERE cluster_type = 'wealth'
),
values_lookup AS (
  SELECT
    cluster_id,
    cluster_name AS values_name,
    description AS values_desc
  FROM cluster_lookup
  WHERE cluster_type = 'values'
)
SELECT
  ch.social_class_cluster_id,
  ch.wealth_cluster_id,
  ch.values_cluster_id,
  ch.sex,
  scl.social_class_name,
  scl.social_class_desc,
  wl.wealth_name,
  wl.wealth_desc,
  vl.values_name,
  vl.values_desc
FROM {data_set}.characters AS ch
INNER JOIN social_class_lookup AS scl
  ON ch.social_class_cluster_id = scl.cluster_id
INNER JOIN wealth_lookup AS wl
  ON ch.wealth_cluster_id = wl.cluster_id
INNER JOIN values_lookup AS vl
  ON ch.values_cluster_id = vl.cluster_id
WHERE ch.sex IN ('male', 'female')
  AND ch.social_class_cluster_id IS NOT NULL
  AND ch.wealth_cluster_id IS NOT NULL
  AND ch.values_cluster_id IS NOT NULL;

Query is running:   0%|          |

In [14]:
%%bigquery --pyformat

-- One row per sex value and per cluster: its character count, its share of
-- all rows in v_characters_enriched formatted as a two-decimal string, and
-- its rank by count inside its own dimension.
CREATE OR REPLACE VIEW v_cluster_statistics AS
WITH dimension_stats AS (
  -- The sex dimension has no cluster id or description, hence NULL and '-'.
  SELECT
    'sex' AS dimension,
    NULL AS cluster_id,
    sex AS cluster_name,
    '-' AS cluster_description,
    COUNT(*) AS count,
    FORMAT('%0.2f', COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS percentage,
    -- Every UNION ALL branch holds exactly one dimension, so ranking the
    -- whole branch is ranking within that dimension.
    RANK() OVER (ORDER BY COUNT(*) DESC) AS rank_within_dimension
  FROM {data_set}.v_characters_enriched
  GROUP BY sex

  UNION ALL

  SELECT
    'social_class',
    social_class_cluster_id,
    social_class_name,
    social_class_desc,
    COUNT(*),
    FORMAT('%0.2f', COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100),
    RANK() OVER (ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_cluster_id, social_class_name, social_class_desc

  UNION ALL

  SELECT
    'wealth',
    wealth_cluster_id,
    wealth_name,
    wealth_desc,
    COUNT(*),
    FORMAT('%0.2f', COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100),
    RANK() OVER (ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_cluster_id, wealth_name, wealth_desc

  UNION ALL

  SELECT
    'values',
    values_cluster_id,
    values_name,
    values_desc,
    COUNT(*),
    FORMAT('%0.2f', COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100),
    RANK() OVER (ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY values_cluster_id, values_name, values_desc
)
SELECT *
FROM dimension_stats
ORDER BY dimension, rank_within_dimension;

Query is running:   0%|          |

In [None]:
%%bigquery --pyformat

-- Combinations of sex, social class, wealth and values shared by more than
-- one character, ranked by how many characters have them.
CREATE OR REPLACE VIEW v_character_profiles AS
WITH profile_counts AS (
  SELECT
    sex,
    social_class_name,
    wealth_name,
    values_name,
    COUNT(*) AS character_count
  FROM {data_set}.v_characters_enriched
  -- Grouping uses the cluster ids as well as the names.
  GROUP BY
    social_class_cluster_id,
    wealth_cluster_id,
    values_cluster_id,
    sex,
    social_class_name,
    wealth_name,
    values_name
)
SELECT
  *,
  -- The denominator is every row of v_characters_enriched, including
  -- characters whose profile is removed by the filter below.
  FORMAT('%0.2f', character_count / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS percentage,
  RANK() OVER (ORDER BY character_count DESC) AS popularity_rank
FROM profile_counts
WHERE character_count > 1
ORDER BY character_count DESC;

Query is running:   0%|          |

In [24]:
%%bigquery --pyformat

-- Sex by social class cross-tabulation. Each cell carries its count and its
-- share of the sex total, of the social class total and of all characters,
-- each formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_sex_social AS
WITH cell_counts AS (
  SELECT
    sex,
    social_class_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
)
SELECT
  cell.sex,
  cell.social_class_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / ps.sex_total * 100) AS sex_percentage,
  FORMAT('%0.2f', cell.frequency / psc.social_class_total * 100) AS social_class_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_sex AS ps
  ON cell.sex = ps.sex
INNER JOIN per_social_class AS psc
  ON cell.social_class_name = psc.social_class_name
ORDER BY cell.sex, cell.frequency DESC;

Query is running:   0%|          |

In [25]:
%%bigquery --pyformat

-- Sex by wealth cross-tabulation. Each cell carries its count and its share
-- of the sex total, of the wealth total and of all characters, each
-- formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_sex_wealth AS
WITH cell_counts AS (
  SELECT
    sex,
    wealth_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, wealth_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cell.sex,
  cell.wealth_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / ps.sex_total * 100) AS sex_percentage,
  FORMAT('%0.2f', cell.frequency / pw.wealth_total * 100) AS wealth_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_sex AS ps
  ON cell.sex = ps.sex
INNER JOIN per_wealth AS pw
  ON cell.wealth_name = pw.wealth_name
ORDER BY cell.sex, cell.frequency DESC;

Query is running:   0%|          |

In [26]:
%%bigquery --pyformat

-- Sex by values cross-tabulation. Each cell carries its count and its share
-- of the sex total, of the values total and of all characters, each
-- formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_sex_values AS
WITH cell_counts AS (
  SELECT
    sex,
    values_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, values_name
),
per_sex AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cell.sex,
  cell.values_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / ps.sex_total * 100) AS sex_percentage,
  FORMAT('%0.2f', cell.frequency / pv.values_total * 100) AS values_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_sex AS ps
  ON cell.sex = ps.sex
INNER JOIN per_values AS pv
  ON cell.values_name = pv.values_name
ORDER BY cell.sex, cell.frequency DESC;

Query is running:   0%|          |

In [35]:
%%bigquery --pyformat

-- Social class by wealth cross-tabulation. Each cell carries its count and
-- its share of the social class total, of the wealth total and of all
-- characters, each formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_social_wealth AS
WITH cell_counts AS (
  SELECT
    social_class_name,
    wealth_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, wealth_name
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cell.social_class_name,
  cell.wealth_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / psc.social_class_total * 100) AS social_class_percentage,
  FORMAT('%0.2f', cell.frequency / pw.wealth_total * 100) AS wealth_class_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_social_class AS psc
  ON cell.social_class_name = psc.social_class_name
INNER JOIN per_wealth AS pw
  ON cell.wealth_name = pw.wealth_name
ORDER BY cell.social_class_name, cell.frequency DESC;

Query is running:   0%|          |

In [36]:
%%bigquery --pyformat

-- Social class by values cross-tabulation. Each cell carries its count and
-- its share of the social class total, of the values total and of all
-- characters, each formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_social_values AS
WITH cell_counts AS (
  SELECT
    social_class_name,
    values_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, values_name
),
per_social_class AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cell.social_class_name,
  cell.values_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / psc.social_class_total * 100) AS social_class_percentage,
  FORMAT('%0.2f', cell.frequency / pv.values_total * 100) AS values_class_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_social_class AS psc
  ON cell.social_class_name = psc.social_class_name
INNER JOIN per_values AS pv
  ON cell.values_name = pv.values_name
ORDER BY cell.social_class_name, cell.frequency DESC;

Query is running:   0%|          |

In [37]:
%%bigquery --pyformat

-- Wealth by values cross-tabulation. Each cell carries its count and its
-- share of the wealth total, of the values total and of all characters,
-- each formatted as a two-decimal string.
CREATE OR REPLACE VIEW v_crosstab_wealth_values AS
WITH cell_counts AS (
  SELECT
    wealth_name,
    values_name,
    COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_name, values_name
),
per_wealth AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
),
per_values AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cell.wealth_name,
  cell.values_name,
  cell.frequency,
  FORMAT('%0.2f', cell.frequency / pw.wealth_total * 100) AS wealth_percentage,
  FORMAT('%0.2f', cell.frequency / pv.values_total * 100) AS values_class_percentage,
  FORMAT('%0.2f', cell.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100) AS total_percentage
FROM cell_counts AS cell
INNER JOIN per_wealth AS pw
  ON cell.wealth_name = pw.wealth_name
INNER JOIN per_values AS pv
  ON cell.values_name = pv.values_name
ORDER BY cell.wealth_name, cell.frequency DESC;

Query is running:   0%|          |

In [40]:
%%bigquery --pyformat

-- Profiles (social class, wealth, values) per sex that occur more than once,
-- with their share of that sex and their rank within it.
CREATE OR REPLACE VIEW v_gender_analysis AS
WITH profile_counts AS (
  SELECT
    sex,
    social_class_name,
    wealth_name,
    values_name,
    COUNT(*) AS count
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name, wealth_name, values_name
),
-- Totals are summed over every profile, including the single-character
-- profiles that the final filter removes.
sex_totals AS (
  SELECT sex, SUM(count) AS total_by_gender
  FROM profile_counts
  GROUP BY sex
)
SELECT
  pc.*,
  tot.total_by_gender,
  FORMAT('%0.2f', pc.count / tot.total_by_gender * 100) AS percentage_within_gender,
  RANK() OVER (PARTITION BY pc.sex ORDER BY pc.count DESC) AS rank_within_gender
FROM profile_counts AS pc
INNER JOIN sex_totals AS tot
  ON pc.sex = tot.sex
WHERE pc.count > 1
ORDER BY pc.sex, pc.count DESC;

Query is running:   0%|          |