In [15]:
%%script false --no-raise-error
# Optional service-account authentication (uses the Google Colab secrets API).
# The `%%script false` magic on the first line hands this cell body to the
# external `false` program instead of running it in the kernel, so the cell is
# effectively disabled; delete that magic line to enable it.
import json

from google.cloud.bigquery import magics
from google.colab import userdata
from google.oauth2 import service_account

# The secret is parsed as JSON and used as service-account key info.
service_account_info = json.loads(userdata.get('BIGQUERY_CREDENTIALS'))
credentials = service_account.Credentials.from_service_account_info(service_account_info)

# Make the BigQuery cell magics authenticate with these credentials.
magics.context.credentials = credentials

Couldn't find program: 'false'


In [16]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
%load_ext bigquery_magics

data_set = "testing_set"
project_name = "emerald-entity-468916-f9"

job_config = bigquery.QueryJobConfig(default_dataset = f"{project_name}.{data_set}")
client = bigquery.Client(project = project_name, default_query_job_config = job_config, credentials = globals().get('credentials', None))
magics.context.default_query_job_config = job_config
magics.context.project = project_name

The bigquery_magics extension is already loaded. To reload it, use:
  %reload_ext bigquery_magics


### Statistical views
**Views that help make sense of the data**

**Almost the same as the `characters` table, but with cluster names and a sanitized gender value**

In [None]:
%%bigquery --pyformat

-- Characters with their three cluster IDs, a normalized sex label, and the
-- name and description of each assigned cluster.
-- Yields one row per characters row, assuming cluster IDs are unique within
-- each cluster type (TODO confirm against the clusters table).
CREATE OR REPLACE VIEW v_characters_enriched AS
WITH cluster_names AS (
  SELECT cluster_type, cluster_id, name AS cluster_name, description
  FROM {data_set}.clusters
),
social_class_names AS (
  SELECT cluster_id, cluster_name AS social_class_name, description AS social_class_desc
  FROM cluster_names
  WHERE cluster_type = 'social_class'
),
wealth_names AS (
  SELECT cluster_id, cluster_name AS wealth_name, description AS wealth_desc
  FROM cluster_names
  WHERE cluster_type = 'wealth'
),
values_names AS (
  SELECT cluster_id, cluster_name AS values_name, description AS values_desc
  FROM cluster_names
  WHERE cluster_type = 'values'
)
SELECT
  c.social_class_cluster_id,
  c.wealth_cluster_id,
  c.values_cluster_id,
  -- Map the listed spellings to two labels; every other value, NULL included,
  -- falls through to the ELSE branch.
  CASE
    WHEN LOWER(TRIM(c.sex)) IN ('male', 'm', 'man') THEN 'male'
    WHEN LOWER(TRIM(c.sex)) IN ('female', 'f', 'woman') THEN 'female'
    ELSE 'unknown'
  END AS sex,
  -- Characters without a matching cluster get placeholder name and description.
  COALESCE(sc.social_class_name, 'unknown') AS social_class_name,
  COALESCE(sc.social_class_desc, '-') AS social_class_desc,
  COALESCE(w.wealth_name, 'unknown') AS wealth_name,
  COALESCE(w.wealth_desc, '-') AS wealth_desc,
  COALESCE(v.values_name, 'unknown') AS values_name,
  COALESCE(v.values_desc, '-') AS values_desc
FROM {data_set}.characters AS c
LEFT JOIN social_class_names AS sc ON c.social_class_cluster_id = sc.cluster_id
LEFT JOIN wealth_names AS w ON c.wealth_cluster_id = w.cluster_id
LEFT JOIN values_names AS v ON c.values_cluster_id = v.cluster_id;

Query is running:   0%|          |

**Shows the gender split and all the clusters identified by the K-means analysis, with counts, percentages and ranks per dimension**

In [None]:
%%bigquery --pyformat

-- Per-dimension breakdown (sex, social_class, wealth, values): row count,
-- share of all characters in percent, and rank by count inside the dimension.
CREATE OR REPLACE VIEW v_cluster_statistics AS
WITH dimension_stats AS (
  -- Sex has no cluster ID or description, so placeholders are used.
  SELECT
    'sex' AS dimension,
    NULL AS cluster_id,
    sex AS cluster_name,
    '-' AS cluster_description,
    COUNT(*) AS count,
    COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS percentage,
    RANK() OVER (PARTITION BY 'sex' ORDER BY COUNT(*) DESC) AS rank_within_dimension
  FROM {data_set}.v_characters_enriched
  GROUP BY sex

  UNION ALL

  -- The remaining branches rely on the column names set by the first SELECT.
  SELECT
    'social_class',
    social_class_cluster_id,
    social_class_name,
    social_class_desc,
    COUNT(*),
    COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100,
    RANK() OVER (PARTITION BY 'social_class' ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_cluster_id, social_class_name, social_class_desc

  UNION ALL

  SELECT
    'wealth',
    wealth_cluster_id,
    wealth_name,
    wealth_desc,
    COUNT(*),
    COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100,
    RANK() OVER (PARTITION BY 'wealth' ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_cluster_id, wealth_name, wealth_desc

  UNION ALL

  SELECT
    'values',
    values_cluster_id,
    values_name,
    values_desc,
    COUNT(*),
    COUNT(*) / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100,
    RANK() OVER (PARTITION BY 'values' ORDER BY COUNT(*) DESC)
  FROM {data_set}.v_characters_enriched
  GROUP BY values_cluster_id, values_name, values_desc
)
SELECT *
FROM dimension_stats
ORDER BY dimension, rank_within_dimension;

Query is running:   0%|          |

**Groups the characters by all four dimensions (gender, wealth, values, social class), keeping only the groups that contain at least 2 characters**

In [None]:
%%bigquery --pyformat

-- Combinations of sex, social class, wealth and values shared by more than
-- one character, with their share of all characters and a rank by size.
CREATE OR REPLACE VIEW v_character_profiles AS
WITH profile_counts AS (
  SELECT
    sex,
    social_class_name,
    wealth_name,
    values_name,
    COUNT(*) AS character_count
  FROM {data_set}.v_characters_enriched
  -- NOTE(review): the grouping also uses the cluster IDs, which are not
  -- output; clusters sharing a name would show up as separate rows with
  -- identical labels. Confirm this is intended.
  GROUP BY
    social_class_cluster_id,
    wealth_cluster_id,
    values_cluster_id,
    sex,
    social_class_name,
    wealth_name,
    values_name
)
SELECT
  *,
  character_count / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS percentage,
  RANK() OVER (ORDER BY character_count DESC) AS popularity_rank
FROM profile_counts
WHERE character_count > 1
ORDER BY character_count DESC;

Query is running:   0%|          |

**Shows relationships between gender and social class**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of sex by social class: count per pair plus its share of
-- the sex total, of the social-class total, and of all characters (percent).
CREATE OR REPLACE VIEW v_crosstab_sex_social AS
WITH cell_counts AS (
  SELECT sex, social_class_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name
),
sex_totals AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
social_class_totals AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
)
SELECT
  cells.sex,
  cells.social_class_name,
  cells.frequency,
  cells.frequency / by_sex.sex_total * 100 AS sex_percentage,
  cells.frequency / by_class.social_class_total * 100 AS social_class_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN sex_totals AS by_sex ON cells.sex = by_sex.sex
JOIN social_class_totals AS by_class ON cells.social_class_name = by_class.social_class_name
ORDER BY cells.sex, cells.frequency DESC;

Query is running:   0%|          |

**Shows relationships between gender and wealth**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of sex by wealth: count per pair plus its share of the
-- sex total, of the wealth total, and of all characters (percent).
CREATE OR REPLACE VIEW v_crosstab_sex_wealth AS
WITH cell_counts AS (
  SELECT sex, wealth_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, wealth_name
),
sex_totals AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
wealth_totals AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cells.sex,
  cells.wealth_name,
  cells.frequency,
  cells.frequency / by_sex.sex_total * 100 AS sex_percentage,
  cells.frequency / by_wealth.wealth_total * 100 AS wealth_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN sex_totals AS by_sex ON cells.sex = by_sex.sex
JOIN wealth_totals AS by_wealth ON cells.wealth_name = by_wealth.wealth_name
ORDER BY cells.sex, cells.frequency DESC;

Query is running:   0%|          |

**Shows relationships between gender and moral values**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of sex by values: count per pair plus its share of the
-- sex total, of the values total, and of all characters (percent).
CREATE OR REPLACE VIEW v_crosstab_sex_values AS
WITH cell_counts AS (
  SELECT sex, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, values_name
),
sex_totals AS (
  SELECT sex, SUM(frequency) AS sex_total
  FROM cell_counts
  GROUP BY sex
),
values_totals AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cells.sex,
  cells.values_name,
  cells.frequency,
  cells.frequency / by_sex.sex_total * 100 AS sex_percentage,
  cells.frequency / by_values.values_total * 100 AS values_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN sex_totals AS by_sex ON cells.sex = by_sex.sex
JOIN values_totals AS by_values ON cells.values_name = by_values.values_name
ORDER BY cells.sex, cells.frequency DESC;

Query is running:   0%|          |

**Shows relationships between social class and wealth**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of social class by wealth: count per pair plus its share
-- of the social-class total, of the wealth total, and of all characters
-- (percent).
CREATE OR REPLACE VIEW v_crosstab_social_wealth AS
WITH cell_counts AS (
  SELECT social_class_name, wealth_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, wealth_name
),
social_class_totals AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
wealth_totals AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
)
SELECT
  cells.social_class_name,
  cells.wealth_name,
  cells.frequency,
  cells.frequency / by_class.social_class_total * 100 AS social_class_percentage,
  cells.frequency / by_wealth.wealth_total * 100 AS wealth_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN social_class_totals AS by_class ON cells.social_class_name = by_class.social_class_name
JOIN wealth_totals AS by_wealth ON cells.wealth_name = by_wealth.wealth_name
ORDER BY cells.social_class_name, cells.frequency DESC;

Query is running:   0%|          |

**Shows relationships between social class and values**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of social class by values: count per pair plus its share
-- of the social-class total, of the values total, and of all characters
-- (percent).
CREATE OR REPLACE VIEW v_crosstab_social_values AS
WITH cell_counts AS (
  SELECT social_class_name, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY social_class_name, values_name
),
social_class_totals AS (
  SELECT social_class_name, SUM(frequency) AS social_class_total
  FROM cell_counts
  GROUP BY social_class_name
),
values_totals AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cells.social_class_name,
  cells.values_name,
  cells.frequency,
  cells.frequency / by_class.social_class_total * 100 AS social_class_percentage,
  cells.frequency / by_values.values_total * 100 AS values_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN social_class_totals AS by_class ON cells.social_class_name = by_class.social_class_name
JOIN values_totals AS by_values ON cells.values_name = by_values.values_name
ORDER BY cells.social_class_name, cells.frequency DESC;

Query is running:   0%|          |

**Shows relationships between wealth and values**

In [None]:
%%bigquery --pyformat

-- Cross-tabulation of wealth by values: count per pair plus its share of the
-- wealth total, of the values total, and of all characters (percent).
CREATE OR REPLACE VIEW v_crosstab_wealth_values AS
WITH cell_counts AS (
  SELECT wealth_name, values_name, COUNT(*) AS frequency
  FROM {data_set}.v_characters_enriched
  GROUP BY wealth_name, values_name
),
wealth_totals AS (
  SELECT wealth_name, SUM(frequency) AS wealth_total
  FROM cell_counts
  GROUP BY wealth_name
),
values_totals AS (
  SELECT values_name, SUM(frequency) AS values_total
  FROM cell_counts
  GROUP BY values_name
)
SELECT
  cells.wealth_name,
  cells.values_name,
  cells.frequency,
  cells.frequency / by_wealth.wealth_total * 100 AS wealth_percentage,
  cells.frequency / by_values.values_total * 100 AS values_percentage,
  cells.frequency / (SELECT COUNT(*) FROM {data_set}.v_characters_enriched) * 100 AS total_percentage
FROM cell_counts AS cells
JOIN wealth_totals AS by_wealth ON cells.wealth_name = by_wealth.wealth_name
JOIN values_totals AS by_values ON cells.values_name = by_values.values_name
ORDER BY cells.wealth_name, cells.frequency DESC;

Query is running:   0%|          |

**Similar to `v_character_profiles`, but with gender-related count and percentage columns**

In [None]:
%%bigquery --pyformat

-- Profiles (sex, social class, wealth, values) shared by more than one
-- character, with their share of and rank within the same sex.
-- The per-sex totals include every profile, also those filtered out below.
CREATE OR REPLACE VIEW v_gender_analysis AS
WITH profile_counts AS (
  SELECT
    sex,
    social_class_name,
    wealth_name,
    values_name,
    COUNT(*) AS count
  FROM {data_set}.v_characters_enriched
  GROUP BY sex, social_class_name, wealth_name, values_name
),
totals_by_sex AS (
  SELECT sex, SUM(count) AS total_by_gender
  FROM profile_counts
  GROUP BY sex
)
SELECT
  pc.*,
  ts.total_by_gender,
  pc.count / ts.total_by_gender * 100 AS percentage_within_gender,
  RANK() OVER (PARTITION BY pc.sex ORDER BY pc.count DESC) AS rank_within_gender
FROM profile_counts AS pc
JOIN totals_by_sex AS ts ON pc.sex = ts.sex
WHERE pc.count > 1
ORDER BY pc.sex, pc.count DESC;

Query is running:   0%|          |

**The same views as above, but with percentages formatted to two decimal places for human analysis**

In [None]:
%%bigquery --pyformat
-- Presentation variants of the statistical views: every percentage column is
-- rendered as a string with two decimals, all other columns pass through.

CREATE OR REPLACE VIEW v_formatted_cluster_statistics AS
SELECT
  dimension,
  cluster_id,
  cluster_name,
  cluster_description,
  count,
  FORMAT('%0.2f', percentage) AS percentage,
  rank_within_dimension
FROM {data_set}.v_cluster_statistics;

CREATE OR REPLACE VIEW v_formatted_character_profiles AS
SELECT
  sex,
  social_class_name,
  wealth_name,
  values_name,
  character_count,
  FORMAT('%0.2f', percentage) AS percentage,
  popularity_rank
FROM {data_set}.v_character_profiles;

CREATE OR REPLACE VIEW v_formatted_crosstab_sex_social AS
SELECT
  sex,
  social_class_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_social;

CREATE OR REPLACE VIEW v_formatted_crosstab_sex_wealth AS
SELECT
  sex,
  wealth_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_wealth;

CREATE OR REPLACE VIEW v_formatted_crosstab_sex_values AS
SELECT
  sex,
  values_name,
  frequency,
  FORMAT('%0.2f', sex_percentage) AS sex_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_sex_values;

CREATE OR REPLACE VIEW v_formatted_crosstab_social_wealth AS
SELECT
  social_class_name,
  wealth_name,
  frequency,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_social_wealth;

CREATE OR REPLACE VIEW v_formatted_crosstab_social_values AS
SELECT
  social_class_name,
  values_name,
  frequency,
  FORMAT('%0.2f', social_class_percentage) AS social_class_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_social_values;

CREATE OR REPLACE VIEW v_formatted_crosstab_wealth_values AS
SELECT
  wealth_name,
  values_name,
  frequency,
  FORMAT('%0.2f', wealth_percentage) AS wealth_percentage,
  FORMAT('%0.2f', values_percentage) AS values_percentage,
  FORMAT('%0.2f', total_percentage) AS total_percentage
FROM {data_set}.v_crosstab_wealth_values;

CREATE OR REPLACE VIEW v_formatted_gender_analysis AS
SELECT
  sex,
  social_class_name,
  wealth_name,
  values_name,
  count,
  total_by_gender,
  FORMAT('%0.2f', percentage_within_gender) AS percentage_within_gender,
  rank_within_gender
FROM {data_set}.v_gender_analysis;

Query is running:   0%|          |