<a href="https://colab.research.google.com/github/jazoza/cultural-data-analysis/blob/main/02_CDA_HH_narratives.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cultural Data Analysis

Introduction to working with datasets

In [None]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import spacy
import string
import pickle

## Loading the dataset: heritage homes websites

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Country code: change here between 'NL' and 'UK'
cc = 'UK'

In [None]:
gdrive_path = '/content/gdrive/MyDrive/CDA/'

In [None]:
# open the data for one country (cc)
raw_data_file = gdrive_path+cc+'_dataset_website-content-crawler.json'

In [None]:
# Import json data
df=pd.read_json(raw_data_file)

# Print the DataFrame
df.head()

In [None]:
# check if there are further datasets to add per country

!ls "$gdrive_path" | grep 'UK'

In [None]:
# Supplementary crawls that complete the UK dataset (see the file listing above).
# They are appended only when the UK dataset is selected, so switching `cc` to 'NL'
# does not silently mix UK pages into the Dutch corpus.
# NOTE: re-running this cell appends the same files again; re-run the loading cell first.
if cc == 'UK':
    # os.path.join avoids the doubled '/' produced by gdrive_path + '/...'
    df_missing1 = pd.read_json(os.path.join(gdrive_path, 'UK_EH_dataset_website-content-crawler_2025-03-26_09-11-52-434.json'))
    df_missing2 = pd.read_json(os.path.join(gdrive_path, 'UK_NH_dataset_website-content-crawler_2025-03-26_16-28-44-248.json'))
    df_missing3 = pd.read_json(os.path.join(gdrive_path, 'UK_PC_dataset_website-content-crawler_2025-03-11_12-28-08-810.json'))
    # ignore_index=True gives every row a unique label instead of repeating 0..n per file
    result = pd.concat([df, df_missing1, df_missing2, df_missing3], ignore_index=True)
    df = result
df.head()

In [None]:
# select only two columns for analysis: url and text
# .copy() makes the selection an independent dataframe, so adding the 'domain'
# column later does not trigger pandas' SettingWithCopyWarning
df = df[['url', 'text']].copy()
df.head()

Group all pages from the same domain into a single entry for the analysis. To do this, add a new column that contains only the main domain name.

In [None]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    """Return the host name found in `url` without a leading 'www.'.

    Args:
        url: A url string, with or without a scheme
            (e.g. 'https://www.example.co.uk/page' or 'www.example.co.uk').

    Returns:
        The first dotted host name in `url` (e.g. 'example.co.uk'), or None
        when `url` is not a string or contains no dotted name.
    """
    # Missing values (None, NaN) and other non-strings cannot hold a domain
    if not isinstance(url, str):
        print('NOT VALID', url)
        return None
    # [\w-] rather than \w so hyphenated hosts such as 'english-heritage.org.uk'
    # are captured whole instead of being cut at the hyphen
    match = re.search(r'(?:[\w-]+\.)*[\w-]+\.[\w-]*', url)
    if match is None:
        return None
    domain = match.group(0)
    # Remove the literal 'www.' prefix only. str.lstrip('www.') would strip any
    # leading 'w' or '.' characters and turn 'west.co.uk' into 'est.co.uk'.
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

In [None]:
# Load the list of urls for the selected country (cc) from a csv file
cc_column = cc+' domains'

urls = pd.read_csv(gdrive_path+'url_lists/'+cc+'_urls.csv')[cc_column].values.tolist()

# Main domains of the listed urls; each url is parsed once and entries
# without a recognisable domain are dropped
domains = {domain for domain in map(extract_main_domain, urls) if domain is not None}

# Urls in the crawled dataframe whose main domain appears in the list of domains
matching_links = [link for link in df.url if extract_main_domain(link) in domains]

In [None]:
# this cell can be skipped, it is only for verification

# check how many lines in the dataframe have a matching link to the list of urls
print(len(matching_links))

In [None]:
# Derive the main domain of every page from its url and store it in a new 'domain' column
df['domain'] = df['url'].map(extract_main_domain)
df.head()

## A. Scrape webpages: screenshots

Automatically make screenshots of all urls, and save them to your local drive for later analysis.

Code to do this is available elsewhere.

Analyze: https://medium.com/@sehjadkhoja0/title-exploring-and-analyzing-image-data-with-python-79a7f72f4d2b

In [None]:
# Install Playwright library
!pip install playwright
# Install Playwright's browser binaries (Chromium, Firefox, WebKit)
!playwright install
!playwright install-deps

In [None]:
import asyncio
from playwright.async_api import async_playwright

# Define the output directory
output_dir = '/content/sample_data/'

In [None]:
async def take_screenshots():
    """Save a full-page PNG screenshot of every url in `urls` to `output_dir`.

    Reads the notebook-level names `urls` (list of url strings) and `output_dir`
    (target folder). A failure on one url is printed and the loop moves on to
    the next url. Files are named 'screenshot_<index>_<sanitized url>.png'.
    """
    # Make sure the target folder exists before the first screenshot is written
    os.makedirs(output_dir, exist_ok=True)

    async with async_playwright() as p:
        # Use Chromium as the browser
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for i, url in enumerate(urls):
                try:
                    print(f"Navigating to: {url}")
                    await page.goto(url, wait_until='networkidle') # wait for network to be idle

                    # Sanitize the URL to create a valid filename: drop the scheme, then
                    # replace every character that is not a letter, digit, '_' or '-'
                    # (this also covers '?', '&', ':' and '=' in query strings)
                    bare_url = re.sub(r'^https?://', '', url)
                    safe_name = re.sub(r'[^A-Za-z0-9_-]', '_', bare_url)
                    # Cap the length so very long urls stay within filesystem name limits
                    filename = f"screenshot_{i}_{safe_name[:150]}.png"
                    filepath = os.path.join(output_dir, filename)

                    # Take full-page screenshot
                    await page.screenshot(path=filepath, full_page=True)
                    print(f"Screenshot saved: {filepath}")
                except Exception as e:
                    print(f"Error taking screenshot for {url}: {e}")
        finally:
            # Close the browser even if something outside the per-url handler fails
            await browser.close()
        print("Screenshot process completed.")

# Run the async function
await take_screenshots()

In [None]:
# OPTIONAL
# compress all image files and download the 'sample_data_screenshots' zip file to your local folder for later
# you have to save it manually by navigating to the 'folder' icon (left pane) and selecting the 3 vertical dots next to sample_data_screenshots.zip

import os
import zipfile
import glob

# Name of the archive; it is written to the current working directory
zip_filename = 'sample_data_screenshots.zip'

# Create a new zip file and add only .png files.
# ZIP_DEFLATED turns on compression; the ZipFile default (ZIP_STORED) only bundles the files.
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Find all .png files recursively starting from output_dir
    png_files = glob.glob(os.path.join(output_dir, '**', '*.png'), recursive=True)
    for file_path in png_files:
        # Ensure the path inside the zip is relative to the output_dir
        arcname = os.path.relpath(file_path, output_dir)
        zipf.write(file_path, arcname=arcname)


### A.1 Visualize image properties

Calculate and visualize the size (MB) and resolution (width, height in pixels) for all images

In [None]:
# Visualize the size (MB)

import cv2
import plotly.express as px

# Root directory path
root_path = "sample_data/"

# File sizes in megabytes, one entry per screenshot
sizes = []

# Iterate over each image file in each subdirectory
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.lower().endswith('.png'):
            img_path = os.path.join(dirpath, filename)

            # The size comes from the filesystem, so the image does not need to be decoded.
            # os.path.getsize returns bytes; convert to MB to match the axis label.
            size = os.path.getsize(img_path) / (1024 * 1024)
            sizes.append(size)

# Convert the list to a numpy array for easier manipulation
sizes = np.array(sizes)

# Create a histogram figure with plotly
fig = px.histogram(x=sizes, nbins=50, title="Distribution of Image Sizes")

# Customize the plot
fig.update_layout(
    xaxis_title="File Size (MB)",
    yaxis_title="Number of Images",
    showlegend=False,
    bargap=0.1,
    bargroupgap=0.1
)

# Show the plot
fig.show()

In [None]:
# Calculate resolution

resolutions = [] # this will tell us how 'long' the pages are, given that they all have the same width in the browser screenshot

# Iterate over each image file in each subdirectory
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.lower().endswith('.png'):
            # Load the image file using OpenCV
            img_path = os.path.join(dirpath, filename)
            img = cv2.imread(img_path)

            # cv2.imread returns None (it does not raise) when a file cannot be decoded
            if img is None:
                print(f"Skipping unreadable image: {img_path}")
                continue

            # Extract the resolution of the image as (height, width)
            resolution = img.shape[:2]
            resolutions.append(resolution)

# Convert the list to a numpy array; reshape keeps it 2-D (n_images, 2) even when it is empty
resolutions = np.array(resolutions).reshape(-1, 2)

In [None]:
# check the 'width' of images - it should be the same for all
resolutions[:, 1]

In [None]:
# Visualize the resolution

import plotly.express as px

# One point per screenshot: column 1 of `resolutions` is the width, column 0 the height
widths, heights = resolutions[:, 1], resolutions[:, 0]
fig = px.scatter(x=widths, y=heights, title="Distribution of Image Resolutions")

# Layout settings collected in one place, then applied in a single call
layout_options = dict(
    xaxis_title="Width (pixels)",
    yaxis_title="Height (pixels)",
    showlegend=False,
    hovermode="closest",
    width=800,
    height=600,
    margin=dict(l=50, r=50, b=50, t=50, pad=4),
)
fig.update_layout(**layout_options)

# Render the figure
fig.show()

### A.2 Analyze the brightness and colour distribution of all images

Calculate greyscale representations, channel distribution (Red, Green, Blue) and real colour distribution

In [None]:
# Greyscale analysis: calculate the tone distribution from 0-255

color_distributions = []

# Iterate over each image file in each subdirectory
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.lower().endswith('.png'):
            img_path = os.path.join(dirpath, filename)
            # Read as a single-channel greyscale image so each pixel contributes exactly
            # one tone value. A default (BGR) read followed by flatten() would mix the
            # three colour channels together and count every pixel three times.
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

            # cv2.imread returns None (it does not raise) when a file cannot be decoded
            if img is None:
                continue

            # Number of pixels for each tone value 0..255
            color_distribution = np.bincount(img.flatten(), minlength=256)
            color_distributions.append(color_distribution)

# Convert the list to a numpy array; reshape keeps it 2-D (n_images, 256) even when it is empty
color_distributions = np.array(color_distributions).reshape(-1, 256)

In [None]:
import plotly.graph_objs as go

# x positions: one per image (row of color_distributions)
image_axis = np.arange(len(color_distributions))

# One stacked bar trace per tone value 0..255, drawn in the grey it represents
# (rgb(0, 0, 0) for tone 0 up to rgb(255, 255, 255) for tone 255)
traces = [
    go.Bar(
        x=image_axis,
        y=color_distributions[:, tone],
        name=str(tone),
        marker_color=f"rgb({tone}, {tone}, {tone})",
    )
    for tone in range(256)
]

# Assemble the figure, stack the tones per image and display it
fig = go.Figure(data=traces)
fig.update_layout(
    title="Grayscale Tone Distributions for Each Image",
    xaxis_title="Image Index",
    yaxis_title="Number of Pixels",
    barmode="stack",
)
fig.show()

### A.3 Analyze the colour composition of website screenshots

In [None]:
import cv2

# Per-image histograms of the three colour channels.
# Each entry is [blue_hist, green_hist, red_hist]; every histogram has 256 bins.
channel_color_distributions = []

# Walk the screenshot folder and process every .png file
for dirpath, dirnames, filenames in os.walk(root_path):
    png_names = [name for name in filenames if name.lower().endswith('.png')]
    for png_name in png_names:
        img_path = os.path.join(dirpath, png_name)
        img = cv2.imread(img_path)  # OpenCV returns the channels in B, G, R order

        # Files that OpenCV could not decode are skipped
        if img is None:
            continue

        # cv2.split yields the B, G and R planes in that order; count the values of each
        channel_color_distributions.append(
            [np.bincount(plane.flatten(), minlength=256) for plane in cv2.split(img)]
        )

# Stack into one array of shape (num_images, 3, 256); axis 1 is B, G, R
channel_color_distributions = np.array(channel_color_distributions)

In [None]:
import plotly.graph_objects as go

# Calculate the total pixel count for each channel for each image
# The shape of channel_color_distributions is (num_images, 3, 256)
# Summing along the last axis (axis=2) gives us (num_images, 3) total pixel counts
# NOTE(review): each channel histogram counts every pixel of the image once, so the three
# totals of an image are equal. An intensity-weighted sum would be needed to compare channels.
total_channel_pixels = np.sum(channel_color_distributions, axis=2)

# Separate the total pixel counts for each channel (OpenCV channel order is B, G, R)
total_b_pixels = total_channel_pixels[:, 0]
total_g_pixels = total_channel_pixels[:, 1]
total_r_pixels = total_channel_pixels[:, 2]

# Create a list of image indices for the x-axis
image_indices = [f"Image {i+1}" for i in range(len(channel_color_distributions))]

# Create traces for each channel, with each image on the x-axis.
# The legend name of every trace matches the channel and the colour it is drawn in.
trace_b = go.Bar(x=image_indices, y=total_b_pixels, name='B', marker_color='blue')
trace_g = go.Bar(x=image_indices, y=total_g_pixels, name='G', marker_color='green')
trace_r = go.Bar(x=image_indices, y=total_r_pixels, name='R', marker_color='red')

# Create the figure and set the layout
fig = go.Figure(data=[trace_b, trace_g, trace_r])
fig.update_layout(
    title="Total RGB pixel counts ",
    xaxis_title="Image",
    yaxis_title="Total Number of Pixels",
    barmode='stack' # Stack the bars for each image
)

# Show the plot
fig.show()

In [None]:
# Load every readable .png screenshot under root_path into memory (BGR arrays)
image_data = []

for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.lower().endswith('.png'):
            img_path = os.path.join(dirpath, filename)
            img = cv2.imread(img_path)
            # cv2.imread returns None for files it cannot decode; those are skipped
            if img is not None:
                image_data.append(img)

# Report the result without indexing into an empty list when nothing was loaded
if image_data:
    print(f"Loaded {len(image_data)} images. First image shape: {image_data[0].shape}")
else:
    print(f"Loaded 0 images: no readable .png files found under {root_path}")

In [None]:
def categorize_color(b, g, r):
    """Assign one BGR pixel to a coarse, human-readable colour name.

    Args:
        b, g, r: Blue, green and red channel values of the pixel (OpenCV order).

    Returns:
        One of 'White', 'Black', 'Gray', 'Red', 'Green', 'Blue', 'Yellow',
        'Cyan', 'Magenta', 'Orange', 'Brown' or 'Other'. The rules are tried
        in that order and the first match wins.
    """
    # Widen to signed 16-bit before subtracting so unsigned 8-bit values cannot wrap around
    red16, green16, blue16 = np.int16(r), np.int16(g), np.int16(b)
    channel_gaps = (abs(red16 - green16), abs(red16 - blue16), abs(green16 - blue16))

    # All three channels nearly equal -> achromatic pixel, graded by the red value
    if all(gap < 10 for gap in channel_gaps):
        if r > 200:
            return 'White'
        return 'Black' if r < 50 else 'Gray'

    strong_r, strong_g, strong_b = r > 200, g > 200, b > 200
    weak_r, weak_g, weak_b = r < 100, g < 100, b < 100

    # (label, condition) pairs: primaries, then secondaries, then two simplified shades
    ordered_rules = (
        ('Red', strong_r and weak_g and weak_b),
        ('Green', weak_r and strong_g and weak_b),
        ('Blue', weak_r and weak_g and strong_b),
        ('Yellow', strong_r and strong_g and weak_b),
        ('Cyan', weak_r and strong_g and strong_b),
        ('Magenta', strong_r and weak_g and strong_b),
        ('Orange', r > 150 and g > 100 and weak_b),
        ('Brown', r > 100 and weak_g and b < 50),
    )
    for label, matched in ordered_rules:
        if matched:
            return label

    return 'Other'

In [None]:
# Count, for every image, how many pixels fall into each colour category.
# Pixels are first collapsed to their distinct BGR values, so the pure-Python
# categorize_color runs once per distinct colour instead of once per pixel.
# The resulting counts are the same as a pixel-by-pixel loop would give.

from collections import defaultdict
from tqdm.notebook import tqdm # Import tqdm for progress bar

image_color_counts = []
color_categories = ['White', 'Black', 'Gray', 'Red', 'Green', 'Blue', 'Yellow', 'Cyan', 'Magenta', 'Orange', 'Brown', 'Other']

# Wrap the image_data iteration with tqdm for a progress bar
for img in tqdm(image_data, total=len(image_data), desc="Processing Images"):
    current_image_counts = defaultdict(int)
    # Reshape image to a list of pixels (height * width, 3)
    pixels = img.reshape(-1, 3)

    # Pack each B, G, R triple into one 24-bit integer so distinct colours can be
    # counted with a single 1-D np.unique call
    packed_colors = (
        (pixels[:, 0].astype(np.uint32) << 16)
        | (pixels[:, 1].astype(np.uint32) << 8)
        | pixels[:, 2].astype(np.uint32)
    )
    unique_colors, pixel_counts = np.unique(packed_colors, return_counts=True)

    for packed_color, n_pixels in zip(unique_colors.tolist(), pixel_counts.tolist()):
        # Unpack in the same B, G, R order used when packing
        b = (packed_color >> 16) & 0xFF
        g = (packed_color >> 8) & 0xFF
        r = packed_color & 0xFF
        category = categorize_color(b, g, r)
        current_image_counts[category] += n_pixels

    # Convert defaultdict to a regular dict and append to the list
    # Ensure all categories are present, even if their count is 0
    full_counts = {category: current_image_counts[category] for category in color_categories}
    image_color_counts.append(full_counts)

print(f"Processed color counts for {len(image_color_counts)} images.")
# print(image_color_counts[0]) # Print first image's color counts to verify

In [None]:
# check whether the image_color_counts includes all images (62) and all categories (12)
len(image_color_counts[0])

In [None]:
import plotly.graph_objects as go
import numpy as np

# Inputs: image_color_counts (one {category: pixel count} dict per image) and
# color_categories (the ordered category names), both built in the cells above

# One x-axis label per processed image
image_labels = [f'Image {i+1}' for i in range(len(image_color_counts))]

# Colour used to draw each category in the chart
color_map = {
    'White': 'rgb(255, 255, 255)',
    'Black': 'rgb(0, 0, 0)',
    'Gray': 'rgb(128, 128, 128)',
    'Red': 'rgb(255, 0, 0)',
    'Green': 'rgb(0, 255, 0)',
    'Blue': 'rgb(0, 0, 255)',
    'Yellow': 'rgb(255, 255, 0)',
    'Cyan': 'rgb(0, 255, 255)',
    'Magenta': 'rgb(255, 0, 255)',
    'Orange': 'rgb(255, 165, 0)',
    'Brown': 'rgb(165, 42, 42)',
    'Other': 'rgb(255, 224, 200)' # A neutral color for 'Other'
}

# One bar trace per category; its y values are that category's pixel counts across all images.
# Categories missing from color_map fall back to a default grey.
traces = [
    go.Bar(
        x=image_labels,
        y=[img_counts[category] for img_counts in image_color_counts],
        name=category,
        marker_color=color_map.get(category, 'rgb(150, 150, 150)'),
    )
    for category in color_categories
]

# Build the stacked bar chart
fig = go.Figure(data=traces)
fig.update_layout(
    title="Distribution of colors (categorized w-o) across all images",
    xaxis_title="Image",
    yaxis_title="Number of Pixels",
    barmode='stack', # Stack the bars for each image
    hovermode='x unified' # Show hover info for all stacks at once
)

# Render the figure
fig.show()

## B. Discourse Analysis

### B.1 Close reading one document
(we will consider one website as a 'document')

In [None]:
# list all unique domain names
df['domain'].unique()

In [None]:
# extract all rows (lines) where the value of 'domain' is 'lancashire.gov.uk'
df[df['domain'] == 'lancashire.gov.uk']

In [None]:
# then combine these into a list of pages
document = df[df['domain'] == 'lancashire.gov.uk']['text'].tolist()

In [None]:
document

In [None]:
# Flatten every page to a single line: newlines become spaces, carriage returns are removed
clean_document = [page.replace('\n', ' ').replace('\r', '') for page in document]

In [None]:
clean_document

### B.2 Topic Modelling

In [None]:
import sklearn.feature_extraction.text as text

# Bag-of-words settings:
#   lowercase  - fold all text to lower case before counting
#   min_df     - ignore words occurring in fewer than 100 documents
#   stop_words - ignore very common English words ("the", "and", "or", "to", ...)
vectorizer_options = dict(lowercase=True, min_df=100, stop_words='english')
vec = text.CountVectorizer(**vectorizer_options)

# Document-term matrix: one row per page in df['text'], one column per kept word
dtm = vec.fit_transform(df['text'])

In [None]:
print(f'Shape of document-term matrix: {dtm.shape}. '
      f'Number of tokens {dtm.sum()}')

In [None]:
import sklearn.decomposition as decomposition
# Number of topics the model is asked to find
NUM_TOPICS = 10

# random_state is fixed so that repeated runs give the same topics
lda_model = decomposition.LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    learning_method='online',
    random_state=1,
)

# Fit on the document-term matrix and keep the per-document topic weights
lda_Z = lda_model.fit_transform(dtm)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

In [None]:
# lda_Z already holds the document-topic distributions returned by fit_transform in the
# previous cell; reuse it instead of fitting the whole model a second time
document_topic_distributions = lda_Z

In [None]:
# Print the top_n highest-weighted words of every topic as (word, weight) pairs
top_n = 10
# Build the vocabulary array once instead of once per printed word
feature_names = vec.get_feature_names_out()
for idx, topic in enumerate(lda_model.components_):
    print("Topic %d:" % (idx))
    # argsort is ascending, so the reversed slice walks the top_n largest weights first
    top_word_indices = topic.argsort()[:-top_n - 1:-1]
    print([(feature_names[i], topic[i]) for i in top_word_indices])

In [None]:
no_top_words = 12
no_top_documents = 5
lda_H = lda_model.components_
tf_feature_names = vec.get_feature_names_out()

def display_topics(H, Z, feature_names, docs, no_top_words, no_top_documents):
    """Print the top keywords and the most representative documents of every topic.

    Args:
        H: Topic-word weights, one row per topic.
        Z: Document-topic weights, one row per document and one column per topic.
        feature_names: Vocabulary, indexable by the column index of H.
        docs: Document labels, indexable by the row index of Z.
        no_top_words: Number of keywords printed per topic.
        no_top_documents: Number of documents printed per topic.
    """
    for topic_id, word_weights in enumerate(H):
        print("Topic %d:" % (topic_id))

        # Word indices ordered from highest to lowest weight, cut to the requested number
        keyword_ids = word_weights.argsort()[::-1][:no_top_words]
        keywords = [feature_names[word_id] for word_id in keyword_ids]
        print("KEYWORDS", " ".join(keywords))

        # Documents ordered from highest to lowest weight for this topic;
        # useful for checking which documents are most characteristic of it
        ranked_doc_ids = np.argsort(Z[:, topic_id])[::-1]
        for doc_id in ranked_doc_ids[:no_top_documents]:
            print("TOP DOCS", docs[doc_id])

display_topics(lda_H, lda_Z, tf_feature_names, df['url'].tolist(), no_top_words, no_top_documents)

### B.5 Word2Vec model

In [None]:
nltk.download('punkt_tab')

In [None]:
!pip install gensim

In [None]:
import gensim
from nltk.tokenize import word_tokenize

# Tokenize every page: X is a list of token lists, one inner list per page
X = [word_tokenize(page_text) for page_text in df['text'].tolist()]

# min_count: minimum number of times a word must appear in the corpus to be kept
# vector_size: number of dimensions of each word vector
model = gensim.models.Word2Vec(sentences=X, min_count=6, vector_size=200)

Now try out some keywords that may be characteristic in the corpus on heritage homes, such as 'castle', 'garden', 'party', 'princess'; try also words related to less obvious themes, like 'servant'

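You can query the model with either `positive` or `negative` keywords: `positive` returns the terms most similar to the keyword, while `negative` returns the terms least similar to it. Explore how these 'opposite' terms contrast with the keyword's meaning in a variety of ways.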

In [None]:
model.wv.most_similar(positive=["castle"], topn=12)

In [None]:
model.wv.most_similar(positive=["garden"], topn=12)

In [None]:
model.wv.most_similar(positive=["servant"], topn=12)

In [None]:
model.wv.most_similar(negative=["princess"], topn=12)