# Cultural Data Analysis

Introduction to working with datasets

In [None]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import gensim, nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import string
import pickle

## Loading the dataset: heritage homes websites

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
gdrive_path = '/content/gdrive/MyDrive/CDA/'

In [None]:
# Country codes of the four corpora; the position of a code in this list is used as its key ('0'-'3') further down
cc_list = ['NL', 'UK', 'DE', 'FR']

In [None]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    """Return the host part of `url` without a leading 'www.', or None.

    The host is taken as the first dotted run of word characters found in the
    string, e.g. 'https://www.example.com/page' -> 'example.com'.

    Parameters
    ----------
    url : str
        A URL or a bare domain name.

    Returns
    -------
    str or None
        The extracted domain, or None when `url` is not a string (a
        'NOT VALID' line is printed in that case) or holds no dotted name.
    """
    # Test the value itself: wrapping it in str() first makes the check always pass.
    if not isinstance(url, str):
        print('NOT VALID',url)
        return None
    match = re.search(r'(?:\w+\.)*\w+\.\w*', url) #'www\.?([^/]+)'
    if match is None:
        return None
    domain = match.group(0)
    # Drop only a literal 'www.' prefix. str.lstrip('www.') removes any leading
    # run of 'w' and '.' characters, which mangles names like 'web.archive.org'.
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

In [None]:
# Import json data from Apify scraping into 4 separate dataframes,
# one per country code in cc_list (df0 <-> cc_list[0], ..., df3 <-> cc_list[3])
def load_crawl(cc):
    """Read the crawler export for country code `cc` from gdrive_path.

    Returns a dataframe with only the two columns used for analysis:
    'url' and 'text'.
    """
    crawl = pd.read_json(gdrive_path+cc+'_dataset_website-content-crawler.json')
    return crawl[['url','text']]

# the later cells refer to the frames by these four names, so keep them
df0, df1, df2, df3 = (load_crawl(cc) for cc in cc_list[:4])

df0.head()

Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [None]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column

# first, create a mapping of dataframes which could be addressed in a loop;
# each key is the position of the matching country code in cc_list
df_dict = {'0':df0, '1':df1, '2':df2, '3':df3}

# then, loop through the df_dict to update each dataframe
for k, v in df_dict.items():
  # look the country code up once instead of re-indexing cc_list for every use
  cc = cc_list[int(k)]
  cc_column = cc+' domains'
  # print(cc_column, cc)
  urls = pd.read_csv(gdrive_path+cc+'_urls.csv')[cc_column].values.tolist()
  # domains listed for this country; every listed URL is parsed exactly once
  domains = {domain for domain in map(extract_main_domain, urls) if domain is not None}
  # update the dataframe
  v['domain'] = v['url'].apply(extract_main_domain)
  # crawled links whose domain is on the country's list, reusing the column computed above
  # NOTE(review): domains and matching_links are not used again in this cell, so no
  # filtering of the dataframe actually happens here - confirm whether that is intended
  matching_links = v.loc[v['domain'].isin(domains), 'url'].tolist()

In [None]:
# check one of the dataframes
df1.head()

#### Preparing the text (frequencies, stopwords)

In [None]:
# make all stopword files stored in github available in this notebook:
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/DE.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/FR.txt'

In [None]:
# load a list of 'stopwords' in the language you are analyzing
def get_stopwords_list(stop_file_path):
    """Load the stop words stored one per line in a UTF-8 text file.

    Parameters
    ----------
    stop_file_path : str
        Path of the stop word file.

    Returns
    -------
    list of str
        The distinct lines of the file with surrounding whitespace removed,
        sorted so that repeated runs return the same list in the same order.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        # the set drops duplicate entries; sorting replaces the arbitrary set order
        return sorted({line.strip() for line in f})

# one stopword list per country, keyed by the country's position in cc_list (as a string);
# the stopword file for a country is expected in the working directory as '<code>.txt'
sw_dict = {
    str(i): get_stopwords_list(cc + ".txt")
    for i, cc in enumerate(cc_list)
}

In [None]:
sw_dict.values()

In [None]:
# convert the list of lists into one comprehensive list of stopwords in all languages
def flatten_comprehension(dict):
  """Concatenate the items of every inner iterable into one flat list.

  Despite its name, `dict` may be any iterable of iterables (the notebook
  passes the values view of a dictionary). The order of the rows and of the
  items inside each row is kept.
  """
  flat = []
  for row in dict:
    flat.extend(row)
  return flat

sw_all = flatten_comprehension(sw_dict.values())
# verify that the result is a flat list of 1-word strings by printing one item, for example the 4th (index 3)
print(sw_all[3])

In [None]:
# extend the stopwords list with any other words you want to exclude from analysis
# ('’s' is the possessive suffix written with a typographic apostrophe; it needs no backslash)
special_stop_words = ['nbsp', ' ', '', '—', '’s', 'ii', 'iii', 'iiii', 'www']
# add only the words that are not in the list yet, so re-running this cell does not pile up duplicates
sw_all.extend(word for word in special_stop_words if word not in sw_all)

In [None]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

## Compare corpus structure across corpora

In [None]:
# Per-corpus size summary: all counted tokens, tokens counted when the words in sw_all
# are excluded, and the number of distinct tokens

# Results are collected per corpus key
corpus_structure = {}

for key, df in df_dict.items():
  texts = df['text']

  # full vocabulary: no stop word filtering
  vectorizer = CountVectorizer(stop_words=None)
  X = vectorizer.fit_transform(texts)

  # same texts, vectorized again with sw_all passed as the stop word list
  vectorizer_no_stopwords = CountVectorizer(stop_words=sw_all)
  X_no_stopwords = vectorizer_no_stopwords.fit_transform(texts)

  # keep this key order: later cells read the values positionally
  corpus_structure[key] = dict(
      total_words=X.sum(),
      words_without_stopwords=X_no_stopwords.sum(),
      unique_words=len(vectorizer.vocabulary_),
  )

In [None]:
corpus_structure

In [None]:
# Print the three counts of every corpus, one corpus after the other
for key, value in corpus_structure.items():
    print(
        f"Corpus {key}:",
        f"  Total words: {value['total_words']}",
        f"  Words without stopwords: {value['words_without_stopwords']}",
        f"  Unique words: {value['unique_words']}",
        sep="\n",
    )

In [None]:
# Print, for each corpus, the first value stored in its inner dict
# (dict values keep insertion order, so this is the count that was added first)
for k,v in corpus_structure.items():
  print(list(v.values())[0])

In [None]:
# Visualize corpus structure per country: one group of three bars for each country code
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

# Extract data for plotting
labels = cc_list # Use cc_list for labels
total_words, words_without_stopwords, unique_words = (
    [corpus_structure[str(i)][metric] for i in range(len(cc_list))]
    for metric in ('total_words', 'words_without_stopwords', 'unique_words')
)

x = range(len(labels))
width = 0.2

# each series is shifted right by one more bar width than the previous one
series = (
    ('Total Words', total_words),
    ('Words without Stopwords', words_without_stopwords),
    ('Unique Words', unique_words),
)
for offset, (legend_label, counts) in enumerate(series):
    ax.bar([i + offset * width for i in x], counts, width, label=legend_label)

# ticks sit under the middle bar of each group
ax.set_xticks([i + width for i in x])
ax.set_xticklabels(labels)
ax.set_xlabel('Country Code')
ax.set_ylabel('Word Count')
ax.set_title('Corpus Structure Comparison')
ax.legend()
fig.tight_layout() # improve spacing
plt.show()

In [None]:
# Visualize countries in corpus structure: one group of bars per category, one bar per country
plt.figure(figsize=(10, 6))

categories = ['total_words', 'words_without_stopwords', 'unique_words']
bar_width = 0.2
group_start = np.arange(len(categories))

for i, (key, value) in enumerate(corpus_structure.items()):
    x_pos = group_start + i * bar_width
    # look the counts up by name so the bars always follow `categories`,
    # whatever order the inner dict was filled in
    counts = [value[category] for category in categories]
    plt.bar(x_pos, counts, width=bar_width, label=cc_list[int(key)]) # Use cc_list for labels

# put each tick under the middle of its group of bars, however many corpora are plotted
tick_offset = bar_width * (len(corpus_structure) - 1) / 2
plt.xticks(group_start + tick_offset, categories)
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.title('Corpus Structure')
plt.legend()
plt.tight_layout()
plt.show()

## Compare collocations across corpora

In [None]:
# load the saved pickles, one per country, from the 'jar' folder under gdrive_path;
# results are keyed by the country's position in cc_list (as a string)
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself
lemma_dict = {}
for i, cc in enumerate(cc_list):
  lemma_path = f"{gdrive_path}jar/{cc}_unlist_documents.pickle"
  with open(lemma_path, 'rb') as handle_u:
    lemma_dict[str(i)] = pickle.load(handle_u)

len(lemma_dict)

In [None]:
# check if the word 'kasteel' appears in one of the lemma_dict items (keys: '0'='NL', '1'='UK', '2'='DE', '3'='FR')
if 'kasteel' in lemma_dict['3']:
  print('yes')

In [None]:
lemma_all = flatten_comprehension(lemma_dict.values())
# verify that the result is a flat list of 1-word strings by printing one item, for example
# the one at index 1044 of the comprehensive list of all lemmatized words in the four languages
# (this inspects lemma_all itself, not the stopword list sw_all)
print(lemma_all[1044])

In [None]:
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# initiate bigrams and trigrams
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [None]:
# identify all collocations in the flat list of words from all documents
# NOTE(review): lemma_all is one continuous sequence built from every corpus, so n-grams that
# straddle the boundary between two documents or two languages are presumably counted too -
# confirm that this is acceptable for the analysis
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(lemma_all)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(lemma_all)

In [None]:
# compute basic bigram frequency: one row per distinct bigram, most frequent first
bigramFreqTable = (
    pd.DataFrame(list(bigramFinder.ngram_fd.items()), columns=['bigram','freq'])
    .sort_values(by='freq', ascending=False)
)
# compute basic trigram frequency in the same way
trigramFreqTable = (
    pd.DataFrame(list(trigramFinder.ngram_fd.items()), columns=['trigram','freq'])
    .sort_values(by='freq', ascending=False)
)

In [None]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Return True when `ngram` is an (adjective or noun) + noun pair of content words.

    The n-gram is rejected as soon as one of its words is listed in `sw_all`
    or is at most two characters long. Otherwise the words are tagged with
    `nltk.pos_tag`; the first tag must be an adjective or noun tag and the
    second tag a noun tag.

    Parameters
    ----------
    ngram : sequence of str
        The words of the n-gram; only the first two tags are inspected.

    Returns
    -------
    bool
    """
    # NOTE: words are tested exactly as they appear in the n-gram; no ASCII
    # stripping is applied before the stop word and length checks
    if any(word in sw_all or len(word) <= 2 for word in ngram):
        return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in acceptable_types and tags[1][1] in second_type

#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(rightTypes)]

In [None]:
def rightTypesTri(ngram):
    """Return True when the first and third word of a trigram are adjectives or nouns.

    The trigram is rejected as soon as one of its words is listed in `sw_all`
    or is at most two characters long. Otherwise the words are tagged with
    `nltk.pos_tag`; the tags of the first and of the third word must both be
    adjective or noun tags. The middle word's tag is not restricted.

    Parameters
    ----------
    ngram : sequence of str
        The three words of the trigram.

    Returns
    -------
    bool
    """
    # NOTE: words are tested exactly as they appear in the n-gram; no ASCII
    # stripping is applied before the stop word and length checks
    if any(word in sw_all or len(word) <= 2 for word in ngram):
        return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in first_type and tags[2][1] in third_type

#filter trigrams with the trigram rule defined above (first and third word are checked)
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(rightTypesTri)]

### Find meaningful bi- and tri-grams for specific search words

In [None]:
# search for words from this list or use another list
search_words = ['kasteel', 'castle', 'château', 'schloss']

In [None]:
# for every search term, keep the rows of filtered_bi whose bigram
# (converted to its string form) contains that term
search_dict = {
    term: filtered_bi[filtered_bi['bigram'].astype(str).str.contains(term)]
    for term in search_words
}

In [None]:
search_dict

In [None]:
# Side-by-side table of bigrams: one column per search term, holding the
# (up to) 50 most frequent bigrams found for that term
collocations_list = [
    df.sort_values(by='freq', ascending=False).head(50)['bigram'].tolist()
    for term, df in search_dict.items()
]

# pad the shorter lists with empty strings so that all columns have the same length
max_len = max(map(len, collocations_list))
for lst in collocations_list:
  lst.extend([''] * (max_len - len(lst)))

# one row per term, then transpose so that every term becomes a column
new_df = pd.DataFrame(collocations_list, index=search_words).T
# Rename the columns to match the search terms
new_df.columns = search_words
new_df

In [None]:
# for every search term, keep the rows of filtered_tri whose trigram
# (converted to its string form) contains that term
search_tri_dict = {
    term: filtered_tri[filtered_tri['trigram'].astype(str).str.contains(term)]
    for term in search_words
}

In [None]:
# Side-by-side table of trigrams: one column per search term, holding the
# (up to) 50 most frequent trigrams found for that term
collocations_list = [
    df.sort_values(by='freq', ascending=False).head(50)['trigram'].tolist()
    for term, df in search_tri_dict.items()
]

# pad the shorter lists with empty strings so that all columns have the same length
max_len = max(map(len, collocations_list))
for lst in collocations_list:
  lst.extend([''] * (max_len - len(lst)))

# one row per term, then transpose so that every term becomes a column
new_df = pd.DataFrame(collocations_list, index=search_words).T
# Rename the columns to match the search terms
new_df.columns = search_words
new_df

In [None]:
search_tri_dict

In [None]:
# suggested search words (in Dutch) for further exploration
general_words = ['architectuur', 'collectie', 'geschiedenis', 'tuin', 'onderzoek', 'stijl']
family_words = ['kinder', 'spel', 'familie', 'koffie', 'lunch', 'kinderfeestjes', 'huwelijk', 'bruid', 'bruidegom', 'high tea']
epoch_words = ['eeuw', '12de', '13de', '14de', '15de', '16de', 'Barroke', 'Renaissance', 'Romantiek', 'Verlichting', 'Rococo', 'Middeleeuwen', 'schatkamer', 'Floris', 'droom','hofdame']