# New Narratives for old Buildings

---



Finding narratives through comparison of term frequency across countries

In [3]:
import os, re, csv
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import spacy
import string
import pickle
import matplotlib.pyplot as plt

## Loading the dataset: heritage homes webistes


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
path = '/content/gdrive/MyDrive/CDA/' # working with google drive and colab

# path = 'datasets/' # working locally

In [6]:
# Country code: change here between 'NL' and 'UK'
cc_list = ['NL', 'UK', 'DE', 'FR']

In [7]:
# Import json data from Aipfy scraping into 4 separate dataframes
df0=pd.read_json(path+cc_list[0]+'_dataset_website-content-crawler.json') # NL
# select only two columns for analysis: url and text
df0=df0[['url','text']]

df1=pd.read_json(path+cc_list[1]+'_dataset_website-content-crawler.json') # UK
# select only two columns for analysis: url and text
df1=df1[['url','text']]

df2=pd.read_json(path+cc_list[2]+'_dataset_website-content-crawler.json') # DE
# select only two columns for analysis: url and text
df2=df2[['url','text']]

df3=pd.read_json(path+cc_list[3]+'_dataset_website-content-crawler.json') # FR
# select only two columns for analysis: url and text
df3=df3[['url','text']]

df0.head()

Unnamed: 0,url,text
0,http://weldam.nl/,"Introduction - Weldam\nIntroduction\nWeldam, s..."
1,http://weldam.nl/nederlands.html,Nederlands - Weldam\nCopyright Landgoed Weldam...
2,http://weldam.nl/nederlands/beginpagina/test-2...,Test 1.2 - Weldam\nCopyright Landgoed Weldam 2...
3,https://www.huisdoorn.nl/,Ontdek de geschiedenis - Museum Huis Doorn\nDe...
4,https://www.museumdefundatie.nl/,Museum de FundatieTwitter Widget Iframe\nMuseu...


In [11]:
# check if there are further datasets to add per country

!ls "$path" | grep "^UK_"

UK_dataset_website-content-crawler.json
UK_EH_dataset_website-content-crawler_2025-03-18_13-49-50-417.json
UK_PC_dataset_website-content-crawler_2025-03-11_12-28-08-810.json


In [10]:
# manually append additional datasets (based on previous step)
df_missing1 = pd.read_json(path+'/NL_LG_dataset_website-content-crawler_2025-02-06_09-40-33-880.json')
result = pd.concat([df0, df_missing1])

In [12]:
df_missing1 = pd.read_json(path+'/UK_EH_dataset_website-content-crawler_2025-03-18_13-49-50-417.json')
df_missing2 = pd.read_json(path+'/UK_PC_dataset_website-content-crawler_2025-03-11_12-28-08-810.json')
result = pd.concat([df1, df_missing1, df_missing2])

Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [13]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    if not isinstance(str(url), str):
        print('NOT VALID',url)
        return None
    match = re.findall('(?m)http(?:s?):\/\/.*?([^\.\/]+?\.[^\.]+?)(?:\/|$)', str(url)) #'www\.?([^/]+)'
    return match[0].lstrip('www.') if match else None

In [14]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column

# first, create a mapping of dataframes which could be addressed in a loop
df_dict = {'0':df0, '1':df1, '2':df2, '3':df3}

# then, loop through the df_dict to update each dataframe
for k, v in df_dict.items():
  cc_column = cc_list[int(k[-1])]+' domains'
  cc = cc_list[int(k[-1])]
  # print(cc_column, cc)
  urls = pd.read_csv(path+'url_lists/'+cc_list[int(k[-1])]+'_urls.csv')[cc_column].values.tolist()
  domains = {extract_main_domain(url) for url in urls if extract_main_domain(url) is not None}
  matching_links = [link for link in v.url if extract_main_domain(link) in domains]
  # update the dataframe
  v['domain'] = v['url'].apply(extract_main_domain)

In [16]:
# check one of the dataframes
df0.head()

Unnamed: 0,url,text,domain
0,http://weldam.nl/,"Introduction - Weldam\nIntroduction\nWeldam, s...",eldam.nl
1,http://weldam.nl/nederlands.html,Nederlands - Weldam\nCopyright Landgoed Weldam...,eldam.nl
2,http://weldam.nl/nederlands/beginpagina/test-2...,Test 1.2 - Weldam\nCopyright Landgoed Weldam 2...,eldam.nl
3,https://www.huisdoorn.nl/,Ontdek de geschiedenis - Museum Huis Doorn\nDe...,huisdoorn.nl
4,https://www.museumdefundatie.nl/,Museum de FundatieTwitter Widget Iframe\nMuseu...,museumdefundatie.nl


## Compare term frequencies across corpora

In [17]:
# list terms to compare
activities = ['kind','spel','avontuur',
              'child','game','adventure',
              'kind', 'spiel', 'abenteuer',
              'enfant', 'jeu', 'aventure']
events = ['feest','bruiloft','bruid','bruidegom',
          'party','wedding','bride','groom',
          'party', 'hochzeit', 'braut', 'bräutigam',
          'fête', 'mariage', 'mariée', 'marié']
nobility = ['graaf','baron','hertog',
            'earl','baron','duke',
            'graf','baron','herzog',
            'compte','baron','duc']
culture = ['tentoonstelling','museum','tuin','park','tour','collectie',
           'exhibition','museum','garden','park','tour','collection',
           'ausstellung','museum','garten','park','tour','sammlung',
           'exposition','musée','jardin','parc','tour','collection']
styles = ['rococo', 'barok', 'renaissance', 'gotisch',
          'rococo', 'baroque', 'renaissance', 'gothic',
          'rokoko', 'barock', 'renaissance', 'gotik',
          'rococo', 'baroc', 'renaissance', 'gothique']

In [18]:
# count term frequencies across four dataframes
list_to_count = culture # replace with the name of a list of terms (above)
dataframes = [df0, df1, df2, df3] # list of dataframes
number_of_terms = int(len(list_to_count) / 4) # calculate the number of terms in 1 language

term_count_dict = {}
for i, df in enumerate(dataframes):
  term_count_values = []
  for term in list_to_count[:number_of_terms]:
    term_count_column = df['text'].apply(lambda x: x.lower().count(term) if isinstance(x, str) else 0)
    #print(int(term_count_column.sum()))
    term_count_values.append(int(term_count_column.sum()))
  term_count_dict[cc_list[i]] = term_count_values

term_count_dict

{'NL': [2407, 10616, 7303, 5752, 1497, 2868],
 'UK': [0, 3061, 0, 4488, 3032, 0],
 'DE': [52, 10618, 53, 6424, 2901, 49],
 'FR': [2, 785, 177, 2290, 15926, 5]}

In [1]:
# Visualize the term count as a stacked bar chart per country
theme = 'art and culture'
terms = list_to_count[number_of_terms:number_of_terms*2] # change which list is visualized
# Create a list of countries (y-axis labels)
countries = list(term_count_dict.keys())

# Create the stacked bar chart
width = 0.45  # Width of the bars

fig, ax = plt.subplots(figsize=(10,8))

bottom = np.zeros(len(countries))
colors = ['#92E0E0', 'red', '#C1E000', '#C192E0','#E0C200','#E05543']

# Add horizontal lines for values in the legend
for value in [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]:
    #[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    #[250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250]
    #[100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
    #[100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
  plt.axhline(y=value, color='grey', linestyle='--', linewidth=0.5)

for term_index in range(len(terms)):
    term_counts = [term_count_dict[country][term_index] for country in countries]
    ax.bar(countries, term_counts, width, label=terms[term_index], bottom=bottom,
           color=colors[term_index % len(colors)])
    bottom += np.array(term_counts)

ax.set_xlabel('Countries', color='white')
ax.set_ylabel('Term Counts', color='white')
ax.set_title('Terms related to '+theme+' per country', color='white')
ax.spines['bottom'].set_color('white')
ax.spines['top'].set_color('white')
ax.spines['left'].set_color('white')
ax.spines['right'].set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')
ax.legend()

plt.tight_layout()
plt.savefig(path+'outputs/2024_'+theme+'.png', transparent=True)
plt.show()

NameError: name 'list_to_count' is not defined

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer

# count term frequencies across four dataframes
list_to_count = events # replace with the name of a list of terms (above)
dataframes = [df0, df1, df2, df3] # list of dataframes
number_of_terms = int(len(list_to_count) / 4) # calculate the number of terms in 1 language

# Combine text from all dataframes into a single list
corpus = []
for df in dataframes:
  corpus.extend(df['text'].dropna().tolist())

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Calculate TF-IDF scores for each term in each document
tfidf_scores = tfidf_matrix.toarray()

# Create a dictionary to store TF-IDF scores for each term and country
term_tfidf_dict = {}
for i, df in enumerate(dataframes):
    term_tfidf_values = []
    for term in list_to_count[:number_of_terms]:
        if term in feature_names:
            term_index = np.where(feature_names == term)[0][0]
            term_tfidf = np.sum(tfidf_scores[df.index.tolist(), term_index]) # Sum TF-IDF scores for the term across documents in the dataframe
            term_tfidf_values.append(term_tfidf)
        else:
            term_tfidf_values.append(0)
    term_tfidf_dict[cc_list[i]] = term_tfidf_values

term_tfidf_dict
