# Cultural Data Analysis

Introduction to working with datasets

In [7]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import gensim, nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import spacy
import string
import pickle

## Loading the dataset: heritage homes webistes

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Country code: change here between 'NL' and 'UK'
cc = 'NL'

In [3]:
gdrive_path = '/content/gdrive/MyDrive/CDA/'

In [4]:
raw_data_file = gdrive_path+cc+'_dataset_website-content-crawler.json'

In [8]:
# Import json data from Aipfy scraping
df=pd.read_json(raw_data_file)
# select only two columns for analysis: url and text
df=df[['url','text']]
# Print the DataFrame
df.head()

Unnamed: 0,url,text
0,https://www.ledonjondehoudan.fr/,Donjon de Houdan\nLe Donjon\nProgrammation\nPr...
1,https://www.forteresse-mornas.fr/,Forteresse de Mornas - Vivez le Moyen Âge à Mo...
2,https://www.chateaudelarocheguyon.fr/,Château de La Roche-Guyon – Histoire et Créati...
3,https://www.chateau-coucy.fr/,Bienvenue au Domaine national du château de Co...
4,https://www.chateau-maisons.fr/,Bienvenue au château de Maisons\nLe vestibule ...


Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [9]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    if not isinstance(str(url), str):
        print('NOT VALID',url)
        return None
    match = re.findall('(?:\w+\.)*\w+\.\w*', str(url)) #'www\.?([^/]+)'
    return match[0].lstrip('www.') if match else None

In [10]:
# Load the list of domains from a csv file:
cc_column = cc+' domains'
#print(cc_column)

urls = pd.read_csv(gdrive_path+cc+'_urls.csv')[cc_column].values.tolist()

# Extract main domains from nl_urls
domains = {extract_main_domain(url) for url in urls if extract_main_domain(url) is not None}

# Check if main domains in list_of_links match any domain in nl_domains
matching_links = [link for link in df.url if extract_main_domain(link) in domains]

In [11]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column
df['domain'] = df['url'].apply(extract_main_domain)
df.head()

Unnamed: 0,url,text,domain
0,https://www.ledonjondehoudan.fr/,Donjon de Houdan\nLe Donjon\nProgrammation\nPr...,ledonjondehoudan.fr
1,https://www.forteresse-mornas.fr/,Forteresse de Mornas - Vivez le Moyen Âge à Mo...,forteresse
2,https://www.chateaudelarocheguyon.fr/,Château de La Roche-Guyon – Histoire et Créati...,chateaudelarocheguyon.fr
3,https://www.chateau-coucy.fr/,Bienvenue au Domaine national du château de Co...,chateau
4,https://www.chateau-maisons.fr/,Bienvenue au château de Maisons\nLe vestibule ...,chateau


In [33]:
# prompt: give me the frequency of the term 'baron' in the df, in the column text

# Convert the 'text' column to lowercase
df['text'] = df['text'].str.lower()

# Count the occurrences of 'baron' in the 'text' column / check 'earl' in UK
baron_counts = df['text'].str.count('etienne').sum()

print(f"The term 'apple' appears {baron_counts} times in the 'text' column.")

The term 'apple' appears 90 times in the 'text' column.


## Understand meaningful words collocations

#### Preparing the text (stopwords, lemmatization, etc)

In [12]:
# make all stopword files stored in github available in this notebook:
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/DE.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/FR.txt'

--2025-01-08 08:34:55--  https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 453 [text/plain]
Saving to: ‘NL.txt’


2025-01-08 08:34:55 (7.04 MB/s) - ‘NL.txt’ saved [453/453]

--2025-01-08 08:34:55--  https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7678 (7.5K) [text/plain]
Saving to: ‘UK.txt’


2025-01-08 08:34:56 (82.2 MB/s) - 

In [13]:
# load a list of 'stopwords' in the language you are analyzing
def get_stopwords_list(stop_file_path):
    """load stop words """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))
stopwords_path = cc+".txt"
stopwords = get_stopwords_list(stopwords_path)

In [14]:
# extend the stopwords list with any other words you want to exclude from analysis
special_stop_words = ['nbsp', 'www', ' ', '', '—', '\’s', 'ii', 'iii', 'iiii', 'l\’']
stopwords_ext = stopwords+special_stop_words

In [15]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

### Load lemmatized text data as document lists

You can see how to produce lemmatized text data in week 4:
- [4. HH Narratives](https://colab.research.google.com/github/jazoza/cultural-data-analysis/blob/main/04_CDA_HH_narratives.ipynb)

In [16]:
# load saved pickles
with open(gdrive_path+'jar/'+cc+'_lemmatized.pickle', 'rb') as handle_l:
    lemmatized = pickle.load(handle_l)

with open(gdrive_path+'jar/'+cc+'_unlist_documents.pickle', 'rb') as handle_u:
    unlist_documents = pickle.load(handle_u)

In [17]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [18]:
# initiate bigrams and trigrams
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [19]:
# identify all collocations in the flat list of words from all documents
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_documents)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_documents)

In [20]:
# compute basic bigram fequency
bigramFreqTable = pd.DataFrame(list(bigramFinder.ngram_fd.items()), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
# compute basic tri fequency
trigramFreqTable = pd.DataFrame(list(trigramFinder.ngram_fd.items()), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

### Find meaningful bi- and tri-grams by filtering adjectives and nouns based on an nltk functionality

In [22]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    for word in ngram:
        _removeNonAscii(word)
        if word in stopwords_ext:
            return False
        if len(word) <= 2:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [23]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
filtered_bi[:20]

Unnamed: 0,bigram,freq
408,"(geldersch, landschap)",1662
5669,"(kasteel, hoensbroek)",1522
4952,"(kasteeltuinen, arcen)",1275
183556,"(bezoek, site)",1263
2940,"(museum, ijsselstein)",919
88,"(huis, doorn)",897
183549,"(historisch, goud)",864
3623,"(huis, bergh)",830
33287,"(fundatie, collection)",788
6301,"(dekema, state)",665


### Search for bigrams containing specific words

In [28]:
# define the search term by changing the word 'château' to a word you want to explore
search_term = 'baron'
# Filter for bigrams containing the word 'visite'
search_bigrams = filtered_bi[filtered_bi['bigram'].apply(lambda x: search_term in x)]

# print 20 most frequent collocations of the searc_term
# if invalid, try changing the number 20 to a smaller value
search_bigrams.sort_index(ascending=False)[:20]

Unnamed: 0,bigram,freq
562874,"(baron, hélin)",1
562821,"(baron, afleggen)",1
562066,"(erbt, baron)",1
561710,"(baron, opgezetten)",1
561702,"(baron, extra)",1
561530,"(erft, baron)",1
560489,"(dier, baron)",1
558715,"(baron, verzorgden)",1
558453,"(baron, egmont)",1
556347,"(vertellen, baron)",1


In [26]:
def rightTypesTri(ngram):
    for word in ngram:
        _removeNonAscii(word)
        if word in stopwords_ext:
            return False
        if len(word) <= 2:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [27]:
#filter trigrams
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypes(x))]
filtered_tri[:20]

Unnamed: 0,trigram,freq
94,"(museum, huis, doorn)",573
13206,"(rondom, elk, kasteel)",547
469236,"(heino, wijhe, identificatie)",474
344936,"(site, streekarchief, rijckheyt)",421
344935,"(bezoek, site, streekarchief)",421
344924,"(bezoek, site, thermenmuseum)",421
344922,"(gemeente, heerlen, bezoek)",421
344925,"(site, thermenmuseum, heerlen)",421
344926,"(thermenmuseum, heerlen, romeins)",421
344914,"(historisch, goud, historisch)",421


In [30]:
# define the search term by changing the word 'château' to a word you want to explore
search_term = 'baron'
# Filter for bigrams containing the word 'visite'
search_trigrams = filtered_tri[filtered_tri['trigram'].apply(lambda x: search_term in x)]

# print 20 most frequent collocations of the searc_term
# if invalid, try changing the number 20 to a smaller value
search_trigrams.sort_index(ascending=False)[:20]

Unnamed: 0,trigram,freq
1206683,"(baron, willen, daarom)",1
1205147,"(erbt, baron, etienne)",1
1203543,"(erft, baron, etienne)",1
1200403,"(dier, baron, etienne)",1
1200402,"(welk, dier, baron)",1
1200385,"(baron, etienne, huren)",1
1199964,"(baron, thierry, wonen)",1
1198314,"(erft, etienne, baron)",1
1189463,"(vertellen, baron, etienne)",1
1189462,"(boek, vertellen, baron)",1


## Topic Modeling

Use [Latent-dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) to extract topics from the text based on a statistical model

In [34]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

NUM_TOPICS = 10

# data to work with: list of tweets
documents = df['text'].tolist()

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=stopwords_ext, lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
documents_vectorized = vectorizer.fit_transform(documents)
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(documents_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
(393104, 10)
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

print("LDA Model:")
print_topics(lda_model, vectorizer)

(11512, 10)
LDA Model:
Topic 0:
[('les', 11909.426115972314), ('teau', 8803.426618258178), ('des', 8566.977010420502), ('visite', 8497.836031343533), ('pour', 7596.8175937223195), ('vous', 5337.553310668156), ('sur', 5263.2605257052055), ('dans', 4367.887304731753), ('blois', 3548.575265675199), ('votre', 3443.4603728479237)]
Topic 1:
[('des', 4636.051361549724), ('site', 4457.380718281714), ('les', 4141.441402144803), ('vous', 4110.266485095968), ('sur', 3055.4503640880607), ('par', 2742.0470367910066), ('donn', 2529.5071641845566), ('pour', 2497.7089529118603), ('votre', 1990.5652443326908), ('que', 1872.2781123974107)]
Topic 2:
[('weer', 2024.900102262607), ('onze', 1242.8406342658636), ('heel', 1005.5922020893381), ('goed', 876.3772181205004), ('wel', 839.8407866270144), ('gaan', 735.1680578512915), ('hadden', 683.8621466109894), ('matt', 668.7441995229178), ('alle', 609.3456869624405), ('week', 608.3269625412266)]
Topic 3:
[('des', 14441.861788206792), ('les', 11657.464068573205),

In [35]:
from sklearn.decomposition import LatentDirichletAllocation
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = df['text'].tolist()

vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words=stopwords_ext) # Modify stopwords if needed
dtm = vectorizer.fit_transform(documents)

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda.fit(dtm)


no_top_words = 10
no_top_documents = 10

display_topics(lda.components_, lda.transform(dtm), vectorizer.get_feature_names_out(), documents, no_top_words, no_top_documents)

Topic 0:
et des yonne les département la le ou aide un
Topic 1:
la des vous et le site les du ou sur
Topic 2:
château du 2024 12 blois royal au visite 01 les
Topic 3:
the and to we for on by from weer with
Topic 4:
et la du les château le visite des pour au
Topic 5:
und von des monuments den zu gavray billets das sie
Topic 6:
la et le du des les musée corse au une
Topic 7:
la et le les des du un est une au
Topic 8:
château du haut 00 pédagogique koenigsbourg fr fiche et min
Topic 9:
les la le et 2024 château du au savoir des


#### Visualize topics

In [None]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.io import output_notebook
output_notebook()

from sklearn.decomposition import TruncatedSVD

documents_vectorized = vectorizer.fit_transform(documents)
# plotting documents in 2D
svd = TruncatedSVD(n_components=20)
documents_2d = svd.fit_transform(documents_vectorized)

bok_df = pd.DataFrame(columns=['x', 'y', 'document'])
bok_df['x'], bok_df['y'], bok_df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(documents))

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot=figure(min_width=1000, height=600)
#plot = figure(plot_width=1000, plot_height=600)
plot.scatter("x", "y", size=12, source=source, line_color="black", fill_alpha=0.4)
plot.add_layout(labels)
show(plot, notebook_handle=True)

## Vectorizing the corpus

## Word2Vec model, explore and visualize

In [36]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [37]:
from nltk.tokenize import word_tokenize

# X is a list of tokenized texts (i.e. list of lists of tokens)
X = [word_tokenize(item) for item in df.text.tolist()]
#print(X[0:3])
model = gensim.models.Word2Vec(X, min_count=6, vector_size=200) # min_count: how many times a word appears in the corpus; size: number of dimensions

In [39]:
model.wv.most_similar(positive=["tuin"], topn=12)

[('delen', 0.9513186812400818),
 ('gezet', 0.9510095119476318),
 ('gang', 0.9467923641204834),
 ('gaten', 0.9438691139221191),
 ('boiler', 0.9392896890640259),
 ('opnieuw', 0.9391862154006958),
 ('kachel', 0.9387052059173584),
 ('weide', 0.9386748671531677),
 ('electra', 0.9367073178291321),
 ('ruimte', 0.9352202415466309),
 ('kleur', 0.9352045059204102),
 ('vloer', 0.9343982934951782)]

### Visualize keywords with t-SNE

Choose keywords that correspond to your analysis and visualize how they and their closest terms are distributed in the discourse.
Use t-SNE to visualize the relations.

* [t-Distributed Stochastic Neighbor Embedding (t-SNE)](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) is a technique for dimensionality reduction that is particularly well suited for the visualization of high-dimensional datasets.


In [None]:
keys_role = ['Man', 'Frau']

embedding_clusters = []
word_clusters = []
for word in keys_role:
    embeddings = []
    words = []
    for similar_word, _ in model.wv.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
from sklearn.manifold import TSNE

embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')
    plt.show()


tsne_plot_similar_words('Similar words from '+cc+' heritage houses website', keys_role, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')