# Cultural Data Analysis

Introduction to working with datasets

In [1]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import gensim, nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import string
import pickle

## Loading the dataset: heritage homes webistes

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Country code: change here between 'NL' and 'UK'
cc_list = ['NL', 'UK', 'DE', 'FR']

In [4]:
gdrive_path = '/content/gdrive/MyDrive/CDA/'

In [22]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    if not isinstance(str(url), str):
        print('NOT VALID',url)
        return None
    match = re.findall('(?:\w+\.)*\w+\.\w*', str(url)) #'www\.?([^/]+)'
    return match[0].lstrip('www.') if match else None

In [25]:
# Import json data from Aipfy scraping into 4 separate dataframes
df0=pd.read_json(gdrive_path+cc_list[0]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df0=df0[['url','text']]

df1=pd.read_json(gdrive_path+cc_list[1]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df1=df1[['url','text']]

df2=pd.read_json(gdrive_path+cc_list[2]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df2=df2[['url','text']]

df3=pd.read_json(gdrive_path+cc_list[3]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df3=df3[['url','text']]

df0.head()

Unnamed: 0,url,text
0,http://weldam.nl/,"Introduction - Weldam\nIntroduction\nWeldam, s..."
1,http://weldam.nl/nederlands.html,Nederlands - Weldam\nCopyright Landgoed Weldam...
2,http://weldam.nl/nederlands/beginpagina/test-2...,Test 1.2 - Weldam\nCopyright Landgoed Weldam 2...
3,https://www.huisdoorn.nl/,Ontdek de geschiedenis - Museum Huis Doorn\nDe...
4,https://www.museumdefundatie.nl/,Museum de FundatieTwitter Widget Iframe\nMuseu...


Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [42]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column

# first, create a mapping of dataframes which could be addressed in a loop
df_dict = {'0':df0, '1':df1, '2':df2, '3':df3}

# then, loop through the df_dict to update each dataframe
for k, v in df_dict.items():
  cc_column = cc_list[int(k[-1])]+' domains'
  cc = cc_list[int(k[-1])]
  # print(cc_column, cc)
  urls = pd.read_csv(gdrive_path+cc_list[int(k[-1])]+'_urls.csv')[cc_column].values.tolist()
  domains = {extract_main_domain(url) for url in urls if extract_main_domain(url) is not None}
  matching_links = [link for link in v.url if extract_main_domain(link) in domains]
  # update the dataframe
  v['domain'] = v['url'].apply(extract_main_domain)

In [43]:
# check one of the dataframes
df1.head()

Unnamed: 0,url,text,domain
0,https://www.whittingtoncastle.co.uk/visit-us,Visit Us — Whittington Castle\nOn the Welsh bo...,hittingtoncastle.co.uk
1,https://www.whittingtoncastle.co.uk/kitchenthe...,Kitchen@theCastle — Whittington Castle\nAt Whi...,hittingtoncastle.co.uk
2,https://www.whittingtoncastle.co.uk/home,Whittington Castle\nA stunning 12th century ca...,hittingtoncastle.co.uk
3,https://www.whittingtoncastle.co.uk/weddings-w...,Get in Touch — Whittington CastlereCAPTCHA\nWe...,hittingtoncastle.co.uk
4,https://www.whittingtoncastle.co.uk/events,Events — Whittington Castle\nUpcoming events\n...,hittingtoncastle.co.uk


## Compare collocations across corpora

#### Preparing the text (stopwords, lemmatization, etc)

In [44]:
# make all stopword files stored in github available in this notebook:
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/DE.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/FR.txt'

--2024-12-08 11:34:38--  https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 453 [text/plain]
Saving to: ‘NL.txt’


2024-12-08 11:34:38 (7.06 MB/s) - ‘NL.txt’ saved [453/453]

--2024-12-08 11:34:38--  https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7678 (7.5K) [text/plain]
Saving to: ‘UK.txt’


2024-12-08 11:34:39 (37.4 MB/s) - 

In [49]:
# load a list of 'stopwords' in the language you are analyzing
def get_stopwords_list(stop_file_path):
    """load stop words """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))

sw_dict = {}
for i, cc in enumerate(cc_list):
  stopwords_path = cc + ".txt"
  sw_dict[str(i)] = get_stopwords_list(stopwords_path)

sw0 = sw_dict['0']
sw1 = sw_dict['1']
sw2 = sw_dict['2']
sw3 = sw_dict['3']

In [50]:
# extend the stopwords list with any other words you want to exclude from analysis
special_stop_words = ['nbsp', ' ', '', '—', '\’s', 'ii', 'iii', 'iiii']

In [51]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [52]:
# search for words from this list or use another list
search_words = ['kasteel', 'castle', 'château', 'Schloss']

In [53]:
# SCI-KIT method, produces lists of co-occurencies for specific terms
def vectorize_text(df):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['text'])
    return X, vectorizer

def find_collocations(text, target_words):
    words = text.split()
    collocations = []
    for i in range(len(words) - 1):
        if words[i] in target_words:
            collocations.append((words[i], words[i + 1]))
        if words[i + 1] in target_words:
            collocations.append((words[i + 1], words[i]))
    return collocations

def get_frequent_collocations(df, most_frequent_words):
    collocations = []
    for text in df['text']:
        collocations.extend(find_collocations(text, most_frequent_words))
    collocation_counts = Counter(collocations)
    frequent_collocations = {}
    for word in most_frequent_words:
        word_collocations = {collocation: count for collocation, count in collocation_counts.items() if word in collocation}
        frequent_collocations[word] = dict(islice(Counter(word_collocations).most_common(20), 20))
    return frequent_collocations

def analyze_word_collocations(df):
    X, vectorizer = vectorize_text(df)
    most_frequent_words = search_words
    frequent_collocations = get_frequent_collocations(df, most_frequent_words)
    return frequent_collocations

In [54]:
for k,v in df_dict.items():
  collocations = analyze_word_collocations(v)

In [55]:
data = []
for word, colloc_dict in collocations.items():
   for collocation, count in colloc_dict.items():
       #collocation_str = ' '.join(collocation)  # Join collocation words into a single string
       data.append([word, collocation[1], count])

collocations_df = pd.DataFrame(data, columns=['Word', 'Collocation', 'Count'])
print(collocations_df.to_markdown(index=True))

|    | Word    | Collocation       |   Count |
|---:|:--------|:------------------|--------:|
|  0 | kasteel | het               |     325 |
|  1 | kasteel | en                |      40 |
|  2 | kasteel | in                |      19 |
|  3 | kasteel | Het               |      17 |
|  4 | kasteel | De                |      16 |
|  5 | kasteel | is                |      13 |
|  6 | kasteel | ons               |      11 |
|  7 | kasteel | met               |       9 |
|  8 | kasteel | een               |       9 |
|  9 | kasteel | verlaten          |       7 |
| 10 | kasteel | te                |       7 |
| 11 | kasteel | niet              |       7 |
| 12 | kasteel | op                |       6 |
| 13 | kasteel | van               |       5 |
| 14 | kasteel | de                |       4 |
| 15 | kasteel | staan.            |       4 |
| 16 | kasteel | maar              |       4 |
| 17 | kasteel | zijn              |       4 |
| 18 | kasteel | waar              |       4 |
| 19 | kastee

## Topic Modeling

Use [Latent-dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) to extract topics from the text based on a statistical model

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

NUM_TOPICS = 10

# data to work with: list of tweets
documents = df['text'].tolist()

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=stopwords_ext, lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
documents_vectorized = vectorizer.fit_transform(documents)
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(documents_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
(393104, 10)
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

print("LDA Model:")
print_topics(lda_model, vectorizer)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = df['text'].tolist()

vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words=stopwords_ext) # Modify stopwords if needed
dtm = vectorizer.fit_transform(documents)

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda.fit(dtm)


no_top_words = 10
no_top_documents = 10

display_topics(lda.components_, lda.transform(dtm), vectorizer.get_feature_names_out(), documents, no_top_words, no_top_documents)

#### Visualize topics

In [None]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.io import output_notebook
output_notebook()

from sklearn.decomposition import TruncatedSVD

documents_vectorized = vectorizer.fit_transform(documents)
# plotting documents in 2D
svd = TruncatedSVD(n_components=20)
documents_2d = svd.fit_transform(documents_vectorized)

bok_df = pd.DataFrame(columns=['x', 'y', 'document'])
bok_df['x'], bok_df['y'], bok_df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(documents))

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot=figure(min_width=1000, height=600)
#plot = figure(plot_width=1000, plot_height=600)
plot.scatter("x", "y", size=12, source=source, line_color="black", fill_alpha=0.4)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [None]:
!pip install pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, documents_vectorized, vectorizer, mds='tsne')
vis

## Vectorizing the corpus

## Word2Vec model, explore and visualize

In [None]:
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize

# X is a list of tokenized texts (i.e. list of lists of tokens)
X = [word_tokenize(item) for item in df.text.tolist()]
#print(X[0:3])
model = gensim.models.Word2Vec(X, min_count=6, vector_size=200) # min_count: how many times a word appears in the corpus; size: number of dimensions

In [None]:
model.wv.most_similar(positive=["schloss"], topn=12)

### Visualize keywords with t-SNE

Choose keywords that correspond to your analysis and visualize how they and their closest terms are distributed in the discourse.
Use t-SNE to visualize the relations.

* [t-Distributed Stochastic Neighbor Embedding (t-SNE)](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) is a technique for dimensionality reduction that is particularly well suited for the visualization of high-dimensional datasets.


In [None]:
keys_role = ['Man', 'Frau']

embedding_clusters = []
word_clusters = []
for word in keys_grave:
    embeddings = []
    words = []
    for similar_word, _ in model.wv.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
from sklearn.manifold import TSNE

embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')
    plt.show()


tsne_plot_similar_words('Similar words from '+cc+' heritage houses website', keys_role, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

In [None]:
keys_status = ['Arbeit', 'Eigentum', 'Zimmer', 'Garten']

# calculate embeddings, using topn number of similar words (change the value to include more or less words)
embedding_clusters = []
word_clusters = []
for word in keys_status:
    embeddings = []
    words = []
    for similar_word, _ in model.wv.most_similar(word, topn=20):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

# plot a view with new keywords:
tsne_plot_similar_words('Similar words from '+cc+' heritage houses website', keys_status, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

Calculate basic frequency

## 4. Analyse specific collocations

In [None]:
# search for words from this list or use another list
search_words = ['royal']

In [None]:
# SCI-KIT method, produces lists of co-occurencies for specific terms
def vectorize_text(df):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['text'])
    return X, vectorizer

def find_collocations(text, target_words):
    words = text.split()
    collocations = []
    for i in range(len(words) - 1):
        if words[i] in target_words:
            collocations.append((words[i], words[i + 1]))
        if words[i + 1] in target_words:
            collocations.append((words[i + 1], words[i]))
    return collocations

def get_frequent_collocations(df, most_frequent_words):
    collocations = []
    for text in df['text']:
        collocations.extend(find_collocations(text, most_frequent_words))
    collocation_counts = Counter(collocations)
    frequent_collocations = {}
    for word in most_frequent_words:
        word_collocations = {collocation: count for collocation, count in collocation_counts.items() if word in collocation}
        frequent_collocations[word] = dict(islice(Counter(word_collocations).most_common(20), 20))
    return frequent_collocations

def analyze_word_collocations(df):
    X, vectorizer = vectorize_text(df)
    most_frequent_words = search_words
    frequent_collocations = get_frequent_collocations(df, most_frequent_words)
    return frequent_collocations

In [None]:
collocations = analyze_word_collocations(df)

In [None]:
data = []
for word, colloc_dict in collocations.items():
   for collocation, count in colloc_dict.items():
       #collocation_str = ' '.join(collocation)  # Join collocation words into a single string
       data.append([word, collocation[1], count])
collocations_df = pd.DataFrame(data, columns=['Word', 'Collocation', 'Count'])
print(collocations_df.to_markdown(index=True))

In [None]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

vec_king = wv['king']