# Cultural Data Analysis

Introduction to working with datasets

In [31]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import gensim, nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import spacy
import string

## Loading the dataset: heritage homes webistes

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Country code: change here between 'NL' and 'UK'
cc = 'UK'

In [5]:
raw_data_file = '/content/gdrive/MyDrive/CDA/'+cc+'_dataset_website-content-crawler.json'

In [6]:
# Import json data from Aipfy scraping
df=pd.read_json(raw_data_file)

# Print the DataFrame
df.head()

Unnamed: 0,url,crawl,metadata,screenshotUrl,text,markdown,debug
0,https://www.whittingtoncastle.co.uk/visit-us,{'loadedUrl': 'https://www.whittingtoncastle.c...,{'canonicalUrl': 'https://www.whittingtoncastl...,,Visit Us — Whittington Castle\nOn the Welsh bo...,# Visit Us — Whittington Castle\n\nOn the Wels...,{'requestHandlerMode': 'browser'}
1,https://www.whittingtoncastle.co.uk/kitchenthe...,{'loadedUrl': 'https://www.whittingtoncastle.c...,{'canonicalUrl': 'https://www.whittingtoncastl...,,Kitchen@theCastle — Whittington Castle\nAt Whi...,# Kitchen@theCastle — Whittington Castle\n\n##...,{'requestHandlerMode': 'browser'}
2,https://www.whittingtoncastle.co.uk/home,{'loadedUrl': 'https://www.whittingtoncastle.c...,{'canonicalUrl': 'https://www.whittingtoncastl...,,Whittington Castle\nA stunning 12th century ca...,# Whittington Castle\n\n![](https://images.squ...,{'requestHandlerMode': 'browser'}
3,https://www.whittingtoncastle.co.uk/weddings-w...,{'loadedUrl': 'https://www.whittingtoncastle.c...,{'canonicalUrl': 'https://www.whittingtoncastl...,,Get in Touch — Whittington CastlereCAPTCHA\nWe...,# Get in Touch — Whittington CastlereCAPTCHA\n...,{'requestHandlerMode': 'browser'}
4,https://www.whittingtoncastle.co.uk/events,{'loadedUrl': 'https://www.whittingtoncastle.c...,{'canonicalUrl': 'https://www.whittingtoncastl...,,Events — Whittington Castle\nUpcoming events\n...,# Events — Whittington Castle\n\nUpcoming even...,{'requestHandlerMode': 'browser'}


In [7]:
# select only two columns for analysis: url and text
df=df[['url','text']]
df.head()

Unnamed: 0,url,text
0,https://www.whittingtoncastle.co.uk/visit-us,Visit Us — Whittington Castle\nOn the Welsh bo...
1,https://www.whittingtoncastle.co.uk/kitchenthe...,Kitchen@theCastle — Whittington Castle\nAt Whi...
2,https://www.whittingtoncastle.co.uk/home,Whittington Castle\nA stunning 12th century ca...
3,https://www.whittingtoncastle.co.uk/weddings-w...,Get in Touch — Whittington CastlereCAPTCHA\nWe...
4,https://www.whittingtoncastle.co.uk/events,Events — Whittington Castle\nUpcoming events\n...


Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [21]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    if not isinstance(str(url), str):
        print('NOT VALID',url)
        return None
    match = re.findall('(?:\w+\.)*\w+\.\w*', str(url)) #'www\.?([^/]+)'
    return match[0].lstrip('www.') if match else None

In [22]:
# Load the list of domains from a csv file:
cc_column = cc+' domains'
#print(cc_column)

urls = pd.read_csv('/content/gdrive/MyDrive/CDA/'+cc+'_urls.csv')[cc_column].values.tolist()

# Extract main domains from nl_urls
domains = {extract_main_domain(url) for url in urls if extract_main_domain(url) is not None}

# Check if main domains in list_of_links match any domain in nl_domains
matching_links = [link for link in df.url if extract_main_domain(link) in domains]

In [23]:
# this cell can be skipped, it is only for verification

# check how many lines in the dataframe have a matching link to the list of urls
print(len(matching_links))

6794


In [24]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column
df['domain'] = df['url'].apply(extract_main_domain)
df.head()

Unnamed: 0,url,text,domain
0,https://www.whittingtoncastle.co.uk/visit-us,Visit Us — Whittington Castle\nOn the Welsh bo...,hittingtoncastle.co.uk
1,https://www.whittingtoncastle.co.uk/kitchenthe...,Kitchen@theCastle — Whittington Castle\nAt Whi...,hittingtoncastle.co.uk
2,https://www.whittingtoncastle.co.uk/home,Whittington Castle\nA stunning 12th century ca...,hittingtoncastle.co.uk
3,https://www.whittingtoncastle.co.uk/weddings-w...,Get in Touch — Whittington CastlereCAPTCHA\nWe...,hittingtoncastle.co.uk
4,https://www.whittingtoncastle.co.uk/events,Events — Whittington Castle\nUpcoming events\n...,hittingtoncastle.co.uk


## Working with text

In [None]:
stopwords_file = 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/'+cc+'.txt'

In [25]:
!wget https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/english.txt

--2024-11-25 21:55:28--  https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/english.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7678 (7.5K) [text/plain]
Saving to: ‘english.txt’


2024-11-25 21:55:28 (22.0 MB/s) - ‘english.txt’ saved [7678/7678]



In [27]:
# load a list of 'stopwords' in the language you are analyzing
def get_stopwords_list(stop_file_path):
    """load stop words """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))
stopwords_path = cc+".txt"
stopwords = get_stopwords_list(stopwords_path)

In [28]:
special_stop_words = ['nbsp', 'the', 'and']
stopwords_ext = stopwords+special_stop_words

In [29]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [33]:
#load spacy
nlp = spacy.load('en_core_web_sm')

In [34]:
#function to clean and lemmatize comments
def clean_documents(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [35]:
#apply function to clean and lemmatize comments
lemmatized = df.text.map(clean_documents)
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])
lemmatized.head()

Unnamed: 0,text
0,"[visit, us, —, whittington, castle, on, the, w..."
1,"[kitchen, thecastle, —, whittington, castle, a..."
2,"[whittington, castle, a, stunning, 12th, centu..."
3,"[get, in, touch, —, whittington, castlerecaptc..."
4,"[event, —, whittington, castle, upcoming, even..."


## Collocations

In [None]:
# SCI-KIT method, produces lists of co-occurencies for specific terms
def vectorize_text(df):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['text'])
    return X, vectorizer

def find_collocations(text, target_words):
    words = text.split()
    collocations = []
    for i in range(len(words) - 1):
        if words[i] in target_words:
            collocations.append((words[i], words[i + 1]))
        if words[i + 1] in target_words:
            collocations.append((words[i + 1], words[i]))
    return collocations

def get_frequent_collocations(df, most_frequent_words):
    collocations = []
    for text in df['text']:
        collocations.extend(find_collocations(text, most_frequent_words))
    collocation_counts = Counter(collocations)
    frequent_collocations = {}
    for word in most_frequent_words:
        word_collocations = {collocation: count for collocation, count in collocation_counts.items() if word in collocation}
        frequent_collocations[word] = dict(islice(Counter(word_collocations).most_common(20), 20))
    return frequent_collocations

def analyze_word_collocations(df):
    X, vectorizer = vectorize_text(df)
    most_frequent_words = search_words
    frequent_collocations = get_frequent_collocations(df, most_frequent_words)
    return frequent_collocations

In [None]:
# search for words from this list or use another list
search_words = ['architectuur', 'collectie', 'geschiedenis', 'tuin', 'onderzoek', 'stijl']
family_words = ['kinder', 'spel', 'familie', 'koffie', 'lunch', 'kinderfeestjes', 'huwelijk', 'bruid', 'bruidegom', 'high tea']
epoch_words = ['eeuw', '12de', '13de', '14de', '15de', '16de', 'Barroke', 'Renaissance', 'Romantiek', 'Verlichting', 'Rococo', 'Middeleeuwen', 'schatkamer', 'Floris', 'droom','hofdame']

In [None]:
collocations = analyze_word_collocations(df_urled, family_words)

In [None]:
data = []
for word, colloc_dict in collocations24.items():
   for collocation, count in colloc_dict.items():
       #collocation_str = ' '.join(collocation)  # Join collocation words into a single string
       data.append([word, collocation[1], count])
collocations24_df = pd.DataFrame(data, columns=['Word', 'Collocation', 'Count'])
print(collocations24_df.to_markdown(index=True))

## Word2Vec model

In [None]:
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize

# X is a list of tokenized texts (i.e. list of lists of tokens)
X = [word_tokenize(item) for item in df.text.tolist()]
#print(X[0:3])
model = gensim.models.Word2Vec(X, min_count=6, vector_size=200) # min_count: how many times a word appears in the corpus; size: number of dimensions

In [None]:
model.wv.most_similar(positive=["kasteel"], topn=12)

## TF-IDF vectorization

- What is TF/IDF (term frequency / inverse document frequency)? https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords_ext)
# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(df['text'])
# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Add filenames as index
tfidf_df.index = df['domain']
# Print the TF-IDF DataFrame
tfidf_df.head()

In [None]:
# Add column for document frequency aka number of times word appears in all documents
tfidf_df.loc['ALL'] = (tfidf_df > 0).sum()

In [None]:
tfidf_df.head() # first five rows

In [None]:
# 10 most frequent words!

tfidf_slice = tfidf_df[term_freq_df_stopped.sort_values(by='terms', ascending=False).iloc[:10].index.tolist()]
tfidf_slice.sort_index().round(decimals=2).head() # first five rows

In [None]:
# reorganize the DataFrame so that the words are in rows rather than columns
tfidf_df = tfidf_df.drop('ALL', errors='ignore')
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df.head()

In [None]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'domain': 'document','level_1': 'term'})
tfidf_df.head()

In [None]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head

In [None]:
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

# Terms in this list will get a red dot in the visualization
term_list = ['kasteel', 'huis']

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)