# Cultural Data Analysis

Introduction to working with datasets

In [None]:
# import necessary libraries
import os, re, csv
import pandas as pd
import numpy as np
import gensim, nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import islice
from nltk.corpus import stopwords
import spacy
import string
import pickle

In [None]:
import pickle

## Loading the dataset: heritage homes websites

The dataset is stored in a shared google drive:
https://drive.google.com/drive/folders/11Shm0edDOiWrOe56fzJQRZi-v_BPSW8E?usp=drive_link

Add it to your drive.

To access it, load your gdrive in 'Files' (see left pane of the notebook in google colab) and navigate to the shared folder. You may need to click on 'refresh' to make it appear on the list.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Country code: change here, e.g. 'NL', 'UK', 'DE' or 'FR' (it must match the dataset, url list and stopword file names)
cc = 'FR'

In [None]:
gdrive_path = '/content/gdrive/MyDrive/CDA/'

In [None]:
raw_data_file = gdrive_path+cc+'_dataset_website-content-crawler.json'

In [None]:
# Import json data from Apify scraping
df=pd.read_json(raw_data_file)
# select only two columns for analysis: url and text
df=df[['url','text']]
# Print the DataFrame
df.head()

Join all pages from a domain to an entry in the analysis. To do this, add a new column which will contain only the main domain name.

In [None]:
# function to extract the main domain from the url in the dataset
def extract_main_domain(url):
    """Return the host name found in ``url`` without a leading ``www.``.

    Parameters
    ----------
    url : str
        A url or bare domain, e.g. ``'https://www.example.com/page'``.

    Returns
    -------
    str or None
        The first dotted name found in ``url`` (``'example.com'``), or ``None``
        when ``url`` is not a string or contains no dotted name.
    """
    # check the value itself: str(url) is always a str, which made the guard a no-op
    if not isinstance(url, str):
        print('NOT VALID', url)
        return None
    match = re.search(r'(?:\w+\.)*\w+\.\w*', url)  #'www\.?([^/]+)'
    if match is None:
        return None
    domain = match.group(0)
    # str.lstrip('www.') strips any leading run of the characters 'w' and '.',
    # which turns 'wwf.org' into 'f.org'; remove only the exact 'www.' prefix
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

In [None]:
# Load the list of domains for the selected country from a csv file:
cc_column = cc+' domains'

urls = pd.read_csv(gdrive_path+cc+'_urls.csv')[cc_column].values.tolist()

# Extract the main domain of every listed url
# (each url is parsed only once; entries without a recognisable domain are dropped)
domains = {domain for domain in map(extract_main_domain, urls) if domain is not None}

# Keep the scraped page urls whose main domain is one of the listed domains
matching_links = [link for link in df.url if extract_main_domain(link) in domains]

In [None]:
# Add a new column 'domain' and fill it by applying the extract_main_domain function to the 'url' column
df['domain'] = df['url'].apply(extract_main_domain)
df.head()

## Understand meaningful words collocations

#### Preparing the text (stopwords, lemmatization, etc)

In [None]:
# make all stopword files stored in github available in this notebook:
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/NL.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/UK.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/DE.txt'
!wget 'https://raw.githubusercontent.com/jazoza/cultural-data-analysis/refs/heads/main/stopwords_archive/FR.txt'

In [None]:
# load a list of 'stopwords' in the language you are analyzing
def get_stopwords_list(stop_file_path):
    """Read a UTF-8 stop-word file (one entry per line) and return its unique entries.

    Each line is stripped of surrounding whitespace; duplicates collapse into one
    entry. The order of the returned list is not defined.
    """
    unique_words = set()
    with open(stop_file_path, 'r', encoding="utf-8") as stop_file:
        for raw_line in stop_file:
            unique_words.add(raw_line.strip())
    return list(frozenset(unique_words))
stopwords_path = cc+".txt"
stopwords = get_stopwords_list(stopwords_path)

In [None]:
# extend the stopwords list with any other words you want to exclude from analysis
# (the typographic apostrophe ’ is written literally: a backslash in front of it is not a
# valid escape and would stay in the string, so the entry could never match a token)
special_stop_words = ['nbsp', 'www', ' ', '', '—', '’s', 'ii', 'iii', 'iiii', 'l’']
stopwords_ext = stopwords+special_stop_words

In [None]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [None]:
!python3 -m spacy download nl_core_news_sm

In [None]:
# load the spaCy pipeline that clean_documents uses for lemmatization
# NOTE(review): the model must have been downloaded first and should match the
# language of the corpus selected with `cc` — confirm when changing `cc`

nlp = spacy.load("fr_core_news_sm")

In [None]:
#function to clean and lemmatize a text
def clean_documents(text):
    """Replace punctuation and line breaks with spaces, then return the spaCy lemmas.

    ``text`` is converted with ``str()`` first; the result is a list with one
    lemma string per spaCy token.
    """
    # character class: every ASCII punctuation mark plus carriage return, tab and newline
    punct_pattern = '[' + re.escape(string.punctuation) + '\\r\\t\\n]'
    without_punct = re.sub(punct_pattern, " ", str(text))
    # lemmatize with spaCy; the parser and ner components are switched off for this call
    return [tok.lemma_ for tok in nlp(without_punct, disable=['parser','ner'])]

In [None]:
#apply function to clean and lemmatize comments
lemmatized = df.text.map(clean_documents)
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])
lemmatized.head()

In [None]:
unlist_documents = [item for items in lemmatized for item in items]

In [None]:
# save these outputs for later
jar_dir = gdrive_path+'jar/'
# create the output folder on first use; open(..., 'wb') fails if it does not exist
os.makedirs(jar_dir, exist_ok=True)

with open(jar_dir+cc+'_lemmatized.pickle', 'wb') as handle_l:
    pickle.dump(lemmatized, handle_l, protocol=pickle.HIGHEST_PROTOCOL)

with open(jar_dir+cc+'_unlist_documents.pickle', 'wb') as handle_u:
    pickle.dump(unlist_documents, handle_u, protocol=pickle.HIGHEST_PROTOCOL)

### Load lemmatized text data as document lists

You can see how to produce lemmatized text data in week 4:
- [4. HH Narratives](https://colab.research.google.com/github/jazoza/cultural-data-analysis/blob/main/04_CDA_HH_narratives.ipynb)

In [None]:
# load saved pickles
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# open pickle files that you created yourself with this notebook
with open(gdrive_path+'jar/'+cc+'_lemmatized.pickle', 'rb') as handle_l:
    lemmatized = pickle.load(handle_l)

with open(gdrive_path+'jar/'+cc+'_unlist_documents.pickle', 'rb') as handle_u:
    unlist_documents = pickle.load(handle_u)

In [None]:
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# initiate bigrams and trigrams
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [None]:
# identify all collocations in the flat list of words from all documents
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_documents)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_documents)

In [None]:
# compute basic bigram frequency
bigramFreqTable = pd.DataFrame(list(bigramFinder.ngram_fd.items()), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
# compute basic trigram frequency
trigramFreqTable = pd.DataFrame(list(trigramFinder.ngram_fd.items()), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

Find meaningful bi- and tri-grams by filtering adjectives and nouns based on an nltk functionality

In [None]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Tell whether a bigram looks like an (adjective or noun) + noun collocation.

    Parameters
    ----------
    ngram : sequence of str
        The words of the n-gram; only the first two part-of-speech tags are checked.

    Returns
    -------
    bool
        ``False`` if any word is in ``stopwords_ext`` or has two characters or
        fewer; otherwise ``True`` only when the first word is tagged as an
        adjective or noun and the second word as a noun.
    """
    for word in ngram:
        # words are compared as they are: the former call to _removeNonAscii(word)
        # threw its result away, and actually stripping accents would corrupt
        # words such as 'château'
        if word in stopwords_ext or len(word) <= 2:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    # NOTE(review): nltk.pos_tag is called with its default tagger; the notebook only
    # downloads the English tagger, so tags for other languages are approximate
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in acceptable_types and tags[1][1] in second_type

In [None]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
filtered_bi[:20]

Search for bigrams containing specific words

In [None]:
# define the search term by changing the word 'château' to a word you want to explore
search_term = 'château'
# keep only the filtered bigrams that contain the search term
search_bigrams = filtered_bi[filtered_bi['bigram'].apply(lambda x: search_term in x)]

# show the 20 most frequent collocations of the search term
# (sorting by 'freq' rather than by index is what puts the most frequent first;
# fewer rows are shown when there are fewer than 20 matches)
search_bigrams.sort_values(by='freq', ascending=False).head(20)

In [None]:
def rightTypesTri(ngram):
    """Tell whether a trigram starts and ends with an adjective or a noun.

    Parameters
    ----------
    ngram : sequence of str
        The three words of the trigram; the middle word is not tag-checked.

    Returns
    -------
    bool
        ``False`` if any word is in ``stopwords_ext`` or has two characters or
        fewer; otherwise ``True`` only when both the first and the third word
        are tagged as an adjective or a noun.
    """
    for word in ngram:
        # words are compared as they are: the former call to _removeNonAscii(word)
        # threw its result away, and actually stripping accents would corrupt
        # words such as 'château'
        if word in stopwords_ext or len(word) <= 2:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    # NOTE(review): nltk.pos_tag is called with its default tagger; the notebook only
    # downloads the English tagger, so tags for other languages are approximate
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in first_type and tags[2][1] in third_type

In [None]:
#filter trigrams with the trigram rule (first and third word must be an adjective or noun);
#the bigram rule rightTypes would only look at the first two words
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(rightTypesTri)]
filtered_tri[:20]

In [None]:
# define the search term by changing the word 'château' to a word you want to explore
search_term = 'château'
# keep only the filtered trigrams that contain the search term
search_trigrams = filtered_tri[filtered_tri['trigram'].apply(lambda x: search_term in x)]

# show the 20 most frequent collocations of the search term
# (sorting by 'freq' rather than by index is what puts the most frequent first;
# fewer rows are shown when there are fewer than 20 matches)
search_trigrams.sort_values(by='freq', ascending=False).head(20)

## Topic Modeling

Use [Latent Dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) (LDA) to extract topics from the text based on a statistical model

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

NUM_TOPICS = 10

# data to work with: the text of every scraped page
# (missing texts become empty strings, because CountVectorizer only accepts strings)
documents = df['text'].fillna('').astype(str).tolist()

# tokens are runs of three or more letters/hyphens; the pattern is Unicode-aware so that
# accented words such as 'château' stay whole (an ASCII-only [a-zA-Z] class cuts them at the accent)
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=stopwords_ext, lowercase=True,
                             token_pattern=r'(?:[^\W\d_]|-){3,}')
documents_vectorized = vectorizer.fit_transform(documents)
# Build a Latent Dirichlet Allocation Model (fixed random_state so that a re-run gives the same topics)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                      random_state=0)
lda_Z = lda_model.fit_transform(documents_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
(393104, 10)
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic of ``model``.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_`` with one row of term weights per topic.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()``, giving the term for each column.
    top_n : int, default 10
        Number of terms printed per topic.

    For each topic a header line ``Topic <idx>:`` is printed, followed by a list
    of ``(term, weight)`` pairs ordered from highest to lowest weight.
    """
    # fetch the vocabulary once instead of rebuilding it for every printed term
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

print("LDA Model:")
print_topics(lda_model, vectorizer)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print, for each topic row in ``H``, its ``no_top_words`` highest-weighted terms.

    ``feature_names`` maps a column index of ``H`` to its term. The parameters
    ``W``, ``documents`` and ``no_top_documents`` are accepted but not used.
    """
    for topic_idx in range(len(H)):
        weights = H[topic_idx]
        print(f"Topic {topic_idx}:")
        # indices ordered from largest to smallest weight, cut to the requested count
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(" ".join(top_terms))

documents = df['text'].tolist()

vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words=stopwords_ext) # Modify stopwords if needed
dtm = vectorizer.fit_transform(documents)

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda.fit(dtm)


no_top_words = 10
no_top_documents = 10

display_topics(lda.components_, lda.transform(dtm), vectorizer.get_feature_names_out(), documents, no_top_words, no_top_documents)

#### Visualize topics

In [None]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

from sklearn.decomposition import TruncatedSVD

documents_vectorized = vectorizer.fit_transform(documents)
# reduce the document-term matrix with SVD; the first two components are the plot coordinates
svd = TruncatedSVD(n_components=20)
documents_2d = svd.fit_transform(documents_vectorized)

# one row per document: its 2D position and its position in `documents` (used as the label)
bok_df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(documents)),
})

# the data source has to be built from bok_df: the glyphs below read the columns
# 'x', 'y' and 'document', which do not exist in the raw `df`
source = ColumnDataSource(bok_df)
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot = figure(min_width=1000, height=600)
plot.scatter("x", "y", size=12, source=source, line_color="black", fill_alpha=0.4)
plot.add_layout(labels)
show(plot, notebook_handle=True)

## Vectorizing the corpus

## Word2Vec model, explore and visualize

In [None]:
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize

# X is a list of tokenized texts (i.e. list of lists of tokens)
X = [word_tokenize(item) for item in df.text.tolist()]
#print(X[0:3])
model = gensim.models.Word2Vec(X, min_count=6, vector_size=200) # min_count: how many times a word appears in the corpus; size: number of dimensions

In [None]:
# word to explore: choose one that occurs in the corpus selected with `cc`
# (the texts were tokenized without lowercasing, so the lookup is case-sensitive)
query_word = "château"
# a word outside the vocabulary raises a KeyError in most_similar; show a readable message instead
(model.wv.most_similar(positive=[query_word], topn=12)
 if query_word in model.wv.key_to_index
 else f"'{query_word}' is not in the Word2Vec vocabulary - try another word or spelling")

### Visualize keywords with t-SNE

Choose keywords that correspond to your analysis and visualize how they and their closest terms are distributed in the discourse.
Use t-SNE to visualize the relations.

* [t-Distributed Stochastic Neighbor Embedding (t-SNE)](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) is a technique for dimensionality reduction that is particularly well suited for the visualization of high-dimensional datasets.


In [None]:
# keywords to explore: choose words that occur in the corpus selected with `cc`
# (the lookup is case-sensitive)
keys_role = ['Man', 'Frau']

# most_similar raises a KeyError for unknown words, so drop them up front;
# keys_role is filtered itself because it is reused later as the plot labels
missing_keys = [word for word in keys_role if word not in model.wv.key_to_index]
if missing_keys:
    print('Not in the Word2Vec vocabulary, skipped:', missing_keys)
keys_role = [word for word in keys_role if word in model.wv.key_to_index]
if not keys_role:
    raise ValueError('None of the chosen keywords is in the Word2Vec vocabulary; '
                     'pick words that occur in the corpus.')

# for every keyword collect its 30 nearest words and their vectors
embedding_clusters = []
word_clusters = []
for word in keys_role:
    similar_words = [similar_word for similar_word, _ in model.wv.most_similar(word, topn=30)]
    word_clusters.append(similar_words)
    embedding_clusters.append([model.wv[similar_word] for similar_word in similar_words])

In [None]:
from sklearn.manifold import TSNE

# stack the clusters into one array: (keywords, similar words per keyword, vector size)
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_params = dict(perplexity=15, n_components=2, init='pca', random_state=32)
try:
    # scikit-learn >= 1.5 calls the iteration limit `max_iter`
    tsne_model_en_2d = TSNE(max_iter=3500, **tsne_params)
except TypeError:
    # older scikit-learn versions only know the previous name `n_iter`
    tsne_model_en_2d = TSNE(n_iter=3500, **tsne_params)
# t-SNE works on a flat (n * m, k) matrix; the result is folded back into one 2D cloud per keyword
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2D word embeddings, one colour per keyword cluster.

    Parameters
    ----------
    title : str
        Title of the figure.
    labels : sequence of str
        One legend label per cluster.
    embedding_clusters : sequence of arrays
        One array per cluster; column 0 is read as x and column 1 as y.
    word_clusters : sequence of sequences of str
        The words of each cluster, in the same order as the rows of its array.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        When given, the figure is also saved there as a 300 dpi PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` applies one RGBA colour to all points; passing a single RGBA row
        # through `c=` is ambiguous because it can be read as four values to colour-map
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')
    plt.show()


tsne_plot_similar_words('Similar words from '+cc+' heritage houses website', keys_role, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')