In [136]:
import os
import pandas as pd
import numpy as np
import gensim

In [58]:
folder_path = "../met-naam/castles-netherlands/"

In [59]:
# Define the lambda function
read_files = lambda folder_path: pd.DataFrame(
    [{'filename': filename, 'content': open(os.path.join(folder_path, filename), 'r').read()} 
     for filename in os.listdir(folder_path) if filename.endswith('.txt')])

# Example usage
df = read_files(folder_path)

# Print the DataFrame
df.head()

Unnamed: 0,filename,content
0,buitenplaatsbeeckestijnnl.txt,Home - Buitenplaats Beeckestijn Beeckestijn ...
1,weldamnl.txt,\n Die Geschichte - Weldam\n \n ...
2,wwwmuiderslotnl.txt,Niet wachten bij de kassa? Koop een e-ticket!...
3,wwwdekemastatenl.txt,geertruidaAnna MargarethavanWageningen Dekema...
4,wwwmenkemaborgnl.txt,Home - Menkemaborg Home Welkom op de website ...


### some preprocessing

In [None]:
from textcleaning import textCleaner, textCleanerFurther, textCleanLinks

In [None]:
# clean tweets further: remove hashtags, mentions and links
'''
# currently not using (adapted to ENG)
print("Cleaning the tweets...\n")
clean_tweet_texts = []
%time
for i in range(0,len(df)):
    if( (i+1)%100000 == 0 ):
        print("Tweets", i+1, "of ", len(df), "has been processed")                                                                   
    clean_tweet_texts.append(textCleaner(df['text'][i]))'''

In [44]:
def get_stopwords_list(stop_file_path):
    """load stop words """  
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))
stopwords_path = "../stopwords_archive/dutch.txt"
stopwords = get_stopwords_list(stopwords_path)

In [46]:
special_stop_words = ['nbsp', 'the', 'and']
stopwords_ext = stopwords+special_stop_words

### term frequency, all

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
cvec_all = CountVectorizer().fit(df.content)
df_matrix_all = cvec_all.transform(df.content)
df_all = np.sum(df_matrix_all,axis=0)
terms = np.squeeze(np.asarray(df_all))
print(terms.shape)
term_freq_df_all = pd.DataFrame([terms],columns=cvec_all.get_feature_names_out()).transpose() #term_freq_df is with stopwords
term_freq_df_all.columns = ['terms']
term_freq_df_all.sort_values(by='terms', ascending=False).iloc[:10]

(56980,)


Unnamed: 0,terms
de,38374
van,27151
het,20198
en,19712
een,15934
in,15843
op,9727
is,8705
met,8394
voor,6531


### term frequency, without stop words

In [53]:
cvec_stopped = CountVectorizer(stop_words=stopwords_ext) # see above, import frozenset from stopwords_archive in correct language
cvec_stopped.fit(df.content)
document_matrix = cvec_stopped.transform(df.content)
term_batches = np.linspace(0,document_matrix.shape[0],10).astype(int) 
i=0
df_stopped = []
while i < len(term_batches)-1:
    batch_result = np.sum(document_matrix[term_batches[i]:term_batches[i+1]].toarray(),axis=0)
    df_stopped.append(batch_result)
    print(term_batches[i+1],"entries' term frequency calculated")
    i += 1

terms_stopped = np.sum(df_stopped,axis=0)
print(terms_stopped.shape)
term_freq_df_stopped = pd.DataFrame([terms_stopped],columns=cvec_stopped.get_feature_names_out()).transpose() 
term_freq_df_stopped.columns = ['terms']
term_freq_df_stopped.sort_values(by='terms', ascending=False).iloc[:10]

6 entries' term frequency calculated
12 entries' term frequency calculated
18 entries' term frequency calculated
24 entries' term frequency calculated
30 entries' term frequency calculated
36 entries' term frequency calculated
42 entries' term frequency calculated
48 entries' term frequency calculated
54 entries' term frequency calculated
(56877,)


Unnamed: 0,terms
kasteel,6438
slot,3274
zuylen,2721
landgoed,1735
huis,1649
00,1403
we,1381
duivenvoorde,1330
jaar,1329
uur,1226


In [None]:
# Plot Zipf's law table

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pylab import *
counts = term_freq_df_all.terms
tokens = term_freq_df_all.index
ranks = arange(1, len(counts)+1)
indices = argsort(-counts)
frequencies = counts[indices]
plt.figure(figsize=(10,10))
plt.rc('font', size=14)
plt.ylim(1,10**6)
plt.xlim(1,10**6)
loglog(ranks, frequencies, marker=".")
plt.plot([1,frequencies[0]],[frequencies[0],1],color='r')
title("Zipf plot for tweets tokens")
xlabel("Frequency rank of token")
ylabel("Absolute frequency of token")
grid(True)
for n in list(logspace(-0.5, log10(len(counts)-2), 25).astype(int)):
    dummy = text(ranks[n], frequencies[n], " " + tokens[indices[n]], 
                 verticalalignment="bottom",
                 horizontalalignment="left")

### TF-IDF

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
#tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words=stopwords)
#tfidf_vector = tfidf_vectorizer.fit_transform(df.filename)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords_ext)
# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(df['content'])
# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Add filenames as index
tfidf_df.index = df['filename']
# Print the TF-IDF DataFrame
tfidf_df.head()

Unnamed: 0_level_0,00,000,0004,0007,000ste,001,0014,0015,0016,0018,...,юбилею,ҧm,ӧffnungszeiten,ԑ8e,ۅh,ۮu,ۯ8,ߛi,丶ya,瓌c
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
buitenplaatsbeeckestijnnl.txt,0.046864,0.002845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weldamnl.txt,0.012787,0.003105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwmuiderslotnl.txt,0.024403,0.005333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwdekemastatenl.txt,0.010818,0.0,0.005553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwmenkemaborgnl.txt,0.031579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
# Add column for document frequency aka number of times word appears in all documents
tfidf_df.loc['ALL'] = (tfidf_df > 0).sum()

In [113]:
tfidf_df.head() # first five rows

Unnamed: 0_level_0,00,000,0004,0007,000ste,001,0014,0015,0016,0018,...,юбилею,ҧm,ӧffnungszeiten,ԑ8e,ۅh,ۮu,ۯ8,ߛi,丶ya,瓌c
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
buitenplaatsbeeckestijnnl.txt,0.035007,0.002125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weldamnl.txt,0.010859,0.002637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwmuiderslotnl.txt,0.019462,0.004253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwdekemastatenl.txt,0.00839,0.0,0.004307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wwwmenkemaborgnl.txt,0.02728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# 10 most frequent words!

tfidf_slice = tfidf_df[term_freq_df_stopped.sort_values(by='terms', ascending=False).iloc[:10].index.tolist()]
tfidf_slice.sort_index().round(decimals=2).head() # first five rows

Unnamed: 0_level_0,kasteel,slot,zuylen,landgoed,huis,00,we,duivenvoorde,jaar,uur
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ALL,44.0,22.0,4.0,33.0,42.0,44.0,44.0,3.0,48.0,45.0
artlandtop.txt,0.07,0.0,0.0,0.01,0.03,0.02,0.0,0.0,0.0,0.01
buitenplaatsbeeckestijnnl.txt,0.0,0.0,0.0,0.0,0.07,0.04,0.0,0.0,0.01,0.02
fraeylemaborgnl.txt,0.0,0.0,0.0,0.06,0.0,0.02,0.01,0.0,0.02,0.01
huisberghnl.txt,0.16,0.0,0.0,0.0,0.17,0.01,0.01,0.0,0.02,0.01


In [132]:
# reorganize the DataFrame so that the words are in rows rather than columns
tfidf_df = tfidf_df.drop('ALL', errors='ignore')
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df.head()

Unnamed: 0,filename,level_1,0
0,buitenplaatsbeeckestijnnl.txt,00,0.046864
1,buitenplaatsbeeckestijnnl.txt,000,0.002845
2,buitenplaatsbeeckestijnnl.txt,0004,0.0
3,buitenplaatsbeeckestijnnl.txt,0007,0.0
4,buitenplaatsbeeckestijnnl.txt,000ste,0.0


In [133]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'filename': 'document','level_1': 'term'})
tfidf_df.head()

Unnamed: 0,document,term,tfidf
0,buitenplaatsbeeckestijnnl.txt,00,0.046864
1,buitenplaatsbeeckestijnnl.txt,000,0.002845
2,buitenplaatsbeeckestijnnl.txt,0004,0.0
3,buitenplaatsbeeckestijnnl.txt,0007,0.0
4,buitenplaatsbeeckestijnnl.txt,000ste,0.0


In [134]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head()

Unnamed: 0,document,term,tfidf
3023268,artlandtop.txt,artland,0.485904
3061341,artlandtop.txt,strijthagen,0.364428
3063159,artlandtop.txt,top,0.218563
3048224,artlandtop.txt,middot,0.202460
3042911,artlandtop.txt,kasteelcomplex,0.181084
...,...,...,...
2270788,wwwverhildersumnl.txt,vleermuizen,0.534621
2269496,wwwverhildersumnl.txt,verhildersum,0.507022
2230734,wwwverhildersumnl.txt,borg,0.243010
2249434,wwwverhildersumnl.txt,leens,0.223847


In [None]:
!pip3 install altair

In [135]:
import altair as alt

top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10) 

# Terms in this list will get a red dot in the visualization
term_list = ['kasteel', 'huis']

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)

## Word2Vec model

In [142]:
from nltk.tokenize import word_tokenize
 
# X is a list of tokenized texts (i.e. list of lists of tokens)
X = [word_tokenize(item) for item in df.content.tolist()]
#print(X[0:3])
model = gensim.models.Word2Vec(X, min_count=6, vector_size=200) # min_count: how many times a word appears in the corpus; size: number of dimensions

In [143]:
model.wv.most_similar(positive=["kasteel"], topn=12)

[('landgoed', 0.9820836186408997),
 ('sprookjesachtige', 0.9523215293884277),
 ('behoud', 0.9445139169692993),
 ('huis', 0.9404441714286804),
 ('historische', 0.9377674460411072),
 ('vroegere', 0.9367468953132629),
 ('park', 0.9356735944747925),
 ('geschiedenis', 0.9350740909576416),
 ('herstel', 0.9347659945487976),
 ('tuinen', 0.9342954754829407),
 ('voorterrein', 0.9321644306182861),
 ('gebied', 0.9291359782218933)]

In [144]:
model.wv.most_similar(positive=["huis"], topn=12)

[('deel', 0.9840012788772583),
 ('grote', 0.9807493686676025),
 ('gebied', 0.9794774651527405),
 ('oude', 0.9751813411712646),
 ('prachtige', 0.9750627279281616),
 ('historische', 0.9741536378860474),
 ('tuinen', 0.9738842844963074),
 ('rand', 0.9730719327926636),
 ('vroegere', 0.9710891842842102),
 ('werk', 0.9706026911735535),
 ('staat', 0.9705648422241211),
 ('beek', 0.9696680903434753)]