# Modelling Literary History (ENG)

--- Last updated: 2025-02-10 ---

In [1]:
import spacy
import pandas as pd
import re
from pathlib import Path
import numpy as np
from numpy.linalg import norm
from statistics import mean 
import os
import json
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import collections
import random
from heapq import nsmallest
from heapq import nlargest
import pickle
import string

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

from itertools import chain

from joblib import Parallel, delayed

import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from scipy.interpolate import interp1d
from scipy.spatial import distance_matrix
from scipy.spatial import distance

import seaborn as sns

import sklearn
from sklearn.manifold import TSNE
from sklearn import cluster
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA

from tqdm import tqdm

In [2]:
# preprocess(texts, nlp_model, chunk_size, sentence_punct)
#
# Preprocesses a list of texts by splitting them up in smaller chunks.
# @param texts list of string objects
# @param nlp_model pre-loaded spacy nlp model
# @param chunk_size int defining the approximate size of the chunks (the chunks end with the next sentence punctuation)
# @param sentence_punkt string of puncuations markers that define sentence breaks, default = "?.!"
# return tuple consisting of a list of preprocessed string objects and a list of chapter ids

def preprocess(texts, nlp_model, chunk_size, chap_ids, sentence_punct="?.!"):
    index = 0
    texts_out = []
    chap_id = []
    for text in texts:
        doc = nlp_model(text)
        new_text = [[tok.text.lower() for tok in sents 
             if tok.text != " " 
             and tok.text != "\n"] for sents in doc.sents]
        new_text = [item for sublist in new_text for item in sublist]
        if new_text:
            paragraphs = []
            start = 0
            end = chunk_size
            x = 0
            while end + 100 <= len(new_text)-1:
                while new_text[end] not in sentence_punct:
                    end += 1
                paragraphs.append(new_text[start:end+1])
                new_id = "_".join([chap_ids[index], str(x)])
                chap_id.append(new_id)
                start = end + 2
                end += chunk_size
                x += 1
            texts_out.append(paragraphs)
        print('Doc ' + str(index) + ' of ' + str(len(texts)-1) + ' has been preprocessed!')
        index += 1
    return(texts_out, chap_id)

In [3]:
def get_vectors(models, texts):
    all_vectors = []
    for text in texts:
        # Infer a vector from each model for the text
        vectors = np.array([model.infer_vector(text.split()) for model in models])
        all_vectors.append(vectors)
    return all_vectors

### Preprocessing

In [12]:
corpus_path = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/corpora"
path_results = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG"
path_data = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/data"
path_pickled = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG/pickled"

In [13]:
paths = [path_results, path_data, path_pickled]

for path in paths:
    if not os.path.exists(path):
        os.makedirs(path)

In [14]:
corpus_dir = Path(corpus_path + '\\ENG').glob('*.txt')
files = list(corpus_dir)

texts = []
for file in files:
    with open (file, encoding = 'utf-8-sig') as f:
        texts.append(f.read())

In [15]:
texts_clean = [word.replace('\n',' ') for word in texts]
texts_clean = [word.lower() for word in texts_clean]

In [16]:
ids = [re.sub(r'C:.+\\(.+).txt', '\\1', str(file)) for file in files]

In [17]:
nlp = spacy.load("en_core_web_lg")

In [18]:
nlp.max_length = 50000000

In [12]:
start = 0
step = 10
stop = len(texts_clean)

list_preprocessed_texts = []

while start < stop:
    end = min(start + step, stop)
    print('\n\n----------------!!! New round !!!----------------\n\n')
    print('Start index: ', start)
    print('Stop index: ', end, '\n\n')
    x = preprocess(texts_clean[start:end], nlp, 500, ids[start:end])
    list_preprocessed_texts.append(x)
    start += step



----------------!!! New round !!!----------------


Start index:  0
Stop index:  10 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has been preprocessed!
Doc 2 of 9 has been preprocessed!
Doc 3 of 9 has been preprocessed!
Doc 4 of 9 has been preprocessed!
Doc 5 of 9 has been preprocessed!
Doc 6 of 9 has been preprocessed!
Doc 7 of 9 has been preprocessed!
Doc 8 of 9 has been preprocessed!
Doc 9 of 9 has been preprocessed!


----------------!!! New round !!!----------------


Start index:  10
Stop index:  20 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has been preprocessed!
Doc 2 of 9 has been preprocessed!
Doc 3 of 9 has been preprocessed!
Doc 4 of 9 has been preprocessed!
Doc 5 of 9 has been preprocessed!
Doc 6 of 9 has been preprocessed!
Doc 7 of 9 has been preprocessed!
Doc 8 of 9 has been preprocessed!
Doc 9 of 9 has been preprocessed!


----------------!!! New round !!!----------------


Start index:  20
Stop index:  30 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has be

In [11]:
#with open(path_pickled + '\\list_preprocessed_texts_chunks=500_ENG.pkl', 'wb') as f:
#    pickle.dump(list_preprocessed_texts, f)

with open(path_pickled + '\\list_preprocessed_texts_chunks=500_ENG.pkl', 'rb') as f:
    list_preprocessed_texts = pickle.load(f)

In [12]:
list_chunks = []
list_ids = []

for l in list_preprocessed_texts:
    list_chunks.append([item for item in l[0]])
    list_ids.append([item for item in l[1]])

In [13]:
ids = list(chain(*list_ids))

In [14]:
all_chunks = []

counter = 0
for text in list_chunks:
    for l in text:
        if len(l) != 0:
            for x in l:
                all_chunks.append([x, ids[counter]])
                counter += 1

In [19]:
#with open(path_pickled + '\\preprocessed_texts_chunks=500_ENG.pkl', 'wb') as f:
#    pickle.dump(all_chunks, f)
    
with open(path_pickled + '\\preprocessed_texts_chunks=500_ENG.pkl', 'rb') as f:
    all_chunks = pickle.load(f)

In [20]:
chunks_ls = []

for x in all_chunks:
    chunks_ls.append(x[0])
                       
ids_ls = []

for x in all_chunks:
    ids_ls.append(x[1])

In [21]:
chunks_pd = pd.DataFrame(list(zip(ids_ls, chunks_ls)),
                         columns=['chunk_id', 'chunk'])

In [22]:
chunks_pd['text_id'] = chunks_pd['chunk_id'].str.replace('(.+)_(\\d+)', '\\1', regex=True)

In [23]:
doc_chunks = chunks_pd.apply(
    lambda r: TaggedDocument(words=r['chunk'], tags=[r.chunk_id]), axis=1)

In [24]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [25]:
iterations = 5

In [26]:
we_results = path_results + '\\model_iteration=' + str(iterations)
if not os.path.exists(we_results):
    os.makedirs(we_results)

In [25]:
for i in range(1, iterations+1):
    model = Doc2Vec(vector_size = 100, window = 5, min_count=5, seed=i)

    model.build_vocab(doc_chunks.values)
    model.train(doc_chunks, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Save each model
    model.save(we_results + '\\doc2vec_' + str(i) + '.model')

2025-02-11 02:19:58,683 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc5,s0.001,t3>', 'datetime': '2025-02-11T02:19:58.682719', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-02-11 02:19:58,685 : INFO : collecting all words and their counts
2025-02-11 02:19:58,686 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-02-11 02:20:00,031 : INFO : PROGRESS: at example #10000, processed 5181264 words (3855888 words/s), 58739 word types, 10000 tags
2025-02-11 02:20:01,465 : INFO : PROGRESS: at example #20000, processed 10360496 words (3614770 words/s), 81627 word types, 20000 tags
2025-02-11 02:20:02,933 : INFO : PROGRESS: at example #30000, processed 15551922 words (3542299 words/s), 101738 word types, 30000 tags
2025-02-11 02:20:04,250 : INFO : PROGRESS: at example #40000, processed 2

In [26]:
we_results_control = path_results + '\\control_model_iteration=' + str(iterations)
if not os.path.exists(we_results_control):
    os.makedirs(we_results_control)

In [27]:
for i in range(1, iterations+1):
    model = Doc2Vec(vector_size = 100, window = 5, min_count=5, seed=i)

    model.build_vocab(doc_chunks.values)
    model.train(doc_chunks, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Save each model
    model.save(we_results_control + '\\doc2vec_' + str(i) + '.model')

2025-02-11 03:33:21,886 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc5,s0.001,t3>', 'datetime': '2025-02-11T03:33:21.886027', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-02-11 03:33:21,918 : INFO : collecting all words and their counts
2025-02-11 03:33:21,918 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-02-11 03:33:23,025 : INFO : PROGRESS: at example #10000, processed 5181264 words (4661077 words/s), 58739 word types, 10000 tags
2025-02-11 03:33:24,232 : INFO : PROGRESS: at example #20000, processed 10360496 words (4329937 words/s), 81627 word types, 20000 tags
2025-02-11 03:33:25,407 : INFO : PROGRESS: at example #30000, processed 15551922 words (4423520 words/s), 101738 word types, 30000 tags
2025-02-11 03:33:26,493 : INFO : PROGRESS: at example #40000, processed 2

In [27]:
models = [Doc2Vec.load(we_results + '\\doc2vec_' + str(i+1) + '.model') for i in range(iterations)]

2025-02-19 14:31:49,935 : INFO : loading Doc2Vec object from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG\model_iteration=5\doc2vec_1.model
2025-02-19 14:31:50,177 : INFO : loading dv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG\model_iteration=5\doc2vec_1.model.dv.* with mmap=None
2025-02-19 14:31:50,178 : INFO : loading vectors from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG\model_iteration=5\doc2vec_1.model.dv.vectors.npy with mmap=None
2025-02-19 14:31:50,245 : INFO : loading wv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/ENG\model_iteration=5\doc2vec_1.model.wv.* with mmap=None
2025-02-19 14:31:50,247 : INFO : setting ignored attribute cum_table to None
2025-02-19 14:31:51,359 : INFO : Doc2Vec lifecycle event {'fname': 'C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lith

In [29]:
control_models = [Doc2Vec.load(we_results_control + '\\doc2vec_' + str(i+1) + '.model') for i in range(iterations)]

2025-02-11 04:46:16,981 : INFO : loading Doc2Vec object from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\ENG/control_model_iteration=5\doc2vec_1.model
2025-02-11 04:46:17,144 : INFO : loading dv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\ENG/control_model_iteration=5\doc2vec_1.model.dv.* with mmap=None
2025-02-11 04:46:17,146 : INFO : loading vectors from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\ENG/control_model_iteration=5\doc2vec_1.model.dv.vectors.npy with mmap=None
2025-02-11 04:46:17,174 : INFO : loading wv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\ENG/control_model_iteration=5\doc2vec_1.model.wv.* with mmap=None
2025-02-11 04:46:17,175 : INFO : setting ignored attribute cum_table to None
2025-02-11 04:46:18,013 : INFO : Doc2Vec lifecycle event {'fname': 'C:/Users/Brottrager/Docu

In [30]:
def create_dv_list(models, df_texts, filtered_indices, id):
    matrix = []

    for i in filtered_indices:
        doc_id = df_texts.loc[i, id]
        chunk_vectors = [model.dv[doc_id] for model in models]
        matrix.append(chunk_vectors)
    
    return np.array(matrix)  # Shape: (num_chunks, num_models, vector_size)

In [31]:
def compute_centroid_similarity(models, set_of_texts, df_texts, id):
    centroids = []
    text_titles = []

    for text_id in set_of_texts:
        # Retrieve indices for all chunks of the document
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        
        # Get document vectors for all chunks across all models
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)
        
        # Compute centroid by averaging across all chunks and models
        document_centroid = np.mean(chunk_vectors, axis=(0, 1))  # Average over chunks and models
        centroids.append(document_centroid)
        text_titles.append(text_id)
    
    # Calculate cosine similarity between centroids
    similarity_matrix = cosine_similarity(centroids)
    
    # Set diagonal to 0
    np.fill_diagonal(similarity_matrix, 0)
    
    # Optionally, return as a DataFrame for better readability with text titles as labels
    similarity_df = pd.DataFrame(similarity_matrix, index=text_titles, columns=text_titles)
    
    return similarity_df

In [32]:
set_of_texts = set(chunks_pd['text_id'])

In [33]:
similarity_df = compute_centroid_similarity(models, set_of_texts, chunks_pd, 'chunk_id')
#similarity_df.to_csv(path_results + '\\doc2vec_adjacency_ENG.csv', encoding='utf-8')

In [None]:
similarity_control_df = compute_centroid_similarity(control_models, set_of_texts, chunks_pd, 'chunk_id')
#similarity_control_df.to_csv(path_results + '\\doc2vec_adjacency_control_models_ENG.csv', encoding='utf-8') 

### Intra-textual variance

The intra-textual variance of a text is measured by the the Euclidean distance of its chunks to its centroid. The result is summarised as the mean of squared distances. 

In [38]:
def intra_textual_variance(models, set_of_texts, df_texts, id):
    mean_squared_distances = []

    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        matrix = create_dv_list(models, df_texts, filtered_indices, id)
        centroid = np.mean(matrix, axis=(0, 1))  # Average over chunks and models
        
        squared_distances = [
            distance.euclidean(chunk_vector.flatten(), centroid) ** 2
            for chunk_vector in matrix.reshape(-1, matrix.shape[2])
        ]
        
        mean_squared_distances.append(np.mean(squared_distances))
    
    return mean_squared_distances

In [39]:
intra_textual_variance = intra_textual_variance(models, set_of_texts, chunks_pd, 'chunk_id')

In [40]:
intra_textual_variance

[19.130528404704158,
 21.40866992198192,
 22.715940940954653,
 16.938510887385622,
 20.78259401302657,
 21.687592389925538,
 19.969470903452166,
 19.998152182339695,
 25.71898472370166,
 23.623233775175546,
 26.10774040466207,
 22.017521369209227,
 18.510987410255062,
 27.244716133450616,
 31.486928059724193,
 18.558156728523333,
 23.40037763880693,
 21.737126498869728,
 19.159597968846874,
 23.714869846625728,
 22.033606089248757,
 25.710354829588457,
 31.49858260118908,
 25.516245900132333,
 22.630415756216387,
 21.256674493296437,
 22.673502894380402,
 17.735625506100543,
 22.018886806279273,
 25.926861035249914,
 21.590312909076477,
 25.203918470567086,
 25.71515894358684,
 21.48047816502436,
 21.640515663872097,
 21.8279024749818,
 19.109343958089124,
 24.987680571627102,
 21.03308514113614,
 19.29742447902449,
 26.18144380656109,
 23.755115547085843,
 19.825091255812534,
 20.752778605329887,
 27.676429137697294,
 23.275594732984988,
 23.85549226878339,
 16.971741614068947,
 22.05

### Stepwise distance

The stepwise distance measures gradual changes of a text by calculating the Euclidean distances between consecutive chunks, summarised by the mean of squared distances. 

In [41]:
def stepwise_distance(models, set_of_texts, df_texts, id):
    all_model_distances = []

    for s in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == s].tolist()
        matrices = create_dv_list(models, df_texts, filtered_indices, id)
        model_distances = []
        for matrix in matrices:
            n_chunks = matrix.shape[0]
            
            if n_chunks == 1:
                model_distances.append(0)  # No distance if only one chunk
                continue
            
            squared_distances = [
                distance.euclidean(matrix[i], matrix[i + 1]) ** 2
                for i in range(n_chunks - 1)
            ]
            model_distances.append(mean(squared_distances))        
        all_model_distances.append(mean(model_distances))

    return all_model_distances

In [42]:
stepwise_distance = stepwise_distance(models, set_of_texts, chunks_pd, 'chunk_id')

In [43]:
stepwise_distance

[40.709238943153224,
 45.72324503652294,
 47.536720874266685,
 35.20571182249816,
 43.876458309420705,
 45.30243091215895,
 41.56727574945695,
 43.56753515455276,
 54.54031672180091,
 50.063584269808906,
 55.698396739946894,
 46.20012662044319,
 39.68225918222758,
 60.652721365207135,
 69.4751076630164,
 39.65418144047861,
 49.66426516481516,
 46.502979484223715,
 40.28438551349813,
 50.470612319342216,
 46.888647459723614,
 56.33488927612931,
 68.75677807772574,
 54.423662628657354,
 47.09623567209616,
 44.631714133699866,
 49.27739375210699,
 37.447489326905675,
 46.756129536641915,
 57.31277714258087,
 45.91028442041624,
 55.119723012832736,
 56.181122757226674,
 44.86344206811965,
 44.77598944490477,
 46.215430461097036,
 41.08765894688469,
 53.46268157479686,
 44.312742248684195,
 41.13528324164119,
 54.7636238605235,
 51.75880230104462,
 41.40224421862557,
 43.606652423042036,
 61.024467284193584,
 50.196716187582,
 50.3060778550091,
 36.0190725826378,
 46.3798640557416,
 34.4581

### Outlier score
The outlier score compares the centroids of all text chunks and identifies the distance to the nearest neighbour.

In [44]:
def outlier_score(models, set_of_texts, df_texts, id):
    list_of_arrays = []
    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        list_of_arrays.append(create_dv_list(models, df_texts, filtered_indices, id))

    centroids = [np.mean(array, axis=(0, 1)) for array in list_of_arrays]
    dist = pd.DataFrame(euclidean_distances(centroids, centroids))
    
    minimal_distances = []
    dist = dist.replace(0, np.nan)
    for i in range(dist.shape[0]):
        minimal_distances.append(np.nanmin(dist[i]))
    
    return minimal_distances

In [45]:
outlier_score = outlier_score(models, set_of_texts, chunks_pd, 'chunk_id')

In [46]:
outlier_score

[3.6500243e-08,
 0.33340338,
 2.9802322e-08,
 2.1073424e-08,
 2.1073424e-08,
 2.1073424e-08,
 0.45724544,
 4.712161e-08,
 0.6410083,
 2.9802322e-08,
 6.664002e-08,
 0.655885,
 2.9802322e-08,
 0.7004929,
 5.9604645e-08,
 0.53759915,
 2.9802322e-08,
 0.5608188,
 2.1073424e-08,
 3.6500243e-08,
 0.55682075,
 0.8405611,
 5.9604645e-08,
 0.7965483,
 2.9802322e-08,
 0.531817,
 0.46524116,
 2.9802322e-08,
 2.1073424e-08,
 2.9802322e-08,
 2.1073424e-08,
 2.9802322e-08,
 2.9802322e-08,
 3.6500243e-08,
 2.9802322e-08,
 2.1073424e-08,
 0.45024997,
 2.9802322e-08,
 0.52943134,
 0.2212395,
 0.5164083,
 2.9802322e-08,
 3.6500243e-08,
 0.4182702,
 2.9802322e-08,
 0.612431,
 2.9802322e-08,
 0.4047821,
 0.4662098,
 0.24976256,
 0.90431046,
 0.6806304,
 5.1619136e-08,
 0.3390169,
 1.3898828,
 5.1619136e-08,
 5.9604645e-08,
 0.38354227,
 2.9802322e-08,
 4.2146848e-08,
 0.4170781,
 2.9802322e-08,
 0.5143644,
 2.9802322e-08,
 2.9802322e-08,
 5.9604645e-08,
 2.9802322e-08,
 0.43150935,
 2.9802322e-08,
 2.107

### Overlap score
The overlap score is the fraction of the k-nearest chunks to a centroid are identified (with k being the number of chunks of the text in question) that do not belong to the text itself. 

In [47]:
def overlap_score(models, set_of_texts, df_texts, id):
    list_of_arrays = []
    num_of_chunks = []
    all_filtered_indices = []

    # Step 1: Use create_dv_list to gather chunk vectors for each document
    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        all_filtered_indices.append(filtered_indices)
        list_of_arrays.append(create_dv_list(models, df_texts, filtered_indices, id))
        num_of_chunks.append(len(filtered_indices))
    
    # Step 2: Calculate centroids for each document by averaging across chunks and models
    centroids = [np.mean(array, axis=(0, 1)) for array in list_of_arrays]  # 1-D centroids

    overlap_scores = []

    # Flatten each chunk vector across all documents into a single list, averaged across models
    all_chunks = [np.mean(chunk, axis=0) for doc_vectors in list_of_arrays for chunk in doc_vectors]

    for i, centroid in enumerate(centroids):
        # Calculate distances from the centroid to each chunk in all documents
        distances = {j: distance.euclidean(centroid, chunk) for j, chunk in enumerate(all_chunks)}
        
        # Get indices of k-nearest chunks to the centroid (k = number of chunks in the document)
        smallest_dist_indices = nsmallest(num_of_chunks[i], distances, key=distances.get)
        
        # Count how many of the k-nearest chunks do not belong to the document itself
        overlap_count = len([idx for idx in smallest_dist_indices if idx not in all_filtered_indices[i]])
        
        # Calculate overlap score as the fraction of out-of-document chunks among k-nearest neighbors
        overlap_scores.append(overlap_count / num_of_chunks[i])

    return overlap_scores

In [48]:
overlap_score = overlap_score(models, set_of_texts, chunks_pd, 'chunk_id')

In [49]:
overlap_score

[0.9984025559105432,
 1.0,
 1.0,
 1.0,
 0.9963636363636363,
 0.9858657243816255,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9957983193277311,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9883381924198251,
 0.9968944099378882,
 0.9932885906040269,
 0.9976580796252927,
 1.0,
 0.9955357142857143,
 1.0,
 1.0,
 0.9895287958115183,
 1.0,
 1.0,
 1.0,
 0.9942307692307693,
 0.9973262032085561,
 1.0,
 0.9887133182844243,
 0.9955423476968797,
 1.0,
 1.0,
 1.0,
 0.9929078014184397,
 1.0,
 0.9984399375975039,
 1.0,
 1.0,
 1.0,
 0.9973118279569892,
 1.0,
 1.0,
 1.0,
 0.996969696969697,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9958333333333333,
 1.0,
 1.0,
 1.0,
 0.994475138121547,
 1.0,
 1.0,
 0.9976303317535545,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.996,
 0.9974093264248705,
 0.9922077922077922,
 1.0,
 1.0,
 1.0,
 0.9974874371859297,
 1.0,
 1.0,
 1.0,
 0.9986168741355463,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9951219512195122,
 1.0,
 0.9953775038520801,
 1.0,
 0.9975429975

In [50]:
df_scores = pd.DataFrame(list(zip(set_of_texts, intra_textual_variance, stepwise_distance, outlier_score, overlap_score)),
                         columns =['text_id', 'intra_textual_variance', 'stepwise_distance', 'outlier_score', 'overlap_score'])

In [51]:
df_scores.to_csv(path_results + '\\doc2vec_scores_ENG.csv')

In [34]:
corpus_meta = pd.read_csv(corpus_path + '/ENG_corpus.csv', sep = ';', encoding='UTF-8')

In [35]:
corpus_meta = corpus_meta.copy()
corpus_meta['text_id'] = corpus_meta['wikiname'] + '_' + corpus_meta['wikiID']

In [26]:
shorts = []
for file in files:
    short = re.sub(r'.+\\ENG\\(.+).txt', '\\1', str(file))
    if short not in list(corpus_meta['text_id']):
        print(short)
    shorts.append(short)

In [27]:
for e in list(corpus_meta['text_id']):
    if e not in shorts:
        print(e)

In [45]:
def fetch_texts(subset_df, text_column, full_text_df, text_id_column):
    text_ids = subset_df[text_id_column].tolist()
    texts = full_text_df[full_text_df[text_id_column].isin(text_ids)][text_column].tolist()
    return texts

In [46]:
def get_vectors_from_models(models, texts):
    all_vectors = []
    for model in models:
        # Directly use tokenized lists
        vectors = [model.infer_vector(text) for text in texts]
        all_vectors.append(vectors)
    return np.array(all_vectors).transpose(1, 0, 2)  # Shape: (num_texts, num_models, vector_size)

In [47]:
def compute_centroid(vectors):
    return np.mean(vectors, axis=(0, 1))  # Mean over both texts and models

In [48]:
def calculate_centroid_for_subset(models, subset_df, full_text_df, text_id_column, text_column):
    # Fetch texts based on metadata
    texts = fetch_texts(subset_df, text_column, full_text_df, text_id_column)
    
    # Generate vectors for the texts
    vectors = get_vectors_from_models(models, texts)  # Shape: (num_texts, num_models, vector_size)
    
    # Compute and return centroid
    return compute_centroid(vectors)

In [49]:
def process_year(start):
    # Track progress within the worker
    print(f"Processing year range: {start-5} to {start+5}")
    
    # Filter the subset
    corpus_subset = corpus_meta[(corpus_meta['pub_year'] >= start - 5) & (corpus_meta['pub_year'] < start + 5)]
    
    if not corpus_subset.empty:
        centroid = calculate_centroid_for_subset(
            models=models, 
            subset_df=corpus_subset, 
            full_text_df=chunks_pd, 
            text_id_column='text_id', 
            text_column='chunk'
        )
        label = f"{start}"
        return label, centroid
    return None, None

In [36]:
start_year = 1688
end_year = 1914
years = list(range(start_year, end_year + 1))

In [37]:
output_dir = path_results + '\\batch_rollingCentroids'
os.makedirs(output_dir, exist_ok=True)

In [38]:
progress_file = os.path.join(output_dir, "progress.json")

# Check for progress
if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        progress = json.load(f)
    last_index = progress.get("last_index", -1)  # Store last used index
else:
    last_index = -1

batch_size = 50
start = last_index + 1  # Start at the next index
print(f"Resuming from index {start}")

# Iterate over batches
for i in range(start, len(years), batch_size):
    batch = years[i:i + batch_size]

    results = Parallel(n_jobs=4)(
        delayed(process_year)(year) for year in tqdm(batch, desc=f"Processing index {i}")
    )

    # Save the results for the current batch
    output_file = os.path.join(output_dir, f"batch_{i}.pkl")
    with open(output_file, "wb") as f:
        pickle.dump(results, f)
    print(f"Batch starting at index {i} saved to {output_file}")

    # Update progress
    last_index = i + batch_size - 1  # Save the last processed index
    with open(progress_file, "w") as f:
        json.dump({"last_index": last_index}, f)

Resuming from index 150


Processing index 150: 100%|█████████████████████████████████████████████████████████| 50/50 [3:27:15<00:00, 248.72s/it]


Batch starting at index 150 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\batch_rollingCentroids\batch_150.pkl


Processing index 200: 100%|█████████████████████████████████████████████████████████| 27/27 [1:31:55<00:00, 204.29s/it]


Batch starting at index 200 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/ENG\batch_rollingCentroids\batch_200.pkl


In [39]:
with open(output_dir + '\\batch_0.pkl', 'rb') as f:
    results_1 = pickle.load(f)

with open(output_dir + '\\batch_50.pkl', 'rb') as f:
    results_2 = pickle.load(f)

with open(output_dir + '\\batch_100.pkl', 'rb') as f:
    results_3 = pickle.load(f)

with open(output_dir + '\\batch_150.pkl', 'rb') as f:
    results_4 = pickle.load(f)

with open(output_dir + '\\batch_200.pkl', 'rb') as f:
    results_5 = pickle.load(f)

In [40]:
all_results_ls = [results_1, results_2, results_3, results_4, results_5]

In [41]:
all_centroids = []

for r in all_results_ls:
    all_centroids.extend(r)

In [38]:
#with open(output_dir + '\\all_rollingCentroids.pkl', 'wb') as f:
#    pickle.dump(all_centroids, f)

with open(output_dir + '\\all_rollingCentroids.pkl', 'rb') as f:
    all_centroids = pickle.load(f)

In [39]:
valid_indices = [i for i, c in enumerate(all_centroids) if c[1] is not None and len(c) > 0]
valid_centroids = [np.ravel(all_centroids[i][1]) for i in valid_indices]

In [40]:
interp_func = interp1d(valid_indices, valid_centroids, axis=0, fill_value="extrapolate")

# Impute missing centroids using the interpolation function
imputed_centroids = []
imputed_years = []

for idx in range(len(all_centroids)):
    if all_centroids[idx][1] is None or len(all_centroids[idx][1]) == 0:  # Check the second part of the tuple
        print(f"Centroid {idx} is missing. Imputing with interpolation.")
        # Interpolate missing centroid using the function
        interpolated_centroid = interp_func(idx)
        # Keep the same publication year and apply the interpolated centroid
        imputed_centroids.append(interpolated_centroid)
        imputed_years.append(int(years[idx]))
    else:
        # If centroid exists, flatten the numerical array and preserve the text id
        imputed_centroids.append((np.ravel(all_centroids[idx][1])))
        imputed_years.append(int(all_centroids[idx][0]))

# Convert the list of imputed centroids into a Numpy array of centroid vectors
centroid_matrix = np.array(imputed_centroids)
print(f"Centroid matrix shape: {centroid_matrix.shape}")

Centroid 11 is missing. Imputing with interpolation.
Centroid matrix shape: (227, 100)


In [41]:
def compute_text_centroids(models, set_of_texts, df_texts, id):
    centroids = []  # List to store centroids of each text
    text_ids = []   # List to store corresponding text_id labels

    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)
        document_centroid = np.mean(chunk_vectors, axis=(0, 1)) 
        
        centroids.append(document_centroid)
        text_ids.append(text_id)
    
    centroid_matrix = np.array(centroids)
    
    return centroid_matrix, text_ids

In [42]:
text_matrix = compute_text_centroids(models, set_of_texts, chunks_pd, 'chunk_id')
print(f"Centroid matrix shape: {text_matrix[0].shape}")

Centroid matrix shape: (679, 100)


In [43]:
similarity_matrix = cosine_similarity(centroid_matrix, text_matrix[0])

In [44]:
def compute_similarity_relative(similarity_matrix, yearly_labels, text_publication_years, text_labels):
    similarities_by_text = {}
    total_valid_similarities = 0  # Initialize counter

    for idx, text_label in enumerate(text_labels):
        publication_year = text_publication_years[idx]
        
        if publication_year is None:  # Skip texts with missing publication year
            continue

        # Get the similarities for the current text
        similarities = similarity_matrix[:, idx]
        
        # Calculate relative years and corresponding similarities, filtering for the 50 years before and after
        relative_similarities = [
            (year - publication_year, similarities[i])
            for i, year in enumerate(yearly_labels)
            if year is not None and not np.isnan(similarities[i])
            and (year >= publication_year - 50 and year <= publication_year + 50)
        ]
        
        # Update the total counter with the number of valid relative similarities
        total_valid_similarities += len(relative_similarities)
        
        similarities_by_text[text_label] = {'sim': relative_similarities}

    print(f"Total valid relative similarities: {total_valid_similarities}")  # Output the count
    return similarities_by_text

In [58]:
# Alternative: only text with full range of 100 years before and after publication are considered

# def compute_similarity_relative(similarity_matrix, yearly_labels, text_publication_years, text_labels):
#     similarities_by_text = {}

#     for idx, text_label in enumerate(text_labels):
#         publication_year = text_publication_years[idx]
        
#         if publication_year is None:  # Skip texts with missing publication year
#             continue

#         # Get the similarities for the current text
#         similarities = similarity_matrix[:, idx]
        
#         # Calculate relative years and corresponding similarities, filtering for the 50 years before and after
#         relative_similarities = [
#             (year - publication_year, similarities[i])
#             for i, year in enumerate(yearly_labels)
#             if year is not None and not np.isnan(similarities[i])
#             and (year >= publication_year - 50 and year <= publication_year + 50)
#         ]
        
#         # Check if there is a full 100-year span (50 years before and 50 years after)
#         years_before = [year for year, _ in relative_similarities if year < 0]
#         years_after = [year for year, _ in relative_similarities if year > 0]
        
#         if len(years_before) >= 50 and len(years_after) >= 50:
#             similarities_by_text[text_label] = {'sim': relative_similarities}
    
#     return similarities_by_text

In [45]:
publications_years = []
text_labels = []

for text_id in set_of_texts:
    pubyear_series = corpus_meta['pub_year'][corpus_meta['text_id'] == text_id]
    if not pubyear_series.empty:
        pubyear = pubyear_series.iloc[0]
        publications_years.append(pubyear)
    else:
        publications_years.append(None)  # or handle it as needed
        print(text_id)
    label_series = corpus_meta['text_id'][corpus_meta['text_id'] == text_id]
    if not label_series.empty:
        label = label_series.iloc[0]
        text_labels.append(label)
    else:
        text_labels.append(None)  # or handle it as needed
        print(text_id)

In [46]:
similarities_by_text = compute_similarity_relative(similarity_matrix, imputed_years, publications_years, text_labels)

Total valid relative similarities: 58315


In [52]:
rows = []
for text, values in similarities_by_text.items():
    for idx, sim in values['sim']:
        rows.append((text, idx, sim))

sim_for_each_text = pd.DataFrame(rows, columns=['Text', 'Index', 'Similarity'])

In [53]:
sim_for_each_text.to_csv(path_results + '\\sim_for_each_text_ENG.csv')

In [61]:
canonisation_score = pd.read_csv(path_data + '\\scores/canonisation_scores/ENG/ENG_canonisationScore.csv', encoding='UTF-8')

In [62]:
for key, values in similarities_by_text.items():
    matching_row = canonisation_score['canonisation_score'][canonisation_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['canonisation_score'] = score

In [63]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Canonisation Score': []
}

for text, values in similarities_by_text.items():
    canon_score = values['canonisation_score']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Canonisation Score'].append(canon_score)

df = pd.DataFrame(data)

In [64]:
df_mean = df.groupby(['Year'])['Similarity'].agg(['mean', 'std']).reset_index()

In [65]:
df_mean.to_csv(path_results + '\\rollingCentroids_mean_ENG.csv')

In [66]:
# Define bins based on specified thresholds
bins = [0, 0.25, 0.5, 0.75, 1.0]
labels = ['Low (0-0.25)', 'Mid-Low (0.25-0.5)', 'Mid-High (0.5-0.75)', 'High (0.75-1.0)']

# Categorize canonisation scores into specified ranges
df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

# Aggregate similarities by year and score range
grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [67]:
df.to_csv(path_results + '\\rollingCentroids_canonisation_sim.csv')

In [68]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_ENG.csv')

In [69]:
# Define bins based on specified thresholds
bins = [0, 0.25, 0.75, 1.0]
labels = ['Low (0-0.25)', 'Mid (0.25-0.75)', 'High (0.75-1.0)']

# Categorize canonisation scores into specified ranges
df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

# Aggregate similarities by year and score range
grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [70]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_three_ENG.csv')

In [71]:
# Define bins based on specified thresholds
bins = [0, 0.5, 1.0]
labels = ['Low (0-0.5)', 'High (0.5-1.0)']

# Categorize canonisation scores into specified ranges
df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

# Aggregate similarities by year and score range
grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [72]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_binary_ENG.csv')

In [66]:
reception_score = pd.read_csv(path_data + '\\scores/reception_scores/reception_scores_classes_ENG.csv', encoding='UTF-8')

In [67]:
for key, values in similarities_by_text.items():
    matching_row = reception_score['circl_binary'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['circl_binary'] = score
    
    matching_row = reception_score['reviews_binary'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['reviews_binary'] = score

In [68]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Circulating Library': [],
    'Review': []
}

for text, values in similarities_by_text.items():
    circl_binary = values['circl_binary']
    review = values['reviews_binary']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Circulating Library'].append(circl_binary)
        data['Review'].append(review)

df = pd.DataFrame(data)

In [69]:
grouped = df.groupby(['Year', 'Circulating Library'])['Similarity'].agg(['mean', 'std']).reset_index()
grouped.to_csv(path_results + '\\rollingCentroids_reception_circllibs_sim_groupeddf_ENG.csv')

In [70]:
grouped = df.groupby(['Year', 'Review'])['Similarity'].agg(['mean', 'std']).reset_index()
grouped.to_csv(path_results + '\\rollingCentroids_reception_reviews_sim_groupeddf_ENG.csv')

In [71]:
for key, values in similarities_by_text.items():
    matching_row = reception_score['class'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['reception_class'] = score

In [72]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Reception Class': []
}

for text, values in similarities_by_text.items():
    reception_class = values['reception_class']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Reception Class'].append(reception_class)

df = pd.DataFrame(data)

In [73]:
df.to_csv(path_results + '\\rollingCentroids_reception_sim.csv')

In [74]:
grouped = df.groupby(['Year', 'Reception Class'])['Similarity'].mean().reset_index()

In [75]:
grouped.to_csv(path_results + '\\rollingCentroids_reception_sim_groupeddf_ENG.csv')

In [76]:
def compute_chunk_to_centroid_similarities(models, df_texts, rolling_centroids, set_of_texts, id):
    all_similarities = []  # To store similarities for each rolling centroid

    for text_id, centroid in zip(set_of_texts, rolling_centroids):
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)

        # Flatten the chunk vectors for distance calculation
        chunk_vectors_flat = np.vstack(chunk_vectors)
        
        # Compute cosine similarities of chunks to the centroid
        cosine_similarities = cosine_similarity(chunk_vectors_flat, centroid.reshape(1, -1))
            
        # Store the distances
        all_similarities.append(cosine_similarities)
    
    return all_similarities

In [77]:
chunk_to_centroid_similarities = compute_chunk_to_centroid_similarities(models, chunks_pd, imputed_centroids, set_of_texts, 'chunk_id')

In [78]:
mean_chunk_to_centroid_similarities = np.array([np.mean(sim) for sim in chunk_to_centroid_similarities])
std_similarities = np.array([np.std(sim) for sim in chunk_to_centroid_similarities])
lower_percentiles = [np.percentile(sim, 25) for sim in chunk_to_centroid_similarities]
upper_percentiles = [np.percentile(sim, 75) for sim in chunk_to_centroid_similarities]
num_chunks = [len(sim) for sim in chunk_to_centroid_similarities]

In [79]:
years_np = np.asarray(years, dtype=np.float32)

In [80]:
df_chunk_to_centroid_similarities= pd.DataFrame(np.column_stack((mean_chunk_to_centroid_similarities, std_similarities, 
                                                                 lower_percentiles, upper_percentiles, years_np, num_chunks)), 
                                                columns=['mean', 'std', 'Q1', 'Q4', 'year', 'num_chunks'])

In [81]:
df_chunk_to_centroid_similarities.to_csv(path_results + '\\df_chunk_to_centroid_similarities.csv')

In [82]:
cosine_similarities = [
    cosine_similarity([imputed_centroids[i]], [imputed_centroids[i + 1]])[0][0]
    for i in range(len(imputed_centroids) - 1)
]

In [83]:
df_cosine_similarities = pd.DataFrame(np.column_stack((cosine_similarities, years_np[1:], num_chunks[1:])), 
                                      columns=['cosine_similarity','year', 'num_chunks'])

In [84]:
df_cosine_similarities.to_csv(path_results + '\\df_centroid_to_centroid_similarities.csv')

In [35]:
canonisation_score = pd.merge(canonisation_score, corpus_meta, 'left', left_on='ID', right_on='text_id')

In [86]:
results = []

for text in similarity_df.index:
    # Get publication year and canonisation score for the current text
    current_row = canonisation_score.loc[canonisation_score['ID'] == text]
    pub_year = current_row['pub_year'].values[0]
    canonisation = current_row['canonisation_score'].values[0]
    
    # Get texts published before the current text
    valid_texts = canonisation_score[canonisation_score['pub_year'] < pub_year]['ID'].tolist()
    
    # Get similarities for the current text and filter by valid texts
    similarities = similarity_df.loc[text]
    similarities = similarities.loc[valid_texts]
    
    if not similarities.empty:
        # Find the nearest neighbour
        nearest_neighbour = similarities.idxmax()  # Index of max similarity
        similarity_score = similarities.max()      # Corresponding similarity value
        
        # Append results only if NearestNeighbour and SimilarityScore are valid
        results.append({
            'Text': text,
            'CanonisationScore': canonisation,
            'NearestNeighbour': nearest_neighbour,
            'SimilarityScore': similarity_score
        })

In [87]:
# Convert results to a DataFrame
nearest_neighbours_df = pd.DataFrame(results)

In [88]:
nearest_neighbours_df.to_csv(path_results + '\\nearest_neighbours_1_df_ENG.csv')

In [28]:
def decay_function(time_diff, lambda_param):
    """
    Apply exponential decay to the time difference between texts.
    
    Parameters:
    - time_diff (int): The time difference between two texts.
    - lambda_param (float): The decay constant (controls how quickly the weight decays).
    
    Returns:
    - weight (float): The weight of the relationship after applying decay.
    """
    return np.exp(-lambda_param * time_diff)

In [29]:
def calculate_similarity_weight(pub_year_text1, pub_year_text2, lambda_param=0.1):
    """
    Calculate the similarity weight between two texts based on their publication years
    using a decay function.
    
    Parameters:
    - pub_year_text1 (int): Publication year of the first text.
    - pub_year_text2 (int): Publication year of the second text.
    - lambda_param (float): The decay constant.
    
    Returns:
    - similarity_weight (float): The similarity weight based on temporal decay.
    """
    # Calculate the time difference between the two texts
    time_diff = abs(pub_year_text1 - pub_year_text2)
    
    # Calculate the similarity weight using the decay function
    similarity_weight = decay_function(time_diff, lambda_param)
    
    return similarity_weight

In [36]:
results = []

lambda_param = 0.01

# Number of nearest neighbours to retrieve
num_neighbours = 3

# Loop through the similarity dataframe
for text in similarity_df.index:
    # Get publication year and canonisation score for the current text
    current_row = canonisation_score.loc[canonisation_score['ID'] == text]
    pub_year = current_row['pub_year'].values[0]
    canonisation = current_row['canonisation_score'].values[0]
    
    # Get texts published before the current text
    valid_texts = canonisation_score[canonisation_score['pub_year'] < pub_year]['ID'].tolist()
    
    # Get similarities for the current text and filter by valid texts
    similarities = similarity_df.loc[text]
    similarities = similarities.loc[valid_texts]
    
    if not similarities.empty:
        # Apply temporal decay to each similarity score
        weighted_similarities = []
        
        for neighbour, similarity in similarities.items():
            # Get publication year for the neighbour text
            neighbour_pub_year = canonisation_score.loc[canonisation_score['ID'] == neighbour, 'pub_year'].values[0]
            
            # Calculate the temporal similarity weight
            time_weight = calculate_similarity_weight(pub_year, neighbour_pub_year, lambda_param)
            
            # Apply the time weight to the similarity score
            weighted_similarity = similarity * time_weight
            weighted_similarities.append((neighbour, weighted_similarity))
        
        # Sort by weighted similarity and retrieve the top `num_neighbours` texts
        sorted_similarities = sorted(weighted_similarities, key=lambda x: x[1], reverse=True)
        
        # Take the top `num_neighbours` entries
        top_neighbours = sorted_similarities[:num_neighbours]
        
        # Append the results
        for neighbour, similarity_score in top_neighbours:
            results.append({
                'Text': text,
                'CanonisationScore': canonisation,
                'NearestNeighbour': neighbour,
                'SimilarityScore': similarity_score
            })

In [37]:
# Convert results to a DataFrame
nearest_neighbours_decay_df = pd.DataFrame(results)

In [38]:
nearest_neighbours_decay_df.to_csv(path_results + '\\nearest_neighbours_3_decay=0.01_df_ENG.csv')