# Modelling Literary History (GER)

--- Last updated: 2025-02-10 ---

In [1]:
import spacy
import pandas as pd
import re
from pathlib import Path
import numpy as np
from scipy.spatial import distance
from statistics import mean 
from sklearn.metrics.pairwise import euclidean_distances
import os
import json

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import collections
import random
from scipy.spatial import distance_matrix
from heapq import nsmallest 
import pickle

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.manifold import TSNE
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
import string  
from itertools import chain

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

from joblib import Parallel, delayed
from tqdm import tqdm

from scipy.interpolate import interp1d

In [2]:
# preprocess(texts, nlp_model, chunk_size, sentence_punct)
#
# Preprocesses a list of texts by splitting them up in smaller chunks.
# @param texts list of string objects
# @param nlp_model pre-loaded spacy nlp model
# @param chunk_size int defining the approximate size of the chunks (the chunks end with the next sentence punctuation)
# @param sentence_punkt string of puncuations markers that define sentence breaks, default = "?.!"
# return tuple consisting of a list of preprocessed string objects and a list of chapter ids

def preprocess(texts, nlp_model, chunk_size, chap_ids, sentence_punct="?.!"):
    index = 0
    texts_out = []
    chap_id = []
    for text in texts:
        doc = nlp_model(text)
        new_text = [[tok.text.lower() for tok in sents 
             if tok.text != " " 
             and tok.text != "\n"] for sents in doc.sents]
        new_text = [item for sublist in new_text for item in sublist]
        if new_text:
            paragraphs = []
            start = 0
            end = chunk_size
            x = 0
            while end + 100 <= len(new_text)-1:
                while new_text[end] not in sentence_punct:
                    end += 1
                paragraphs.append(new_text[start:end+1])
                new_id = "_".join([chap_ids[index], str(x)])
                chap_id.append(new_id)
                start = end + 2
                end += chunk_size
                x += 1
            texts_out.append(paragraphs)
        print('Doc ' + str(index) + ' of ' + str(len(texts)-1) + ' has been preprocessed!')
        index += 1
    return(texts_out, chap_id)

### Preprocessing

In [3]:
corpus_path = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/corpora"
path_results = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER"
path_data = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/data"
path_pickled = "C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER/pickled"

In [4]:
paths = [path_results, path_data, path_pickled]

for path in paths:
    if not os.path.exists(path):
        os.makedirs(path)

In [5]:
corpus_dir = Path(corpus_path + '\\GER').glob('*.txt')
files = list(corpus_dir)

texts = []
for file in files:
    with open (file, encoding = "UTF-8") as f:
        texts.append(f.read()) 

In [6]:
texts_clean = [word.replace('\n',' ') for word in texts]
texts_clean = [word.lower() for word in texts_clean]

In [7]:
ids = [re.sub(r'C:.+\\(.+).txt', '\\1', str(file)) for file in files]

In [8]:
nlp = spacy.load("de_core_news_lg")

In [9]:
nlp.max_length = 50000000

In [18]:
start = 0
step = 10
stop = len(texts_clean)

list_preprocessed_texts = []

while start < stop:
    end = min(start + step, stop)
    print('\n\n----------------!!! New round !!!----------------\n\n')
    print('Start index: ', start)
    print('Stop index: ', end, '\n\n')
    x = preprocess(texts_clean[start:end], nlp, 500, ids[start:end])
    list_preprocessed_texts.append(x)
    start += step



----------------!!! New round !!!----------------


Start index:  0
Stop index:  10 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has been preprocessed!
Doc 2 of 9 has been preprocessed!
Doc 3 of 9 has been preprocessed!
Doc 4 of 9 has been preprocessed!
Doc 5 of 9 has been preprocessed!
Doc 6 of 9 has been preprocessed!
Doc 7 of 9 has been preprocessed!
Doc 8 of 9 has been preprocessed!
Doc 9 of 9 has been preprocessed!


----------------!!! New round !!!----------------


Start index:  10
Stop index:  20 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has been preprocessed!
Doc 2 of 9 has been preprocessed!
Doc 3 of 9 has been preprocessed!
Doc 4 of 9 has been preprocessed!
Doc 5 of 9 has been preprocessed!
Doc 6 of 9 has been preprocessed!
Doc 7 of 9 has been preprocessed!
Doc 8 of 9 has been preprocessed!
Doc 9 of 9 has been preprocessed!


----------------!!! New round !!!----------------


Start index:  20
Stop index:  30 


Doc 0 of 9 has been preprocessed!
Doc 1 of 9 has be

In [10]:
#with open(path_pickled + '\\list_preprocessed_texts_chunks=500_GER.pkl', 'wb') as f:
#   pickle.dump(list_preprocessed_texts, f)

with open(path_pickled + '\\list_preprocessed_texts_chunks=500_GER.pkl', 'rb') as f:
    list_preprocessed_texts = pickle.load(f)

In [11]:
list_chunks = []
list_ids = []

for l in list_preprocessed_texts:
    list_chunks.append([item for item in l[0]])
    list_ids.append([item for item in l[1]])

In [12]:
ids = list(chain(*list_ids))

In [13]:
all_chunks = []

counter = 0
for text in list_chunks:
    for l in text:
        if len(l) != 0:
            for x in l:
                all_chunks.append([x, ids[counter]])
                #print(ids[counter])
                counter += 1

In [10]:
#with open(path_pickled + '\\preprocessed_texts_chunks=500_GER.pkl', 'wb') as f:
#    pickle.dump(all_chunks, f)
    
with open(path_pickled + '\\preprocessed_texts_chunks=500_GER.pkl', 'rb') as f:
    all_chunks = pickle.load(f)

In [11]:
chunks_ls = []

for x in all_chunks:
    chunks_ls.append(x[0])
                       
ids_ls = []

for x in all_chunks:
    ids_ls.append(x[1])

In [12]:
chunks_pd = pd.DataFrame(list(zip(ids_ls, chunks_ls)),
                         columns=['chunk_id', 'chunk'])

In [13]:
chunks_pd['text_id'] = chunks_pd['chunk_id'].str.replace('(.+)_(\\d+)', '\\1', regex=True)

In [14]:
doc_chunks = chunks_pd.apply(
    lambda r: TaggedDocument(words=r['chunk'], tags=[r.chunk_id]), axis=1)

In [15]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [16]:
iterations = 5

In [17]:
we_results = path_results + '\\model_iteration=' + str(iterations)
if not os.path.exists(we_results):
    os.makedirs(we_results)

In [35]:
for i in range(1, iterations+1):
    model = Doc2Vec(vector_size = 100, window = 5, min_count=5, seed=i)

    model.build_vocab(doc_chunks.values)
    model.train(doc_chunks, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Save each model
    model.save(we_results + '\\doc2vec_' + str(i) + '.model')

2025-02-10 22:44:29,082 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc5,s0.001,t3>', 'datetime': '2025-02-10T22:44:29.082836', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-02-10 22:44:29,098 : INFO : collecting all words and their counts
2025-02-10 22:44:29,100 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-02-10 22:44:30,813 : INFO : PROGRESS: at example #10000, processed 5188297 words (3029794 words/s), 165015 word types, 10000 tags
2025-02-10 22:44:32,616 : INFO : PROGRESS: at example #20000, processed 10360222 words (2870800 words/s), 246855 word types, 20000 tags
2025-02-10 22:44:34,777 : INFO : PROGRESS: at example #30000, processed 15531830 words (2389448 words/s), 311173 word types, 30000 tags
2025-02-10 22:44:37,003 : INFO : PROGRESS: at example #40000, processed

In [36]:
we_results_control = path_results + '\\control_model_iteration=' + str(iterations)
if not os.path.exists(we_results_control):
    os.makedirs(we_results_control)

In [37]:
for i in range(1, iterations+1):
    model = Doc2Vec(vector_size = 100, window = 5, min_count=5, seed=i)

    model.build_vocab(doc_chunks.values)
    model.train(doc_chunks, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Save each model
    model.save(we_results_control + '\\doc2vec_' + str(i) + '.model')

2025-02-10 23:08:04,649 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc5,s0.001,t3>', 'datetime': '2025-02-10T23:08:04.649075', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-02-10 23:08:04,703 : INFO : collecting all words and their counts
2025-02-10 23:08:04,705 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-02-10 23:08:06,861 : INFO : PROGRESS: at example #10000, processed 5188297 words (2405564 words/s), 165015 word types, 10000 tags
2025-02-10 23:08:09,133 : INFO : PROGRESS: at example #20000, processed 10360222 words (2280724 words/s), 246855 word types, 20000 tags
2025-02-10 23:08:11,310 : INFO : PROGRESS: at example #30000, processed 15531830 words (2376483 words/s), 311173 word types, 30000 tags
2025-02-10 23:08:13,650 : INFO : PROGRESS: at example #40000, processed

In [18]:
models = [Doc2Vec.load(we_results + '\\doc2vec_' + str(i+1) + '.model') for i in range(iterations)]

2025-02-21 16:20:07,282 : INFO : loading Doc2Vec object from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER\model_iteration=5\doc2vec_1.model
2025-02-21 16:20:07,480 : INFO : loading dv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER\model_iteration=5\doc2vec_1.model.dv.* with mmap=None
2025-02-21 16:20:07,482 : INFO : loading wv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER\model_iteration=5\doc2vec_1.model.wv.* with mmap=None
2025-02-21 16:20:07,483 : INFO : loading vectors from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER\model_iteration=5\doc2vec_1.model.wv.vectors.npy with mmap=None
2025-02-21 16:20:07,544 : INFO : loading syn1neg from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_lithist/GER\model_iteration=5\doc2vec_1.model.syn1neg.npy with mmap=None
2025-02-21 16:20:07,60

In [39]:
control_models = [Doc2Vec.load(we_results_control + '\\doc2vec_' + str(i+1) + '.model') for i in range(iterations)]

2025-02-11 00:03:11,348 : INFO : loading Doc2Vec object from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\control_model_iteration=5\doc2vec_1.model
2025-02-11 00:03:11,545 : INFO : loading dv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\control_model_iteration=5\doc2vec_1.model.dv.* with mmap=None
2025-02-11 00:03:11,546 : INFO : loading wv recursively from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\control_model_iteration=5\doc2vec_1.model.wv.* with mmap=None
2025-02-11 00:03:11,547 : INFO : loading vectors from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\control_model_iteration=5\doc2vec_1.model.wv.vectors.npy with mmap=None
2025-02-11 00:03:11,588 : INFO : loading syn1neg from C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\control_model_iteration=5\doc2vec_1.model.syn1

In [19]:
def create_dv_list(models, df_texts, filtered_indices, id):
    matrix = []

    for i in filtered_indices:
        doc_id = df_texts.loc[i, id]
        chunk_vectors = [model.dv[doc_id] for model in models]
        matrix.append(chunk_vectors)
    
    return np.array(matrix)  # Shape: (num_chunks, num_models, vector_size)

In [20]:
def compute_centroid_similarity(models, set_of_texts, df_texts, id):
    centroids = []
    text_titles = []

    for text_id in set_of_texts:
        # Retrieve indices for all chunks of the document
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        
        # Get document vectors for all chunks across all models
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)
        
        # Compute centroid by averaging across all chunks and models
        document_centroid = np.mean(chunk_vectors, axis=(0, 1))  # Average over chunks and models
        centroids.append(document_centroid)
        text_titles.append(text_id)
    
    # Calculate cosine similarity between centroids
    similarity_matrix = cosine_similarity(centroids)
    
    # Set diagonal to 0
    np.fill_diagonal(similarity_matrix, 0)
    
    # Optionally, return as a DataFrame for better readability with text titles as labels
    similarity_df = pd.DataFrame(similarity_matrix, index=text_titles, columns=text_titles)
    
    return similarity_df

In [21]:
set_of_texts = set(chunks_pd['text_id'])

In [22]:
similarity_df = compute_centroid_similarity(models, set_of_texts, chunks_pd, 'chunk_id')
#similarity_df.to_csv(path_results + '\\doc2vec_adjacency_GER.csv', encoding='UTF-8') 

In [None]:
similarity_control_df = compute_centroid_similarity(control_models, set_of_texts, chunks_pd, 'chunk_id')
#similarity_control_df.to_csv(path_results + '\\doc2vec_adjacency_control_models_GER.csv', encoding='UTF-8') 

### Intra-textual variance

The intra-textual variance of a text is measured by the the Eucledian distance of its chunks to its centroid. The result is summarised as the mean of squared distances. 

In [45]:
def intra_textual_variance(models, set_of_texts, df_texts, id):
    mean_squared_distances = []

    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        matrix = create_dv_list(models, df_texts, filtered_indices, id)
        centroid = np.mean(matrix, axis=(0, 1))  # Average over chunks and models
        
        squared_distances = [
            distance.euclidean(chunk_vector.flatten(), centroid) ** 2
            for chunk_vector in matrix.reshape(-1, matrix.shape[2])
        ]
        
        mean_squared_distances.append(np.mean(squared_distances))
    
    return mean_squared_distances

In [46]:
intra_textual_variance = intra_textual_variance(models, set_of_texts, chunks_pd, 'chunk_id')

### Stepwise distance

The stepwise distance measures gradual changes of a text by calculating the Eucledian distances between consecutive chunks, summarised the mean of squared distances. 

In [47]:
def stepwise_distance(models, set_of_texts, df_texts, id):
    all_model_distances = []

    for s in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == s].tolist()
        matrices = create_dv_list(models, df_texts, filtered_indices, id)
        model_distances = []
        for matrix in matrices:
            n_chunks = matrix.shape[0]
            
            if n_chunks == 1:
                model_distances.append(0)  # No distance if only one chunk
                continue
            
            squared_distances = [
                distance.euclidean(matrix[i], matrix[i + 1]) ** 2
                for i in range(n_chunks - 1)
            ]
            model_distances.append(mean(squared_distances))        
        all_model_distances.append(mean(model_distances))

    return all_model_distances

In [48]:
stepwise_distance = stepwise_distance(models, set_of_texts, chunks_pd, 'chunk_id')

In [49]:
stepwise_distance

[47.36115605120619,
 48.47470149280319,
 41.86271211195424,
 39.8888894370949,
 41.67780311536539,
 70.67649835852558,
 55.827214489294626,
 48.58163435817601,
 42.3024414346353,
 53.3233212269795,
 49.32041200506156,
 44.69160810887551,
 45.64902988210415,
 45.03655348659672,
 39.92074051655701,
 47.985969950276875,
 42.61996415924564,
 53.35305139595727,
 50.74252750110251,
 50.0455668494289,
 44.478746250076924,
 53.77687543016851,
 49.69705818007989,
 41.02924706519229,
 44.84436164040677,
 45.25994662097023,
 45.491166880077685,
 45.485544350666906,
 44.32354611401321,
 59.98527282353309,
 36.161529756948035,
 41.00136947175002,
 44.83973884923964,
 50.820280163220545,
 48.94853195601379,
 46.09979890524383,
 45.2842024573763,
 46.72575946087212,
 38.23518916013479,
 67.97727928517136,
 44.59420739280349,
 52.4396358165087,
 61.39982305215079,
 56.10766141857462,
 55.807605254996396,
 42.45130993045923,
 37.83794678957062,
 49.41803054251742,
 47.61130740958595,
 38.73378732909935

### Outlier score
The outlier score compares the centroids of all text chunks and identifies the distance to the nearest neighbour.

In [50]:
def outlier_score(models, set_of_texts, df_texts, id):
    list_of_arrays = []
    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        list_of_arrays.append(create_dv_list(models, df_texts, filtered_indices, id))

    centroids = [np.mean(array, axis=(0, 1)) for array in list_of_arrays]
    dist = pd.DataFrame(euclidean_distances(centroids, centroids))
    
    minimal_distances = []
    dist = dist.replace(0, np.nan)
    for i in range(dist.shape[0]):
        minimal_distances.append(np.nanmin(dist[i]))
    
    return minimal_distances

In [51]:
outlier_score = outlier_score(models, set_of_texts, chunks_pd, 'chunk_id')

### Overlap score
The overlap score is the fraction of the k-nearest chunks to a centroid are identified (with k being the number of chunks of the text in question) that do not belong to the text itself. 

In [52]:
def overlap_score(models, set_of_texts, df_texts, id):
    list_of_arrays = []
    num_of_chunks = []
    all_filtered_indices = []

    # Step 1: Use create_dv_list to gather chunk vectors for each document
    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        all_filtered_indices.append(filtered_indices)
        list_of_arrays.append(create_dv_list(models, df_texts, filtered_indices, id))
        num_of_chunks.append(len(filtered_indices))
    
    # Step 2: Calculate centroids for each document by averaging across chunks and models
    centroids = [np.mean(array, axis=(0, 1)) for array in list_of_arrays]  # 1-D centroids

    overlap_scores = []

    # Flatten each chunk vector across all documents into a single list, averaged across models
    all_chunks = [np.mean(chunk, axis=0) for doc_vectors in list_of_arrays for chunk in doc_vectors]

    for i, centroid in enumerate(centroids):
        # Calculate distances from the centroid to each chunk in all documents
        distances = {j: distance.euclidean(centroid, chunk) for j, chunk in enumerate(all_chunks)}
        
        # Get indices of k-nearest chunks to the centroid (k = number of chunks in the document)
        smallest_dist_indices = nsmallest(num_of_chunks[i], distances, key=distances.get)
        
        # Count how many of the k-nearest chunks do not belong to the document itself
        overlap_count = len([idx for idx in smallest_dist_indices if idx not in all_filtered_indices[i]])
        
        # Calculate overlap score as the fraction of out-of-document chunks among k-nearest neighbors
        overlap_scores.append(overlap_count / num_of_chunks[i])

    return overlap_scores

In [53]:
overlap_score = overlap_score(models, set_of_texts, chunks_pd, 'chunk_id')

In [54]:
df_scores = pd.DataFrame(list(zip(set_of_texts, intra_textual_variance, stepwise_distance, outlier_score, overlap_score)),
                         columns =['text_id', 'intra_textual_variance', 'stepwise_distance', 'outlier_score', 'overlap_score'])

In [55]:
df_scores.to_csv(path_results + '\\doc2vec_scores_GER.csv')

In [23]:
corpus_meta = pd.read_csv(corpus_path + '/GER_corpus.csv', sep = ';', encoding='UTF-8')

In [24]:
corpus_meta = corpus_meta.copy()
corpus_meta['text_id'] = corpus_meta['wikiname'] + '_' + corpus_meta['wikiID']

In [54]:
shorts = []
for file in files:
    short = re.sub(r'.+\\GER\\(.+).txt', '\\1', str(file))
    if short not in list(corpus_meta['text_id']):
        print(short)
    shorts.append(short)

In [55]:
for e in list(corpus_meta['text_id']):
    if e not in shorts:
        print(e)

In [56]:
def fetch_texts(subset_df, text_column, full_text_df, text_id_column):
    text_ids = subset_df[text_id_column].tolist()
    texts = full_text_df[full_text_df[text_id_column].isin(text_ids)][text_column].tolist()
    return texts

In [57]:
def get_vectors_from_models(models, texts):
    all_vectors = []
    for model in models:
        # Directly use tokenized lists
        vectors = [model.infer_vector(text) for text in texts]
        all_vectors.append(vectors)
    return np.array(all_vectors).transpose(1, 0, 2)  # Shape: (num_texts, num_models, vector_size)

In [58]:
def compute_centroid(vectors):
    return np.mean(vectors, axis=(0, 1))  # Mean over both texts and models

In [59]:
def calculate_centroid_for_subset(models, subset_df, full_text_df, text_id_column, text_column):
    # Fetch texts based on metadata
    texts = fetch_texts(subset_df, text_column, full_text_df, text_id_column)
    
    # Generate vectors for the texts
    vectors = get_vectors_from_models(models, texts)  # Shape: (num_texts, num_models, vector_size)
    
    # Compute and return centroid
    return compute_centroid(vectors)

In [60]:
def process_year(start):
    # Track progress within the worker
    print(f"Processing year range: {start-5} to {start+5}")
    
    # Filter the subset
    corpus_subset = corpus_meta[(corpus_meta['pub_year'] >= start - 5) & (corpus_meta['pub_year'] < start + 5)]
    
    if not corpus_subset.empty:
        centroid = calculate_centroid_for_subset(
            models=models, 
            subset_df=corpus_subset, 
            full_text_df=chunks_pd, 
            text_id_column='text_id', 
            text_column='chunk'
        )
        label = f"{start}"
        return label, centroid
    return None, None

In [25]:
start_year = 1688
end_year = 1914
years = list(range(start_year, end_year + 1))

In [26]:
output_dir = path_results + '\\batch_rollingCentroids'
os.makedirs(output_dir, exist_ok=True)

In [84]:
progress_file = os.path.join(output_dir, "progress.json")

# Check for progress
if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        progress = json.load(f)
    last_index = progress.get("last_index", -1)  # Store last used index
else:
    last_index = -1

batch_size = 50
start = last_index + 1  # Start at the next index
print(f"Resuming from index {start}")

# Iterate over batches
for i in range(start, len(years), batch_size):
    batch = years[i:i + batch_size]

    results = Parallel(n_jobs=4)(
        delayed(process_year)(year) for year in tqdm(batch, desc=f"Processing index {i}")
    )

    # Save the results for the current batch
    output_file = os.path.join(output_dir, f"batch_{i}.pkl")
    with open(output_file, "wb") as f:
        pickle.dump(results, f)
    print(f"Batch starting at index {i} saved to {output_file}")

    # Update progress
    last_index = i + batch_size - 1  # Save the last processed index
    with open(progress_file, "w") as f:
        json.dump({"last_index": last_index}, f)

Resuming from index 0








Processing index 0:   0%|                                                                       | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A





Processing index 0:  16%|██████████                                                     | 8/50 [05:50<30:42, 43.87s/it][A[A[A[A[A[A





Processing index 0:  24%|██████████████▉                                               | 12/50 [07:24<22:21, 35.31s/it][A[A[A[A[A[A





Processing index 0:  32%|███████████████████▊                                          | 16/50 [09:29<19:08, 33.78s/it][A[A[A[A[A[A





Processing index 0:  40%|████████████████████████▊                                     | 20/50 [11:45<16:56, 33.88s/it][A[A[A[A[A[A





Processing index 0:  48%|█████████████████████████████▊                                | 24/50 [12:57<12:24, 28.64s/it][A[A[A[A[A[A





Processing index 0:  56%|██████████████████████████████████▋                           | 28/50 [13:32<08:08, 22.22s/it][A[A[A[A[A[

Batch starting at index 0 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\batch_rollingCentroids\batch_0.pkl








Processing index 50:   0%|                                                                                                          | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A





Processing index 50:   2%|█▉                                                                                                | 1/50 [00:00<00:05,  8.42it/s][A[A[A[A[A[A





Processing index 50:  16%|███████████████▋                                                                                  | 8/50 [01:11<06:30,  9.31s/it][A[A[A[A[A[A





Processing index 50:  24%|███████████████████████▎                                                                         | 12/50 [02:28<08:34, 13.54s/it][A[A[A[A[A[A





Processing index 50:  32%|███████████████████████████████                                                                  | 16/50 [03:26<07:53, 13.93s/it][A[A[A[A[A[A





Processing index 50:  40%|██████████████████████████████████████▊                                  

Batch starting at index 50 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\batch_rollingCentroids\batch_50.pkl








Processing index 100:   0%|                         | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A





Processing index 100:  16%|██▋              | 8/50 [04:15<22:20, 31.92s/it][A[A[A[A[A[A





Processing index 100:  24%|███▊            | 12/50 [10:47<37:40, 59.50s/it][A[A[A[A[A[A





Processing index 100:  32%|█████           | 16/50 [20:14<51:03, 90.12s/it][A[A[A[A[A[A





Processing index 100:  40%|██████         | 20/50 [28:40<51:23, 102.79s/it][A[A[A[A[A[A





Processing index 100:  48%|███████▋        | 24/50 [33:26<40:02, 92.39s/it][A[A[A[A[A[A





Processing index 100:  56%|████████▉       | 28/50 [37:54<30:53, 84.27s/it][A[A[A[A[A[A





Processing index 100:  64%|██████████▏     | 32/50 [42:54<24:24, 81.35s/it][A[A[A[A[A[A





Processing index 100:  72%|███████████▌    | 36/50 [50:42<21:33, 92.38s/it][A[A[A[A[A[A





Processing index 100:  80%|████████████▊   | 40/50 [56:46<15:19, 91.97s/it][A[A[A[A[A[A





Proc

Batch starting at index 100 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\batch_rollingCentroids\batch_100.pkl








Processing index 150:   0%|                         | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A





Processing index 150:  16%|██▋              | 8/50 [10:20<54:17, 77.55s/it][A[A[A[A[A[A





Processing index 150:  24%|███          | 12/50 [22:07<1:15:19, 118.92s/it][A[A[A[A[A[A





Processing index 150:  32%|████▏        | 16/50 [33:37<1:18:41, 138.86s/it][A[A[A[A[A[A





Processing index 150:  40%|█████▏       | 20/50 [44:05<1:12:34, 145.16s/it][A[A[A[A[A[A





Processing index 150:  48%|██████▏      | 24/50 [54:42<1:04:56, 149.88s/it][A[A[A[A[A[A





Processing index 150:  56%|███████▎     | 28/50 [1:02:21<50:48, 138.56s/it][A[A[A[A[A[A





Processing index 150:  64%|████████▎    | 32/50 [1:07:22<35:35, 118.63s/it][A[A[A[A[A[A





Processing index 150:  72%|█████████▎   | 36/50 [1:14:38<26:59, 115.65s/it][A[A[A[A[A[A





Processing index 150:  80%|██████████▍  | 40/50 [1:25:05<21:22, 128.28s/it][A[A[A[A[A[A





Proc

Batch starting at index 150 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\batch_rollingCentroids\batch_150.pkl








Processing index 200:   0%|                         | 0/27 [00:00<?, ?it/s][A[A[A[A[A[A





Processing index 200:  15%|██▌              | 4/27 [00:00<00:00, 36.51it/s][A[A[A[A[A[A





Processing index 200:  15%|██▌              | 4/27 [00:21<00:00, 36.51it/s][A[A[A[A[A[A





Processing index 200:  30%|█████            | 8/27 [09:17<25:58, 82.01s/it][A[A[A[A[A[A





Processing index 200:  44%|██████▋        | 12/27 [18:40<27:11, 108.75s/it][A[A[A[A[A[A





Processing index 200:  59%|████████▉      | 16/27 [27:13<21:21, 116.53s/it][A[A[A[A[A[A





Processing index 200:  74%|███████████    | 20/27 [38:04<15:32, 133.17s/it][A[A[A[A[A[A





Processing index 200: 100%|███████████████| 27/27 [46:49<00:00, 104.06s/it][A[A[A[A[A[A


Batch starting at index 200 saved to C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/analyses/modelling_listhist/GER\batch_rollingCentroids\batch_200.pkl


In [88]:
with open(output_dir + '\\batch_0.pkl', 'rb') as f:
    results_1 = pickle.load(f)

with open(output_dir + '\\batch_50.pkl', 'rb') as f:
    results_2 = pickle.load(f)

with open(output_dir + '\\batch_100.pkl', 'rb') as f:
    results_3 = pickle.load(f)

with open(output_dir + '\\batch_150.pkl', 'rb') as f:
    results_4 = pickle.load(f)

with open(output_dir + '\\batch_200.pkl', 'rb') as f:
    results_5 = pickle.load(f)

In [89]:
all_results_ls = [results_1, results_2, results_3, results_4, results_5]

In [90]:
all_centroids = []

for r in all_results_ls:
    all_centroids.extend(r)

In [27]:
#with open(output_dir + '\\all_rollingCentroids.pkl', 'wb') as f:
#    pickle.dump(all_centroids, f)

with open(output_dir + '\\all_rollingCentroids.pkl', 'rb') as f:
    all_centroids = pickle.load(f)

In [28]:
valid_indices = [i for i, c in enumerate(all_centroids) if c[1] is not None and len(c) > 0]
valid_centroids = [np.ravel(all_centroids[i][1]) for i in valid_indices]

In [29]:
interp_func = interp1d(valid_indices, valid_centroids, axis=0, fill_value="extrapolate")

# Impute missing centroids using the interpolation function
imputed_centroids = []
imputed_years = []

for idx in range(len(all_centroids)):
    if all_centroids[idx][1] is None or len(all_centroids[idx][1]) == 0:  # Check the second part of the tuple
        print(f"Centroid {idx} is missing. Imputing with interpolation.")
        # Interpolate missing centroid using the function
        interpolated_centroid = interp_func(idx)
        # Keep the same publication year and apply the interpolated centroid
        imputed_centroids.append(interpolated_centroid)
        imputed_years.append(int(years[idx]))
    else:
        # If centroid exists, flatten the numerical array and preserve the text id
        imputed_centroids.append((np.ravel(all_centroids[idx][1])))
        imputed_years.append(int(all_centroids[idx][0]))

centroid_matrix = np.array(imputed_centroids)
print(f"Centroid matrix shape: {centroid_matrix.shape}")

Centroid 20 is missing. Imputing with interpolation.
Centroid 21 is missing. Imputing with interpolation.
Centroid 22 is missing. Imputing with interpolation.
Centroid 23 is missing. Imputing with interpolation.
Centroid 24 is missing. Imputing with interpolation.
Centroid 25 is missing. Imputing with interpolation.
Centroid 26 is missing. Imputing with interpolation.
Centroid 27 is missing. Imputing with interpolation.
Centroid 28 is missing. Imputing with interpolation.
Centroid 29 is missing. Imputing with interpolation.
Centroid 30 is missing. Imputing with interpolation.
Centroid 31 is missing. Imputing with interpolation.
Centroid 32 is missing. Imputing with interpolation.
Centroid 33 is missing. Imputing with interpolation.
Centroid 34 is missing. Imputing with interpolation.
Centroid 35 is missing. Imputing with interpolation.
Centroid 36 is missing. Imputing with interpolation.
Centroid 37 is missing. Imputing with interpolation.
Centroid 38 is missing. Imputing with interpol

In [30]:
def compute_text_centroids(models, set_of_texts, df_texts, id):
    centroids = []  # List to store centroids of each text
    text_ids = []   # List to store corresponding text_id labels

    for text_id in set_of_texts:
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)
        document_centroid = np.mean(chunk_vectors, axis=(0, 1)) 
        
        centroids.append(document_centroid)
        text_ids.append(text_id)
    
    centroid_matrix = np.array(centroids)
    
    return centroid_matrix, text_ids

In [31]:
text_matrix = compute_text_centroids(models, set_of_texts, chunks_pd, 'chunk_id')
print(f"Centroid matrix shape: {text_matrix[0].shape}")

Centroid matrix shape: (571, 100)


In [32]:
similarity_matrix = cosine_similarity(centroid_matrix, text_matrix[0])

In [33]:
def compute_similarity_relative(similarity_matrix, yearly_labels, text_publication_years, text_labels):
    similarities_by_text = {}
    total_valid_similarities = 0  

    for idx, text_label in enumerate(text_labels):
        publication_year = text_publication_years[idx]
        
        if publication_year is None:  # Skip texts with missing publication year
            continue

        # Get the similarities for the current text
        similarities = similarity_matrix[:, idx]
        
        # Calculate relative years and corresponding similarities, filtering for the 50 years before and after
        relative_similarities = [
            (year - publication_year, similarities[i])
            for i, year in enumerate(yearly_labels)
            if year is not None and not np.isnan(similarities[i])
            and (year >= publication_year - 50 and year <= publication_year + 50)
        ]
        
        # Update the total counter with the number of valid relative similarities
        total_valid_similarities += len(relative_similarities)
        
        similarities_by_text[text_label] = {'sim': relative_similarities}

    print(f"Total valid relative similarities: {total_valid_similarities}")  
    return similarities_by_text

In [34]:
# Alternative: only text with full range of 100 years before and after publication are considered

# def compute_similarity_relative(similarity_matrix, yearly_labels, text_publication_years, text_labels):
#     similarities_by_text = {}

#     for idx, text_label in enumerate(text_labels):
#         publication_year = text_publication_years[idx]
        
#         if publication_year is None:  # Skip texts with missing publication year
#             continue

#         # Get the similarities for the current text
#         similarities = similarity_matrix[:, idx]
        
#         # Calculate relative years and corresponding similarities, filtering for the 50 years before and after
#         relative_similarities = [
#             (year - publication_year, similarities[i])
#             for i, year in enumerate(yearly_labels)
#             if year is not None and not np.isnan(similarities[i])
#             and (year >= publication_year - 50 and year <= publication_year + 50)
#         ]
        
#         # Check if there is a full 100-year span (50 years before and 50 years after)
#         years_before = [year for year, _ in relative_similarities if year < 0]
#         years_after = [year for year, _ in relative_similarities if year > 0]
        
#         if len(years_before) >= 50 and len(years_after) >= 50:
#             similarities_by_text[text_label] = {'sim': relative_similarities}
    
#     return similarities_by_text

In [35]:
publications_years = []
text_labels = []

for text_id in set_of_texts:
    pubyear_series = corpus_meta['pub_year'][corpus_meta['text_id'] == text_id]
    if not pubyear_series.empty:
        pubyear = pubyear_series.iloc[0]
        publications_years.append(pubyear)
    else:
        publications_years.append(None)  # or handle it as needed
        print(text_id)
    label_series = corpus_meta['text_id'][corpus_meta['text_id'] == text_id]
    if not label_series.empty:
        label = label_series.iloc[0]
        text_labels.append(label)
    else:
        text_labels.append(None)  # or handle it as needed
        print(text_id)

In [36]:
similarities_by_text = compute_similarity_relative(similarity_matrix, imputed_years, publications_years, text_labels)

Total valid relative similarities: 50441


In [46]:
rows = []
for text, values in similarities_by_text.items():
    for idx, sim in values['sim']:
        rows.append((text, idx, sim))

sim_for_each_text = pd.DataFrame(rows, columns=['Text', 'Index', 'Similarity'])

In [47]:
sim_for_each_text.to_csv(path_results + '\\sim_for_each_text_GER.csv')

In [39]:
canonisation_score = pd.read_csv(path_data + '\\scores/canonisation_scores/GER/GER_canonisationScore.csv',
                                encoding='UTF-8', sep=',')

In [84]:
for key, values in similarities_by_text.items():
    matching_row = canonisation_score['canonisation_score'][canonisation_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['canonisation_score'] = score

In [85]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Canonisation Score': []
}

for text, values in similarities_by_text.items():
    canon_score = values['canonisation_score']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Canonisation Score'].append(canon_score)

df = pd.DataFrame(data)

In [86]:
df_mean = df.groupby(['Year'])['Similarity'].agg(['mean', 'std']).reset_index()

In [87]:
df_mean.to_csv(path_results + '\\rollingCentroids_mean_GER.csv')

In [88]:
# Define bins based on specified thresholds
bins = [0, 0.25, 0.5, 0.75, 1.0]
labels = ['Low (0-0.25)', 'Mid-Low (0.25-0.5)', 'Mid-High (0.5-0.75)', 'High (0.75-1.0)']

df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [89]:
df.to_csv(path_results + '\\rollingCentroids_canonisation_sim.csv')

In [90]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_GER.csv')

In [91]:
# Define bins based on specified thresholds
bins = [0, 0.25, 0.75, 1.0]
labels = ['Low (0-0.25)', 'Mid (0.25-0.75)', 'High (0.75-1.0)']

df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [92]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_three_GER.csv')

In [93]:
# Define bins based on specified thresholds
bins = [0, 0.5, 1.0]
labels = ['Low (0-0.5)', 'High (0.5-1.0)']

df['Score Range'] = pd.cut(df['Canonisation Score'], bins=bins, labels=labels, include_lowest=True)

grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()

  grouped = df.groupby(['Year', 'Score Range'])['Similarity'].agg(['mean', 'std']).reset_index()


In [94]:
grouped.to_csv(path_results + '\\rollingCentroids_canonisation_sim_groupeddf_binary_GER.csv')

In [95]:
reception_score = pd.read_csv(path_data + '\\scores/reception_scores/reception_scores_classes_GER.csv', sep=',',
                             encoding='UTF-8')

In [96]:
for key, values in similarities_by_text.items():
    matching_row = reception_score['circl_binary'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['circl_binary'] = score
    
    matching_row = reception_score['reviews_binary'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['reviews_binary'] = score

In [97]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Circulating Library': [],
    'Review': []
}

for text, values in similarities_by_text.items():
    circl_binary = values['circl_binary']
    review = values['reviews_binary']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Circulating Library'].append(circl_binary)
        data['Review'].append(review)

df = pd.DataFrame(data)

In [98]:
grouped = df.groupby(['Year', 'Circulating Library'])['Similarity'].agg(['mean', 'std']).reset_index()
grouped.to_csv(path_results + '\\rollingCentroids_reception_circllibs_sim_groupeddf_GER.csv')

In [99]:
grouped = df.groupby(['Year', 'Review'])['Similarity'].agg(['mean', 'std']).reset_index()
grouped.to_csv(path_results + '\\rollingCentroids_reception_reviews_sim_groupeddf_GER.csv')

In [100]:
for key, values in similarities_by_text.items():
    matching_row = reception_score['class'][reception_score['ID'] == key]
    if not matching_row.empty:
        score = matching_row.iloc[0]  # Get the scalar value
    else:
        score = None
    similarities_by_text[key]['reception_class'] = score

In [101]:
data = {
    'ID': [],
    'Year': [],
    'Similarity': [],
    'Reception Class': []
}

for text, values in similarities_by_text.items():
    reception_class = values['reception_class']
    for year, similarity in values['sim']:
        data['ID'].append(text)
        data['Year'].append(year)
        data['Similarity'].append(similarity)
        data['Reception Class'].append(reception_class)

df = pd.DataFrame(data)

In [102]:
df.to_csv(path_results + '\\rollingCentroids_reception_sim.csv')

In [103]:
grouped = df.groupby(['Year', 'Reception Class'])['Similarity'].mean().reset_index()

In [104]:
grouped.to_csv(path_results + '\\rollingCentroids_reception_sim_groupeddf_GER.csv')

In [105]:
def compute_chunk_to_centroid_similarities(models, df_texts, rolling_centroids, set_of_texts, id):
    all_similarities = []  # To store similarities for each rolling centroid

    for text_id, centroid in zip(set_of_texts, rolling_centroids):
        filtered_indices = df_texts.index[df_texts.text_id == text_id].tolist()
        chunk_vectors = create_dv_list(models, df_texts, filtered_indices, id)

        # Flatten the chunk vectors for distance calculation
        chunk_vectors_flat = np.vstack(chunk_vectors)
        
        # Compute cosine similarities of chunks to the centroid
        cosine_similarities = cosine_similarity(chunk_vectors_flat, centroid.reshape(1, -1))
            
        # Store the distances
        all_similarities.append(cosine_similarities)
    
    return all_similarities

In [106]:
chunk_to_centroid_similarities = compute_chunk_to_centroid_similarities(models, chunks_pd, imputed_centroids, set_of_texts, 'chunk_id')

In [107]:
mean_chunk_to_centroid_similarities = np.array([np.mean(sim) for sim in chunk_to_centroid_similarities])
std_similarities = np.array([np.std(sim) for sim in chunk_to_centroid_similarities])
lower_percentiles = [np.percentile(sim, 25) for sim in chunk_to_centroid_similarities]
upper_percentiles = [np.percentile(sim, 75) for sim in chunk_to_centroid_similarities]
num_chunks = [len(sim) for sim in chunk_to_centroid_similarities]

In [108]:
years_np = np.asarray(years, dtype=np.float32)

In [109]:
df_chunk_to_centroid_similarities= pd.DataFrame(np.column_stack((mean_chunk_to_centroid_similarities, std_similarities, 
                                                                 lower_percentiles, upper_percentiles, years_np, num_chunks)), 
                                                columns=['mean', 'std', 'Q1', 'Q4', 'year', 'num_chunks'])

In [110]:
df_chunk_to_centroid_similarities.to_csv(path_results + '\\df_chunk_to_centroid_similarities.csv')

In [130]:
cosine_similarities = [
    cosine_similarity([imputed_centroids[i]], [imputed_centroids[i + 1]])[0][0]
    for i in range(len(imputed_centroids) - 1)
]

In [131]:
df_cosine_similarities = pd.DataFrame(np.column_stack((cosine_similarities, years_np[1:], num_chunks[1:])), 
                                      columns=['cosine_similarity','year', 'num_chunks'])

In [132]:
df_cosine_similarities.to_csv(path_results + '\\df_centroid_to_centroid_similarities.csv')

In [40]:
canonisation_score = pd.merge(canonisation_score, corpus_meta, 'left', left_on='ID', right_on='text_id')

In [65]:
results = []

for text in similarity_df.index:
    # Get publication year and canonisation score for the current text
    current_row = canonisation_score.loc[canonisation_score['ID'] == text]
    pub_year = current_row['pub_year'].values[0]
    canonisation = current_row['canonisation_score'].values[0]
    
    # Get texts published before the current text
    valid_texts = canonisation_score[canonisation_score['pub_year'] < pub_year]['ID'].tolist()
    
    # Get similarities for the current text and filter by valid texts
    similarities = similarity_df.loc[text]
    similarities = similarities.loc[valid_texts]
    
    if not similarities.empty:
        # Find the nearest neighbour
        nearest_neighbour = similarities.idxmax()  # Index of max similarity
        similarity_score = similarities.max()      # Corresponding similarity value
        
        # Append results only if NearestNeighbour and SimilarityScore are valid
        results.append({
            'Text': text,
            'CanonisationScore': canonisation,
            'NearestNeighbour': nearest_neighbour,
            'SimilarityScore': similarity_score
        })

In [66]:
# Convert results to a DataFrame
nearest_neighbours_df = pd.DataFrame(results)

In [67]:
nearest_neighbours_df.to_csv(path_results + '\\nearest_neighbours_1_df_GER.csv')

In [41]:
def decay_function(time_diff, lambda_param):
    """
    Apply exponential decay to the time difference between texts.
    
    Parameters:
    - time_diff (int): The time difference between two texts.
    - lambda_param (float): The decay constant (controls how quickly the weight decays).
    
    Returns:
    - weight (float): The weight of the relationship after applying decay.
    """
    return np.exp(-lambda_param * time_diff)

In [42]:
def calculate_similarity_weight(pub_year_text1, pub_year_text2, lambda_param=0.1):
    """
    Calculate the similarity weight between two texts based on their publication years
    using a decay function.
    
    Parameters:
    - pub_year_text1 (int): Publication year of the first text.
    - pub_year_text2 (int): Publication year of the second text.
    - lambda_param (float): The decay constant.
    
    Returns:
    - similarity_weight (float): The similarity weight based on temporal decay.
    """
    # Calculate the time difference between the two texts
    time_diff = abs(pub_year_text1 - pub_year_text2)
    
    # Calculate the similarity weight using the decay function
    similarity_weight = decay_function(time_diff, lambda_param)
    
    return similarity_weight

In [43]:
results = []

lambda_param = 0.01

# Number of nearest neighbours to retrieve
num_neighbours = 3

# Loop through the similarity dataframe
for text in similarity_df.index:
    # Get publication year and canonisation score for the current text
    current_row = canonisation_score.loc[canonisation_score['ID'] == text]
    pub_year = current_row['pub_year'].values[0]
    canonisation = current_row['canonisation_score'].values[0]
    
    # Get texts published before the current text
    valid_texts = canonisation_score[canonisation_score['pub_year'] < pub_year]['ID'].tolist()
    
    # Get similarities for the current text and filter by valid texts
    similarities = similarity_df.loc[text]
    similarities = similarities.loc[valid_texts]
    
    if not similarities.empty:
        # Apply temporal decay to each similarity score
        weighted_similarities = []
        
        for neighbour, similarity in similarities.items():
            # Get publication year for the neighbour text
            neighbour_pub_year = canonisation_score.loc[canonisation_score['ID'] == neighbour, 'pub_year'].values[0]
            
            # Calculate the temporal similarity weight
            time_weight = calculate_similarity_weight(pub_year, neighbour_pub_year, lambda_param)
            
            # Apply the time weight to the similarity score
            weighted_similarity = similarity * time_weight
            weighted_similarities.append((neighbour, weighted_similarity))
        
        # Sort by weighted similarity and retrieve the top `num_neighbours` texts
        sorted_similarities = sorted(weighted_similarities, key=lambda x: x[1], reverse=True)
        
        # Take the top `num_neighbours` entries
        top_neighbours = sorted_similarities[:num_neighbours]
        
        # Append the results
        for neighbour, similarity_score in top_neighbours:
            results.append({
                'Text': text,
                'CanonisationScore': canonisation,
                'NearestNeighbour': neighbour,
                'SimilarityScore': similarity_score
            })

In [44]:
# Convert results to a DataFrame
nearest_neighbours_decay_df = pd.DataFrame(results)

In [45]:
nearest_neighbours_decay_df.to_csv(path_results + '\\nearest_neighbours_3_decay=0.01_df_GER.csv')