Importing required libraries

In [1]:
import pandas as pd
from collections import Counter
import csv
import re
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

Load footnotes from previously created CSV to facilitate analysis.

In [2]:
# Read the ordered footnotes export and pull out the two columns used below.
footnotes_df = pd.read_csv('data/footnotes/the_land_of_the_perumauls_xml_export/files/The_Land_of_the_Perumauls/page/ordered_footnotes.csv')
footnotes = footnotes_df['Footnote'].to_list()

# Remove every "<digits>_" run from each file name.
digits_underscore = re.compile(r'(\d+)_')
page_files = [digits_underscore.sub('', file_name) for file_name in footnotes_df['File Name'].to_list()]

print(f'Footnotes:{len(footnotes)} \nPage files:{len(page_files)}')

Footnotes:206 
Page files:206


Creating a function to count the number of words in a string

In [3]:
def count_words(string):
    """Count the words in a string, ignoring punctuation-only tokens.

    The text is split on whitespace. A token that contains no word character
    is not counted. A token that starts with a hyphen followed by word
    characters (for example ``-tion``) is treated as the tail of the
    preceding counted word and is not counted separately; if no word has
    been counted yet, it counts as a word of its own.

    Args:
        string: The text to analyse.

    Returns:
        int: The number of words found.
    """
    word_total = 0
    for token in string.split():
        # Raw-string patterns: '\w' inside a plain string literal is an invalid
        # escape sequence and triggers a SyntaxWarning on recent Python versions.
        if re.search(r'\w', token) is None:
            continue  # the 'word' is just punctuation
        if word_total > 0 and re.match(r'-\w+', token):
            continue  # tail of a hyphenated word that moved across lines
        word_total += 1
    return word_total

Assessing footnote length by source of records (Portuguese, Dutch, or other)

In [4]:
# Word counts bucketed by the source named in each footnote. A footnote naming
# both sources lands in both lists; one naming neither goes to the "other" list.
portuguese_word_count, dutch_word_count, other_sources_word_count = [], [], []
for item in footnotes:
    item_len = count_words(item)
    mentions_portuguese = 'Portuguese' in item
    mentions_dutch = 'Dutch' in item
    if mentions_portuguese:
        portuguese_word_count.append(item_len)
    if mentions_dutch:
        dutch_word_count.append(item_len)
    if not (mentions_portuguese or mentions_dutch):
        other_sources_word_count.append(item_len)
    
def find_avg(list):
    """Return the arithmetic mean of a sequence of numbers.

    Args:
        list: The numbers to average. The name shadows the built-in ``list``
            inside this function; it is kept so existing callers still work.

    Returns:
        float: ``sum(list) / len(list)``, or NaN when the sequence is empty,
        so a category with no matches does not abort the summary with a
        ZeroDivisionError.
    """
    if len(list) == 0:
        return float('nan')
    return sum(list)/len(list)
    
# Each line shows a (average word count, number of footnotes) tuple for one source bucket.
print(f'Portuguese: {find_avg(portuguese_word_count), len(portuguese_word_count)} \nDutch: {find_avg(dutch_word_count), len(dutch_word_count)} \nOther: {find_avg(other_sources_word_count), len(other_sources_word_count)}')

Portuguese: (105.46666666666667, 15) 
Dutch: (59.4, 30) 
Other: (44.207317073170735, 164)


Assessing footnote length by geographic scope of records

In [5]:
# Word counts bucketed by the place names mentioned in each footnote. A footnote
# is added to every bucket whose place name it contains; footnotes mentioning
# none of the places go to the "other" list.
cochin_word_count, malabar_word_count, travancore_word_count = [], [], []
madras_word_count, ceylon_word_count, other_places_word_count = [], [], []
place_buckets = {
    'Cochin': cochin_word_count,
    'Malabar': malabar_word_count,
    'Travancore': travancore_word_count,
    'Madras': madras_word_count,
    'Ceylon': ceylon_word_count,
}
for item in footnotes:
    item_len = count_words(item)
    matched_any_place = False
    for place_name, bucket in place_buckets.items():
        if place_name in item:
            bucket.append(item_len)
            matched_any_place = True
    if not matched_any_place:
        other_places_word_count.append(item_len)
    
def find_avg(list):
    """Return the arithmetic mean of a sequence of numbers.

    NOTE(review): this repeats the helper already defined in the
    source-assessment cell; the later definition is the one in effect from
    here on, so the two should stay identical (or this copy be dropped).

    Args:
        list: The numbers to average. The name shadows the built-in ``list``
            inside this function; it is kept so existing callers still work.

    Returns:
        float: ``sum(list) / len(list)``, or NaN when the sequence is empty,
        so a place with no matches does not abort the summary with a
        ZeroDivisionError.
    """
    if len(list) == 0:
        return float('nan')
    return sum(list)/len(list)
    
# Each line shows a (average word count, number of footnotes) tuple for one place bucket.
print(f'Cochin: {find_avg(cochin_word_count), len(cochin_word_count)} \nMalabar: {find_avg(malabar_word_count), len(malabar_word_count)} \nTravancore: {find_avg(travancore_word_count), len(travancore_word_count)} \nMadras: {find_avg(madras_word_count), len(madras_word_count)} \nCeylon: {find_avg(ceylon_word_count), len(ceylon_word_count)} \nOther: {find_avg(other_places_word_count), len(other_places_word_count)}')

Cochin: (57.79245283018868, 53) 
Malabar: (70.57142857142857, 14) 
Travancore: (69.72727272727273, 11) 
Madras: (52.7, 10) 
Ceylon: (48.42857142857143, 7) 
Other: (44.12903225806452, 124)


Assessing general patterns in length of footnotes

In [6]:
# Counting max, min, and average word counts across all footnotes.
# min()/max() replace the hand-rolled scan: its if/elif chain skipped the minimum
# check whenever a new maximum was found, and the result depended on a
# hard-coded starting minimum instead of the data itself.
word_counts = [count_words(item) for item in footnotes]
max_value = max(word_counts)
min_value = min(word_counts)
avg_value = sum(word_counts)/len(word_counts)
print(f'Max: {max_value} \nMin: {min_value} \nAvg: {avg_value}')

Max: 302 
Min: 3 
Avg: 49.66019417475728


Clustering word counts to make analysis easier

In [10]:
# KMeans needs a 2D array, so the word counts become a single-column array.
word_counts_array = np.array(word_counts).reshape(-1, 1)

# Exploration used to pick the number of clusters (kept for reference, not run).
#
# # Elbow plot over k = 2..9
# inertia = []
# for k in range(2,10):
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     kmeans.fit(word_counts_array)
#     inertia.append(kmeans.inertia_)
#
# plt.plot(range(2,10), inertia, marker='o')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Inertia')
# plt.title('Elbow Plot to Determine Optimum Number of Clusters')
# plt.show()
#
# # Silhouette scores for k = 4..7
# sil_scores = []
# for k in range(4,8):
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     kmeans.fit(word_counts_array)
#     sil_scores.append(silhouette_score(word_counts_array, kmeans.labels_))
#
# plt.plot(range(4,8), sil_scores, marker='o')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Score')
# plt.title('Silhouette Scores for Different Clusters')
# plt.show()

# Seven clusters were found to be the best choice; fit the final model.
kmeans = KMeans(n_clusters=7, random_state=42).fit(word_counts_array)
# print(kmeans.cluster_centers_)

# cluster_N holds the word counts assigned to label N-1, flattened back to 1D.
(cluster_1, cluster_2, cluster_3, cluster_4,
 cluster_5, cluster_6, cluster_7) = (
    word_counts_array[kmeans.labels_ == label].reshape(-1) for label in range(7)
)

def cluster_stats(cluster):
    """Print the minimum, maximum, mean and size of a cluster of word counts.

    Args:
        cluster: A non-empty sequence (list or 1D array) of numbers.
    """
    # The ``builtins`` module is the supported way to reach the built-in
    # functions explicitly. ``__builtins__`` is a CPython implementation detail
    # that is a dict (not a module) inside imported modules, so
    # ``__builtins__.min`` only works when this code runs as ``__main__``.
    import builtins

    min_val = builtins.min(cluster)
    max_val = builtins.max(cluster)
    avg_val = sum(cluster)/len(cluster)
    print(f'Min: {min_val} | Max: {max_val} | Avg: {avg_val} | Length: {len(cluster)}')

# Print the stats for each cluster in label order.
# Values noted from a previous run, as min:max (avg, size):
#   cluster_1    3:21  (8.8, 72)
#   cluster_2  170:221 (189.75, 4)
#   cluster_3   48:74  (59.9, 42)
#   cluster_4   77:108 (92, 23)
#   cluster_5  283:302 (291.6, 3)
#   cluster_6   22:47  (34.88, 50)
#   cluster_7  117:153 (132, 12)
for cluster in (cluster_1, cluster_2, cluster_3, cluster_4,
                cluster_5, cluster_6, cluster_7):
    cluster_stats(cluster)

Min: 3 | Max: 21 | Avg: 8.805555555555555 | Length: 72
Min: 170 | Max: 221 | Avg: 189.75 | Length: 4
Min: 48 | Max: 74 | Avg: 59.904761904761905 | Length: 42
Min: 77 | Max: 108 | Avg: 92.0 | Length: 23
Min: 283 | Max: 302 | Avg: 291.6666666666667 | Length: 3
Min: 22 | Max: 47 | Avg: 34.88 | Length: 50
Min: 117 | Max: 153 | Avg: 132.16666666666666 | Length: 12




Creating a CSV with all the relevant information about the footnote

In [11]:
# csv field headings
headings = ['page_file', 'page_number', 'footnote_text', 'cluster_label', 'word_count']

# Page number = the digits captured from the first "_<digits>.jpg" in each file name.
# The dot is escaped so it only matches a literal ".", and a file name without a
# page number fails with a message naming the file instead of an AttributeError.
page_numbers = []
for item in page_files:
    match = re.search(r'_(\d+)\.jpg', item)
    if match is None:
        raise ValueError(f'No page number found in file name: {item!r}')
    page_numbers.append(match.group(1))

# newline='' lets the csv module control line endings (otherwise embedded newlines
# and Windows line endings are mangled); utf-8 keeps the output independent of the
# platform's default encoding.
with open('data/output_data/the_land_of_the_perumauls/consolidated_footnotes.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headings)
    writer.writerows(zip(page_files, page_numbers, footnotes, kmeans.labels_, word_counts))