Importing the required libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import os
import re
from PIL import Image
%matplotlib widget

Loading the saved statistics CSV as a dataframe

In [None]:
# Read the consolidated footnote statistics CSV into the notebook-wide `data` frame.
data = pd.read_csv(
    filepath_or_buffer='data/output_data/the_land_of_the_perumauls/consolidated_footnotes.csv',
)
# Keep the preview as the last expression so the cell renders the first rows.
data.head()

Creating a function that, given a keyword (a case-insensitive regular expression), prints every matching footnote and opens the image of the page it appears on.

In [None]:
def open_footnote(
    keyword,
    image_dir='data/digital_archives/administrative_archives/the_land_of_the_perumauls/the_land_of_the_perumauls_page_images',
):
    """Print every footnote matching ``keyword`` and open its page image.

    Parameters
    ----------
    keyword : str
        Regular expression searched case-insensitively in
        ``data['footnote_text']``.
    image_dir : str, optional
        Directory holding the page images named in ``data['page_file']``.
        Defaults to the directory this notebook has always used.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows of ``data``, or ``None`` (after printing a message)
        when nothing matches.
    """
    # na=False treats rows with missing footnote text as non-matching instead
    # of raising, which a plain re.search on NaN would do.
    matches = data['footnote_text'].str.contains(keyword, case=False, regex=True, na=False)
    keyword_df = data[matches]
    if keyword_df.empty:
        print(f'No footnotes found for {keyword}')
        return None

    for footnote_text, page_file in zip(keyword_df['footnote_text'], keyword_df['page_file']):
        print(footnote_text)
        image_path = f'{image_dir}/{page_file}'
        try:
            # The context manager closes the underlying file once the viewer
            # has been handed the image.
            with Image.open(image_path) as img:
                img.show()
        except FileNotFoundError:
            # One missing scan should not stop the remaining footnotes
            # from being shown.
            print(f'Page image not found: {image_path}')
    return keyword_df
    
# filtered_footnotes = open_footnote('caste|castes')
# display(filtered_footnotes['footnote_text'].values)

In [None]:
# One list per source group. Each collects the `cluster_label` value of every
# footnote whose text mentions that group; a footnote may land in several lists.
# NOTE(review): the lists are named *_word_count but are filled from the
# 'cluster_label' column - confirm that this is the intended column.
portuguese_word_count = []
dutch_word_count = []
jew_word_count = []
syrian_word_count = []
hindu_word_count = []
muslim_word_count = []
other_sources_word_count = []

# (compiled case-insensitive pattern, list to append to) for each source group.
source_buckets = [
    (re.compile('portuguese', re.IGNORECASE), portuguese_word_count),
    (re.compile('dutch', re.IGNORECASE), dutch_word_count),
    (re.compile('jew|jewish|jews', re.IGNORECASE), jew_word_count),
    (re.compile('syrian|syrians|romo-syrian|romo-syrians', re.IGNORECASE), syrian_word_count),
    (re.compile('hindu|hindus', re.IGNORECASE), hindu_word_count),
    (re.compile('mahomedan|mahomedans|mussalmen', re.IGNORECASE), muslim_word_count),
]

for item, item_len in zip(data['footnote_text'], data['cluster_label']):
    if not isinstance(item, str):
        # Missing footnote text cannot be searched; leave it out of every list.
        continue
    matched_any = False
    for pattern, bucket in source_buckets:
        if pattern.search(item):
            bucket.append(item_len)
            matched_any = True
    # "Other sources" means none of the groups above matched. Previously this
    # branch hung off the Muslim check only and its exclusion pattern omitted
    # Hindus, so Hindu-only footnotes were also counted as "other".
    if not matched_any:
        other_sources_word_count.append(item_len)
    
def find_avg(list):
    """Return the arithmetic mean of a sequence of numbers.

    Parameters
    ----------
    list : sequence of numbers
        Values to average. The name shadows the builtin ``list`` but is kept
        so existing keyword callers keep working.

    Returns
    -------
    float
        ``sum(list) / len(list)``, or NaN when the sequence is empty
        (instead of raising ZeroDivisionError).
    """
    if len(list) == 0:
        return float('nan')
    return sum(list) / len(list)
    
# Print max / min / average / count for each source group's collected values.
summary_groups = [
    ('Portuguese', portuguese_word_count),
    ('Dutch', dutch_word_count),
    ('Jew', jew_word_count),
    ('Syrian', syrian_word_count),
    ('Hindu', hindu_word_count),
    ('Muslim', muslim_word_count),
    ('Other sources', other_sources_word_count),
]
for label, values in summary_groups:
    if not values:
        # max()/min() raise ValueError on an empty list, so report the gap instead.
        print(f'{label} | No matching footnotes')
        continue
    print(f'{label} | Max: {max(values)} Min: {min(values)} Avg: {find_avg(values)} Len: {len(values)}')

In [None]:
def plot_keyword_cluster_pie_chart(keyword, title):
    """Plot, save and show a pie chart of cluster labels for matching footnotes.

    Parameters
    ----------
    keyword : str
        Regular expression searched case-insensitively in
        ``data['footnote_text']``.
    title : str
        Short name used in the chart title and in the output file name
        ``data/output_data/the_land_of_the_perumauls/{title}_word_count_freq.png``.

    Returns
    -------
    None
        Prints a message and draws nothing when no footnote matches.
    """
    # na=False treats rows with missing footnote text as non-matching.
    matches = data['footnote_text'].str.contains(keyword, case=False, regex=True, na=False)
    cluster_counts = data.loc[matches, 'cluster_label'].value_counts()
    if cluster_counts.empty:
        print(f'No footnotes found for {keyword}')
        return

    # Hard-coded word-count range shown in the legend for each cluster id.
    cluster_ranges = {
        0: '3-21 words',
        1: '170-221 words',
        2: '48-74 words',
        3: '77-108 words',
        4: '283-302 words',
        5: '22-47 words',
        6: '117-153 words'
    }

    # Colour each wedge by its cluster id (not by its frequency rank) so the
    # same cluster has the same colour in every chart.
    colors = [plt.cm.Paired(int(cluster)) for cluster in cluster_counts.index]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
    # Leave room on the right for the legend.
    fig.subplots_adjust(left=0.1, right=0.7, top=0.9, bottom=0.1)
    ax.pie(cluster_counts, labels=cluster_counts.index, autopct='%1.1f%%', startangle=90, colors=colors)
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    ax.set_title(f'Word Counts for Footnotes Containing "{title}"')

    # One legend handle per plotted cluster, so handles and labels always line up.
    handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10)
        for color in colors
    ]
    # .get() keeps an unexpected cluster id from raising KeyError.
    labels = [
        f"Cluster {cluster}: {cluster_ranges.get(int(cluster), 'unknown range')}"
        for cluster in cluster_counts.index
    ]
    ax.legend(handles, labels, loc='upper left', bbox_to_anchor=(0.9, 0.23), fontsize='small')

    # Save the plot
    fig.savefig(f'data/output_data/the_land_of_the_perumauls/{title}_word_count_freq.png')
    plt.show()

# Draw one pie chart per source group, in a fixed order.
# Each entry is (regex searched in the footnote text, short name for title and file).
keyword_charts = (
    ('portuguese', 'portuguese'),
    ('dutch', 'dutch'),
    ('jew|jewish|jews', 'jew'),
    ('syrian|syrians|romo-syrian|romo-syrians', 'syrian'),
    ('hindu|hindus', 'hindu'),
    ('mahomedan|mahomedans|mussalmen', 'muslim'),
)
for search_pattern, chart_name in keyword_charts:
    plot_keyword_cluster_pie_chart(search_pattern, chart_name)