## Sample Analysis

The following code loads the text to be analysed.

In [2]:
# Import required libraries
import pickle

# Restore the previously pickled corpus from disk.
# Caution: unpickling can run arbitrary code, so only load a docs.pkl that
# was produced by a trusted run of this project.
with open("./data/docs.pkl", mode="rb") as descriptions_docs:
    docs = pickle.load(descriptions_docs)

### Step-1: Finding and Graphing Co-Occurrences

The following code takes a primary word as input and returns the nouns that occur in the same paragraph as that word anywhere in the corpus, together with the ten most frequent of them.

In [None]:
# Import required libraries
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
import spacy
import numpy as np

# Function to get a list of the top 10 words corresponding to a specific part of speech that co-occur most frequently with a given word
def find_in_para(docs, co_occurrences, primary_word, nlp=None):
    """Collect words of one part of speech that share a paragraph with a word.

    Each document's text is split into paragraphs on blank lines and every
    paragraph is parsed with spaCy. If the paragraph contains the primary
    word, all of its tokens carrying the requested part-of-speech tag (other
    than the primary word itself) are recorded. The ten most frequent
    recorded words are shown with the notebook's ``display`` and returned.

    Args:
        docs: Iterable of objects exposing a ``.text`` string (presumably the
            spaCy Docs loaded earlier in the notebook -- confirm).
        co_occurrences: Part-of-speech tag compared against ``token.pos_``,
            e.g. ``'NOUN'``.
        primary_word: Word to search for. Matching ignores case.
        nlp: Optional, already loaded spaCy pipeline. When omitted the
            ``'en_core_web_sm'`` model is loaded, as before; passing one in
            avoids reloading the model on every call.

    Returns:
        tuple: ``(final_list, final_dict)`` where ``final_list`` holds every
        recorded co-occurrence (with repeats, original casing) and
        ``final_dict`` maps the ten most frequent ones to their counts.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Normalise once so both the match and the exclusion below are
    # case-insensitive; previously "Dairy" was counted as a co-occurrence
    # of "dairy" because only the match lowercased the token.
    target = primary_word.lower()
    final_list = []  # Every co-occurrence found, including repeats

    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            paragraph_doc = nlp(paragraph)  # Convert paragraph string to SpaCy Doc
            word_found = False
            co_occurrences_in_paragraph = []
            for token in paragraph_doc:
                if token.text.lower() == target:
                    # The primary word itself is never a co-occurrence
                    word_found = True
                elif token.pos_ == co_occurrences:
                    co_occurrences_in_paragraph.append(token.text)
            # Candidates only count when the primary word was in the paragraph
            if word_found:
                final_list.extend(co_occurrences_in_paragraph)

    # Keep the ten most frequent co-occurrences (a plain top-10 cut; the
    # commented-out IQR-based outlier filter that used to sit here was dead
    # code and has been dropped).
    final_dict = dict(Counter(final_list).most_common(10))
    display(final_dict)
    return final_list, final_dict

# Run the noun co-occurrence search once per primary word of interest.
# Each call yields the full co-occurrence list and its top-ten frequency dict.
dairy_co_occurrences, dairy_top_ten = find_in_para(
    docs=docs, co_occurrences='NOUN', primary_word='dairy'
)
cheri_co_occurrences, cheri_top_ten = find_in_para(
    docs=docs, co_occurrences='NOUN', primary_word='chēri'
)
menstrual_co_occurrences, menstrual_top_ten = find_in_para(
    docs=docs, co_occurrences='NOUN', primary_word='menstrual'
)
pollution_co_occurrences, pollution_top_ten = find_in_para(
    docs=docs, co_occurrences='NOUN', primary_word='pollution'
)

The following code saves the co-occurrence lists generated above as JSON files.

In [12]:
import json

# Function to save the lists generated above as json files for future use
def write_to_file(file_path, data_list):
    """Write ``data_list`` to ``file_path`` as JSON.

    Args:
        file_path: Destination path of the JSON file. Missing parent folders
            are created so a fresh checkout does not fail with
            ``FileNotFoundError``.
        data_list: JSON-serialisable object to store.

    Raises:
        TypeError: Propagated from ``json.dump`` if ``data_list`` contains
            values that cannot be serialised.
        OSError: If the folder or file cannot be created or written.
    """
    import os  # local import: this cell only imports json

    parent_dir = os.path.dirname(file_path)
    if parent_dir:  # empty when file_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data_list, file)

# Persist each primary word's full co-occurrence list for later reuse.
write_to_file(
    file_path='./data/co_occurrences_lists/dairy_co_occurrences.json',
    data_list=dairy_co_occurrences,
)
write_to_file(
    file_path='./data/co_occurrences_lists/cheri_co_occurrences.json',
    data_list=cheri_co_occurrences,
)
write_to_file(
    file_path='./data/co_occurrences_lists/menstrual_co_occurrences.json',
    data_list=menstrual_co_occurrences,
)
write_to_file(
    file_path='./data/co_occurrences_lists/pollution_co_occurrences.json',
    data_list=pollution_co_occurrences,
)

The following code creates category lists of the above generated top ten co-occurrences in order to colour-code the co-occurrence graphs.

In [13]:
# Hand-assigned categories used to colour the nodes of the co-occurrence
# graphs; any word not listed here falls into the "Other" colour.
material_entities = [
    'buffaloes',
    'milk',
    'fire',
    'rice',
    'water',
    'cloth',
]
social_relations = [
    'palol',
]
spaces = [
    'village',
    'huts',
    'temple',
    'mand',
    'ti',
    'grāmam',
    'temples',
    'house',
    'hut',
]

In [43]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def create_graph(primary_word, co_occurrences_list):
    """Plot a star graph linking a primary word to its co-occurring words.

    The primary word sits at the hub; every key of ``co_occurrences_list``
    becomes a node joined to it by an edge weighted with its frequency.
    Node area scales with frequency relative to the most frequent word, and
    node colour is taken from the notebook-level category lists
    ``material_entities``, ``social_relations`` and ``spaces``.

    Args:
        primary_word: Word drawn as the dark hub node.
        co_occurrences_list: Mapping of co-occurring word -> frequency
            (despite the name, a dict such as the top-ten dict). An empty
            mapping draws the hub node alone instead of raising.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Note:
        ``spring_layout`` is called without a seed, so node positions differ
        from run to run.
    """
    primary_color = '#0a0000'
    primary_size = 5000  # also the size given to the most frequent word

    # Define the color mapping function
    def get_node_color(node):
        if node in material_entities:
            return '#e0ecf4'
        elif node in social_relations:
            return '#fde0dd'
        elif node in spaces:
            return '#bcbddc'
        else:
            return '#d3d3d3'

    # Create a new graph with the primary word as hub
    G = nx.Graph()
    G.add_node(primary_word)

    # Add the dictionary words to the graph and connect them to the primary word
    for word, frequency in co_occurrences_list.items():
        G.add_node(word)
        G.add_edge(primary_word, word, weight=frequency)

    # Compute node positions with a force-directed layout
    pos = nx.spring_layout(G, k=0.5)

    # Largest frequency, used to scale node sizes; default avoids a
    # ValueError from max() when there are no co-occurrences at all
    max_frequency = max(co_occurrences_list.values(), default=1)

    # Set the figure size
    plt.figure(figsize=(12, 10))

    # Colour and size every node in G.nodes() order. The hub gets its fixed
    # colour and size here, so it no longer needs a separate drawing pass
    # that the scaled pass would then paint over at a different size.
    node_colors = [
        primary_color if node == primary_word else get_node_color(node)
        for node in G.nodes()
    ]
    node_sizes = [
        primary_size if node == primary_word
        else 5000 * (co_occurrences_list.get(node, 1) / max_frequency)
        for node in G.nodes()
    ]

    # Draw nodes and edges; labels are added separately below
    nx.draw(G, pos, with_labels=False, node_size=node_sizes, node_color=node_colors)

    # Label the co-occurring words with their frequency. The hub is left out
    # here so its label is not drawn twice (black underneath, white on top).
    nx.draw_networkx_labels(
        G,
        pos,
        labels={
            node: f"{node}\n({co_occurrences_list[node]})"
            for node in G.nodes()
            if node != primary_word
        },
    )

    # Label the hub in white so it is readable on the dark node
    nx.draw_networkx_labels(G, pos, labels={primary_word: primary_word}, font_color='white')

    # Add a plot title
    plt.title(f"Top 10 Nouns Co-Occurring with '{primary_word}'")

    # Create legend handles and labels
    legend_handles = [
        mpatches.Patch(color='#e0ecf4', label='Material Entities'),
        mpatches.Patch(color='#fde0dd', label='Social Relations'),
        mpatches.Patch(color='#bcbddc', label='Spaces'),
        mpatches.Patch(color='#d3d3d3', label='Other')
    ]

    # Add the legend to the plot
    plt.legend(handles=legend_handles)

    # Show the plot
    plt.show()

In [None]:
# create_graph('dairy', dairy_top_ten)
# create_graph('chēri', cheri_top_ten)
# create_graph('menstrual', menstrual_top_ten)
# create_graph('pollution', pollution_top_ten)

### Step-2: Extracting the Relevant Paragraphs

The following code returns the paragraphs in the corpus that contain all of the specified words.

In [None]:
# Import required libraries
import en_core_web_sm

# Function to search for co-occurrences of two or more words within the same paragraph
def find_co_occurrences(docs, words):
    """Return the paragraphs that contain every one of the given words.

    Each document's text is split into paragraphs on blank lines and each
    paragraph is tokenised. A paragraph is kept when every search word equals
    the text of at least one of its tokens, ignoring case. Because matching
    is per token, a multi-word phrase in ``words`` will not match.

    Args:
        docs: Iterable of objects exposing a ``.text`` string (presumably the
            spaCy Docs loaded earlier in the notebook -- confirm).
        words: Iterable of single words that must all be present. Entries
            are lowercased before matching, so ``'Rice'`` behaves like
            ``'rice'`` (previously a capitalised entry could never match).

    Returns:
        list[str]: The matching paragraph strings, in corpus order.
    """
    nlp = en_core_web_sm.load()
    required_words = {word.lower() for word in words}
    co_occurring_paragraphs = []

    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            # Only token text is inspected, so tokenising alone (make_doc)
            # is enough; the tagger/parser stages are not needed here.
            tokens_present = {token.text.lower() for token in nlp.make_doc(paragraph)}

            # Keep the paragraph when no required word is missing
            if required_words <= tokens_present:
                co_occurring_paragraphs.append(paragraph)

    return co_occurring_paragraphs

# relevant_paragraphs = find_co_occurrences(docs,['pollution', "rice"])
# display(relevant_paragraphs)