## Sample Analysis

The following code loads the text to be analysed.

In [4]:
# Import required libraries
import pickle

# Loading the pickled list of docs
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this cell should only ever be run against a docs.pkl from a trusted source.
# NOTE(review): later cells read `.text` on each item, so the list presumably
# holds spaCy Doc objects -- confirm against whatever produced the pickle.
with open("./data/docs.pkl", "rb") as descriptions_docs:
    docs = pickle.load(descriptions_docs)

### Step-1: Finding and Graphing Co-Occurrences

The following code takes a primary word as input and returns the ten words of a given part of speech (nouns, in the examples below) that most frequently appear in the same paragraph as that word across the corpus.

In [2]:
# Import required libraries
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
import spacy

# Function to get a list of the top 10 words corresponding to a specific part of speech that co-occur most frequently with a given word
def find_in_para(docs, co_occurrences, primary_word):
    """Count words of one part of speech that share a paragraph with a primary word.

    Each document's text is split into paragraphs on blank lines (``'\\n\\n'``)
    and every paragraph is parsed with the ``en_core_web_sm`` spaCy pipeline.
    For paragraphs that contain the primary word, every token whose ``pos_``
    equals ``co_occurrences`` is tallied, excluding the primary word itself.

    Args:
        docs: Iterable of objects exposing a ``.text`` string.
        co_occurrences: Part-of-speech tag compared against ``token.pos_``
            (for example ``'NOUN'``).
        primary_word: Word to look for. Matching is case-insensitive, both when
            detecting the word and when excluding it from the tally.

    Returns:
        dict: Up to ten ``{token_text: count}`` pairs, most frequent first.
        Tallied token text keeps its original casing, so ``"Milk"`` and
        ``"milk"`` are counted separately. The dict is also shown with
        ``display`` before being returned.
    """
    # Normalise once so detection and exclusion use the same comparison.
    target = primary_word.lower()
    nlp = spacy.load('en_core_web_sm')
    final_list = []  # Co-occurring tokens gathered from every matching paragraph

    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            paragraph_doc = nlp(paragraph)  # Convert paragraph string to SpaCy Doc
            word_found = False
            co_occurrences_in_paragraph = []
            for token in paragraph_doc:
                if token.text.lower() == target:
                    # Any casing of the primary word marks the paragraph as
                    # relevant and is never counted as its own co-occurrence.
                    word_found = True
                elif token.pos_ == co_occurrences:
                    co_occurrences_in_paragraph.append(token.text)
            # Only paragraphs that actually mention the primary word contribute.
            if word_found:
                final_list.extend(co_occurrences_in_paragraph)

    final_dict = dict(Counter(final_list).most_common(10))
    display(final_dict)
    return final_dict

# dairy_co_occurrences = find_in_para(docs, 'NOUN', 'dairy')
# cheri_co_occurrences = find_in_para(docs, 'NOUN', 'chēri')
# menstrual_co_occurrences = find_in_para(docs, 'NOUN', 'menstrual')
# pollution_co_occurrences = find_in_para(docs, 'NOUN', 'pollution')

In [7]:
import networkx as nx
import matplotlib.pyplot as plt

def create_graph(primary_word, co_occurrences_list, primary_colour, secondary_colour):
    """Draw a star graph linking a primary word to its co-occurring words.

    Args:
        primary_word: Label of the central node.
        co_occurrences_list: Mapping of ``{word: frequency}``. Each word becomes
            a node joined to the primary word by an edge whose ``weight`` is
            the frequency. An empty mapping draws the primary node alone.
        primary_colour: Colour of the central node.
        secondary_colour: Colour of the co-occurrence nodes.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Create a new graph centred on the primary word
    G = nx.Graph()
    G.add_node(primary_word)

    # Add the dictionary words to the graph and connect them to the primary word
    for word, frequency in co_occurrences_list.items():
        G.add_node(word)
        G.add_edge(primary_word, word, weight=frequency)

    # Lay the nodes out with a spring layout
    pos = nx.spring_layout(G, k=0.5)

    # Largest frequency, used to scale node sizes. The default keeps an empty
    # mapping from raising ValueError.
    max_frequency = max(co_occurrences_list.values(), default=1)

    # Node area is proportional to frequency. Nodes absent from the mapping
    # (the primary word) fall back to a frequency of 1; the central node is
    # redrawn at a fixed size below.
    node_sizes = [
        5000 * (co_occurrences_list.get(node, 1) / max_frequency)
        for node in G.nodes()
    ]

    # Co-occurrence nodes are labelled "word\n(count)"; the central node keeps
    # its bare name.
    labels = {
        node: f"{node}\n({co_occurrences_list.get(node, '')})" if node != primary_word else node
        for node in G.nodes()
    }

    # Set the figure size
    plt.figure(figsize=(10, 8))

    # Draw nodes and edges, then the labels
    nx.draw(G, pos, with_labels=False, node_size=node_sizes, font_size=10, node_color=secondary_colour)
    nx.draw_networkx_labels(G, pos, labels, font_size=10)

    # Change the color and size of the central node
    nx.draw_networkx_nodes(G, pos, nodelist=[primary_word], node_color=primary_colour, node_size=3000)

    # Show the plot
    plt.title(f"Top 10 Nouns Co-Occurring with '{primary_word}'")
    plt.show()

In [None]:
# create_graph('dairy', dairy_co_occurrences, '#a6bddb', '#ece2f0')
# create_graph('chēri', cheri_co_occurrences, '#a8ddb5', '#e0f3db')
# create_graph('menstrual', menstrual_co_occurrences, '#9ebcda', '#e0ecf4')
# create_graph('pollution', pollution_co_occurrences, '#fa9fb5', '#fde0dd')

### Step-2: Extracting the Relevant Paragraphs

The following code returns the paragraphs in the corpus that contain all of the specified words.

In [None]:
# Import required libraries
import en_core_web_sm

# Function to search for co-occurrences of two or more words within the same paragraph
def find_co_occurrences(docs, words):
    """Return the paragraphs in which every one of the given words appears.

    Each document's text is split into paragraphs on blank lines (``'\\n\\n'``)
    and tokenised with the ``en_core_web_sm`` pipeline. A paragraph qualifies
    when, for every entry in ``words``, at least one token's text equals it,
    ignoring case.

    Args:
        docs: Iterable of objects exposing a ``.text`` string.
        words: Iterable of single-token words to look for. Matching is
            case-insensitive on both sides, so ``"Rice"`` and ``"rice"`` are
            equivalent. If ``words`` is empty, every paragraph qualifies.

    Returns:
        list[str]: The matching paragraph strings, in corpus order.
    """
    nlp = en_core_web_sm.load()
    # Token text is lowercased below, so the targets must be lowercased too;
    # otherwise a mixed-case target could never match.
    targets = {word.lower() for word in words}

    co_occurring_paragraphs = []
    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            tokens_seen = {token.text.lower() for token in nlp(paragraph)}
            # Subset test: every target word was seen among the tokens.
            if targets <= tokens_seen:
                co_occurring_paragraphs.append(paragraph)

    return co_occurring_paragraphs

# relevant_paragraphs = find_co_occurrences(docs,['pollution', "rice"])
# display(relevant_paragraphs)