## Sample Analysis

The following code loads the text to be analysed.

In [19]:
# Import required libraries
import pickle

# Load the pickled list of docs from ./data/docs.pkl into `docs`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on a docs.pkl file that comes from a trusted source.
with open("./data/docs.pkl", "rb") as descriptions_docs:
    docs = pickle.load(descriptions_docs)

### Step-1: Finding and Graphing Co-Occurrences

The following code takes a primary word as input and returns the ten nouns that most frequently co-occur with it within the same paragraph of the corpus.

In [None]:
# Import required libraries
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
import spacy

# Function to search for co-occurrences of a particular part of speech and a particular word within the same paragraph
def find_in_para(docs, co_occurrences, word, nlp=None):
    """Collect tokens of one part of speech from paragraphs that mention ``word``.

    Each document's text is split into paragraphs on blank lines (two
    consecutive newlines) and every paragraph is parsed on its own. If a
    paragraph contains ``word`` (compared case-insensitively), every token
    in that paragraph whose ``pos_`` equals ``co_occurrences`` is added to
    the result, except occurrences of ``word`` itself in any casing.

    Args:
        docs: Iterable of objects exposing a ``.text`` string.
        co_occurrences: Part-of-speech tag compared against ``token.pos_``
            (for example ``'NOUN'``).
        word: The primary word to look for; casing is ignored.
        nlp: Optional callable that turns a paragraph string into an
            iterable of tokens exposing ``.text`` and ``.pos_``. When
            omitted, the spaCy ``en_core_web_sm`` model is loaded. Pass an
            already-loaded pipeline to avoid reloading it on every call.

    Returns:
        A list of token texts in their original casing, one entry per
        occurrence, in document order. Empty if ``word`` is never found.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Normalise once so matching and exclusion use the same comparison.
    target = word.lower()

    final_list = []
    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            word_found = False
            co_occurrences_in_paragraph = []
            for token in nlp(paragraph):
                if token.text.lower() == target:
                    # The primary word is never reported as its own
                    # co-occurrence, whatever its casing ("Dairy" vs "dairy").
                    word_found = True
                elif token.pos_ == co_occurrences:
                    co_occurrences_in_paragraph.append(token.text)
            # Only paragraphs that actually mention the word contribute.
            if word_found:
                final_list.extend(co_occurrences_in_paragraph)
    return final_list

# Gather every noun that shares a paragraph with the word 'dairy'
nouns = find_in_para(docs, 'NOUN', 'dairy')

# Tally how many times each noun was collected
noun_frequencies = Counter(nouns)

# Keep the ten highest counts as a noun -> count mapping (most frequent first)
top_10_nouns = {noun: count for noun, count in noun_frequencies.most_common(10)}
display(top_10_nouns)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Primary word (the centre of the graph)
primary_word = 'dairy'

# Drawing parameters shared by every node-drawing call below
NODE_SIZE = 3000
LAYOUT_SEED = 42  # fixed seed so the layout is identical on every run

# Create a new graph
G = nx.Graph()

# Add the primary word to the graph
G.add_node(primary_word)

# Connect each co-occurring noun to the primary word; add_edge creates the
# noun's node, and the co-occurrence count is stored as the edge weight
for word, frequency in top_10_nouns.items():
    G.add_edge(primary_word, word, weight=frequency)

# Force-directed layout; spring_layout reads the 'weight' edge attribute by
# default, so nouns with higher counts are pulled closer to the centre
pos = nx.spring_layout(G, seed=LAYOUT_SEED)

# Label each noun with its count; the primary word is labelled with its name only
labels = {
    node: node if node == primary_word else f"{node}\n({top_10_nouns.get(node, '')})"
    for node in G.nodes()
}

# Draw nodes and edges (labels are drawn separately so they can carry counts)
nx.draw(G, pos, with_labels=False, node_size=NODE_SIZE, node_color='#FBF9C2')
nx.draw_networkx_labels(G, pos, labels, font_size=10)

# Change the color of the central node
nx.draw_networkx_nodes(G, pos, nodelist=[primary_word], node_color='#E1FBC2', node_size=NODE_SIZE)

# Show the plot; the title follows primary_word and the number of nouns found
plt.title(f"Top {len(top_10_nouns)} Nouns Co-Occurring with '{primary_word.capitalize()}'")
plt.show()

### Step-2: Extracting the Relevant Paragraphs

The following code extracts the paragraphs in the corpus that contain all of the specified words.

In [None]:
# Import required libraries
import en_core_web_sm

# Function to search for co-occurrences of two or more words within the same paragraph
def find_co_occurrences(docs, words, nlp=None):
    """Return the paragraphs in which every one of ``words`` appears.

    Each document's text is split into paragraphs on blank lines (two
    consecutive newlines). A paragraph is kept when, for every entry in
    ``words``, at least one of its tokens equals that entry ignoring case.
    Matching is per token, so an entry containing whitespace can only match
    if the tokenizer yields it as a single token.

    Args:
        docs: Iterable of objects exposing a ``.text`` string.
        words: Iterable of words that must all be present; casing is
            ignored. If empty, every paragraph is returned.
        nlp: Optional callable that turns a paragraph string into an
            iterable of tokens exposing ``.text``. When omitted, the
            ``en_core_web_sm`` model is loaded. Pass an already-loaded
            pipeline to avoid reloading it on every call.

    Returns:
        A list of the matching paragraph strings, in document order.
    """
    if nlp is None:
        nlp = en_core_web_sm.load()

    # Lowercase the search words so they are compared on the same footing
    # as the lowercased token text below.
    targets = {word.lower() for word in words}

    co_occurring_paragraphs = []
    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            tokens_seen = {token.text.lower() for token in nlp(paragraph)}
            # Keep the paragraph only if no target word is missing.
            if targets <= tokens_seen:
                co_occurring_paragraphs.append(paragraph)

    return co_occurring_paragraphs

# Keep only the paragraphs that mention every one of the listed words
relevant_paragraphs = find_co_occurrences(
    docs,
    ['menstrual', "hut"],
)
display(relevant_paragraphs)