## Sample Analysis

The following code loads the text to be analysed.

In [None]:
# Import required libraries
import pickle

# Loading the pickled list of docs.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
# NOTE(review): later cells read `doc.text` on every item, so these are
# presumably spaCy Doc objects -- confirm against the code that wrote the file.
with open("./data/docs.pkl", "rb") as descriptions_docs:
    docs = pickle.load(descriptions_docs)

### Step-1: Finding Co-Occurrences

The following code takes a primary word as input and returns the ten nouns that most frequently occur in the same paragraphs as that word across the corpus.

In [None]:
# Import required libraries
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
import spacy

# Function to search for co-occurrences of a particular part of speech and a particular word within the same paragraph
def find_in_para(docs, co_occurrences, word):
    """Collect tokens of one part of speech from paragraphs that mention a word.

    Each document's text is split into paragraphs on blank lines
    (``'\\n\\n'``) and every paragraph is processed with the
    ``en_core_web_sm`` spaCy pipeline. For each paragraph containing a token
    equal to ``word`` (compared case-insensitively), the text of every token
    whose coarse part-of-speech tag equals ``co_occurrences`` is added to the
    result.

    Args:
        docs: Iterable of objects exposing a ``text`` string attribute.
        co_occurrences: Part-of-speech label compared against ``token.pos_``,
            e.g. ``'NOUN'``.
        word: The word a paragraph must contain. Matching is case-insensitive
            and against whole tokens, not substrings.

    Returns:
        A flat list of token texts, in document order, with duplicates kept
        so that the caller can count frequencies. If ``word`` itself carries
        the requested part of speech, it is included as well.
    """
    nlp = spacy.load('en_core_web_sm')
    # Tokens are compared in lower case, so normalise the target the same way;
    # otherwise a `word` containing upper-case letters could never match.
    target = word.lower()
    final_list = []

    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            paragraph_doc = nlp(paragraph)  # Convert paragraph string to SpaCy Doc
            # Skip paragraphs that never mention the target word.
            if not any(token.text.lower() == target for token in paragraph_doc):
                continue
            final_list.extend(
                token.text for token in paragraph_doc
                if token.pos_ == co_occurrences
            )
    return final_list

# Collect every noun found in a paragraph that also contains the word
# 'paraiya' (one list entry per occurrence, so duplicates are kept)
nouns = find_in_para(docs, 'NOUN', 'paraiya')

# Count how many times each noun appears in that list
noun_frequencies = Counter(nouns)

# Keep the 10 most frequent nouns as a {noun: count} dict, most frequent first
top_10_nouns = dict(noun_frequencies.most_common(10))
display(top_10_nouns)

### Step-2: Extracting the Relevant Paragraphs

The following code extracts the paragraphs in the corpus that contain all of the specified words.

In [None]:
# Import required libraries
import en_core_web_sm

# Function to search for co-occurrences of two or more words within the same paragraph
def find_co_occurrences(docs, words):
    """Return the paragraphs that contain every one of the given words.

    Each document's text is split into paragraphs on blank lines
    (``'\\n\\n'``) and tokenized with the ``en_core_web_sm`` tokenizer. A
    paragraph qualifies when, for each entry of ``words``, at least one of
    its tokens equals that entry (compared case-insensitively).

    Args:
        docs: Iterable of objects exposing a ``text`` string attribute.
        words: Iterable of words that must all occur in a paragraph. Matching
            is case-insensitive and against whole tokens, not substrings. An
            empty iterable matches every paragraph.

    Returns:
        A list of the matching paragraph strings, in document order.
    """
    nlp = en_core_web_sm.load()
    # Tokens are compared in lower case, so normalise the targets the same
    # way; otherwise a target containing upper-case letters could never match.
    # A set also gives constant-time membership and a direct subset test.
    wanted = {word.lower() for word in words}
    co_occurring_paragraphs = []

    for doc in docs:
        for paragraph in doc.text.split('\n\n'):
            # Only the token text is needed, so run the tokenizer alone
            # instead of the full tagging/parsing pipeline.
            seen = {token.text.lower() for token in nlp.make_doc(paragraph)}
            if wanted <= seen:
                co_occurring_paragraphs.append(paragraph)

    return co_occurring_paragraphs

# Extract the paragraphs that contain both 'menstrual' and 'hut', then show them
relevant_paragraphs = find_co_occurrences(docs,['menstrual', "hut"])
display(relevant_paragraphs)