In [None]:
import numpy as np
import string
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import math
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# The code block below downloads commentary.txt into the working directory.
# Please just run it and do not comment it out.
from urllib import request
module_url = ["https://drive.google.com/uc?export=view&id=18y6hLv2bqAyJsIXwVCty58lF0u7yimVq"]
name = ['commentary.txt']
for url, filename in zip(module_url, name):
    # The remote bytes are ISO-8859-1. Write the decoded text explicitly as UTF-8
    # (instead of the platform default encoding) so that pd.read_csv, which
    # reads UTF-8 by default, can parse the file on every platform.
    with request.urlopen(url) as f, open(filename, 'w', encoding='utf-8') as outf:
        outf.write(f.read().decode('ISO-8859-1'))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import re
from tqdm import tqdm
# Register tqdm's pandas integration (makes `progress_apply` available on pandas objects).
tqdm.pandas()
# Download the NLTK resources used by the cells below: the perceptron POS tagger
# (for pos_tag), the Punkt tokenizer models (for word_tokenize) and WordNet
# (for WordNetLemmatizer).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def get_word(treebank_tag):
    """
    Map a Penn Treebank part-of-speech tag onto the matching WordNet POS constant.

    Parameters used:
        treebank_tag (str): The Penn Treebank part-of-speech tag.

    Returns:
        str: The WordNet constant for adjectives, verbs, nouns or adverbs, chosen by the
        first letter of the tag (J, V, N, R), or an empty string for any other tag.
    """
    # Leading letter of the Treebank tag -> attribute name on nltk.corpus.wordnet.
    # The attribute is only looked up on a match, so unrecognised tags never touch WordNet.
    prefix_to_wordnet_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_wordnet_attr:
        if treebank_tag.startswith(prefix):
            return getattr(nltk.corpus.wordnet, attr_name)
    return ''

def tokenize(text):
    """
    Lowercase a piece of text and split it into word tokens.

    Parameters used:
        text (str): The text to be tokenized.

    Returns:
        list: The tokens produced by word_tokenize for the lowercased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

def pos_tagging(tokens):
    """
    Attach a part-of-speech tag to every token in a list.

    Parameters used:
        tokens (list): The tokens (words) to tag.

    Returns:
        list: (token, tag) tuples as returned by nltk's pos_tag.
    """
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens

def lemmatize(pos_tags):
    """
    Lemmatize tagged tokens, using each token's part-of-speech tag to pick the lemma.

    Parameters used:
        pos_tags (list): (token, Penn Treebank tag) tuples.

    Returns:
        list: (lemma, tag) tuples in the same order as the input; the tag is passed
        through unchanged.
    """
    wnl = WordNetLemmatizer()

    def lemma_of(word, tag):
        # Use the WordNet POS when the Treebank tag maps to one; otherwise let the
        # lemmatizer fall back to its own default.
        wordnet_pos = get_word(tag)
        if wordnet_pos:
            return wnl.lemmatize(word, wordnet_pos)
        return wnl.lemmatize(word)

    return [(lemma_of(word, tag), tag) for word, tag in pos_tags]

# Read the tab-separated commentary file
df = pd.read_csv('commentary.txt', sep='\t')

# Run the preprocessing stages in order; each stage reads the column written by
# the previous one: raw text -> tokens -> (token, tag) -> (lemma, tag)
preprocessing_steps = (
    ('Commentary', 'Tokenized', tokenize),
    ('Tokenized', 'PoS_tagged', pos_tagging),
    ('PoS_tagged', 'PoS_lemmatized', lemmatize),
)
for source_col, target_col, step in preprocessing_steps:
    df[target_col] = df[source_col].apply(step)

In [None]:
def get_word(treebank_tag):
    """
    Translate a Penn Treebank tag into the POS constant expected by the WordNet lemmatizer.

    Parameters used:
        treebank_tag (str): A part-of-speech tag in Penn Treebank format.

    Returns:
        The WordNet POS constant for tags starting with J (adjective), V (verb),
        N (noun) or R (adverb); an empty string for every other tag.
    """
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    if treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    if treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    if treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    # No WordNet equivalent for this tag
    return ''

def lemmatize(query):
    """
    Reduce a text query to the set of its distinct lemmas.

    The query is lowercased, tokenized and POS-tagged; every token is then lemmatized
    with the WordNet POS derived from its tag.

    Parameters used:
        query (str): The text query to lemmatize.

    Returns:
        set: The unique lemmatized words of the query.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(query.lower()))
    wnl = WordNetLemmatizer()

    unique_lemmas = set()
    for word, tag in tagged_tokens:
        # Tags without a WordNet equivalent are lemmatized as nouns
        wordnet_pos = get_word(tag) or nltk.corpus.wordnet.NOUN
        unique_lemmas.add(wnl.lemmatize(word, wordnet_pos))
    return unique_lemmas

def retrieve_similar_commentaries(df, query, k):
    """
    Rank the commentaries in ``df`` against a text query and return the ``k`` best matches.

    A commentary's score is the number of distinct query lemmas found among its
    lemmatized tokens, plus one extra point for each of those shared lemmas that
    carries a noun tag (a tag starting with 'N') somewhere in the commentary.

    Note: the scores are written to ``df['similarity']``, so the DataFrame passed in
    gains (or has overwritten) that column.

    Parameters used:
        df (DataFrame): Commentaries with a 'Commentary' column and a 'PoS_lemmatized'
            column holding (lemma, tag) tuples.
        query (str): The text query to find similar commentaries for.
        k (int): Number of top-scoring commentaries to retrieve.

    Returns:
        list: (commentary text, similarity score) tuples for the ``k`` highest scores.
    """
    query_lemmas = lemmatize(query)

    def score_commentary(tagged_lemmas):
        lemmas = {lemma for lemma, _ in tagged_lemmas}
        noun_lemmas = {lemma for lemma, tag in tagged_lemmas if tag.startswith('N')}
        shared = query_lemmas & lemmas
        # One point per shared lemma, one bonus point per shared lemma tagged as a noun
        return len(shared) + len(shared & noun_lemmas)

    df['similarity'] = df['PoS_lemmatized'].apply(score_commentary)

    ranked = df.nlargest(k, 'similarity')
    best_matches = ranked[['Commentary', 'similarity']]

    # Plain tuples, without the DataFrame index
    return list(best_matches.itertuples(index=False, name=None))

In [None]:
def extract_phrases(token_tuples):
    """Group consecutive nouns and consecutive verbs into phrases.

    A phrase is a maximal run of adjacent tokens that all carry a noun tag (or all
    carry a verb tag), joined with single spaces. Any other tag ends the current run.

    Parameters:
    token_tuples (list): A list of (word, POS tag) tuples.

    Returns:
    tuple: Two lists, the noun phrases (NPs) first and the verb phrases (VPs) second,
    each in order of appearance.
    """
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    phrases = {'noun': [], 'verb': []}

    # Kind ('noun' / 'verb' / None) and words of the run currently being collected
    run_kind, run_words = None, []

    for word, tag in token_tuples:
        if tag in noun_tags:
            kind = 'noun'
        elif tag in verb_tags:
            kind = 'verb'
        else:
            kind = None

        if kind != run_kind:
            # The category changed: close the run that just ended, if there was one
            if run_kind is not None:
                phrases[run_kind].append(' '.join(run_words))
            run_kind, run_words = kind, []

        if kind is not None:
            run_words.append(word)

    # Close a run that reaches the end of the token list
    if run_kind is not None:
        phrases[run_kind].append(' '.join(run_words))

    return phrases['noun'], phrases['verb']

def count_phrases(dataframe):
    """Tally how often each noun phrase and each verb phrase occurs.

    Parameters:
    dataframe (DataFrame): A DataFrame whose 'PoS_lemmatized' column contains lists of (word, POS) tuples.

    Returns:
    tuple: Two dictionaries mapping phrase -> number of occurrences, the first for
    noun phrases (NPs) and the second for verb phrases (VPs).
    """
    noun_counts, verb_counts = {}, {}

    for _, row in dataframe.iterrows():
        noun_phrases, verb_phrases = extract_phrases(row['PoS_lemmatized'])
        # Same tallying rule for both kinds of phrase
        for counts, phrases in ((noun_counts, noun_phrases), (verb_counts, verb_phrases)):
            for phrase in phrases:
                counts[phrase] = counts.get(phrase, 0) + 1

    return noun_counts, verb_counts

def compute_pmi_dataframe(dataframe, top_n=100):
    """Compute the PMI matrix between verb phrases and noun phrases.

    Pointwise Mutual Information is computed as
    ``log2(P(np, vp) / (P(np) * P(vp)))`` where

    * ``P(np)`` and ``P(vp)`` are the relative frequencies of the phrase among all
      noun phrases / all verb phrases (from ``count_phrases``), and
    * ``P(np, vp)`` is the relative frequency of the pair among all (NP, VP) pairs
      that occur together in the same row of ``dataframe``.

    Pairs that never occur together in a row are left at 0.0.

    Parameters:
    dataframe (DataFrame): A DataFrame where 'PoS_lemmatized' column contains lists of tuples (word, POS).
    top_n (int): How many of the most frequent NPs and VPs to keep (default 100).
        ``None`` keeps every phrase.

    Returns:
    DataFrame: The PMI matrix with verb phrases (VPs) as rows (index) and noun
    phrases (NPs) as columns.
    """
    # Marginal counts over the whole DataFrame
    noun_counts, verb_counts = count_phrases(dataframe)
    total_nouns = sum(noun_counts.values())
    total_verbs = sum(verb_counts.values())

    # Keep only the most frequent phrases of each kind
    top_nouns = sorted(noun_counts, key=noun_counts.get, reverse=True)[:top_n]
    top_verbs = sorted(verb_counts, key=verb_counts.get, reverse=True)[:top_n]
    kept_nouns, kept_verbs = set(top_nouns), set(top_verbs)

    # Joint counts: every NP occurrence paired with every VP occurrence of the same row.
    # total_pairs counts all pairs (not only the kept ones) so that P(np, vp) is
    # normalised over the same population as the marginals.
    pair_counts = {}
    total_pairs = 0
    for _, row in dataframe.iterrows():
        noun_phrases, verb_phrases = extract_phrases(row['PoS_lemmatized'])
        total_pairs += len(noun_phrases) * len(verb_phrases)
        for verb_phrase in verb_phrases:
            if verb_phrase not in kept_verbs:
                continue
            for noun_phrase in noun_phrases:
                if noun_phrase in kept_nouns:
                    pair = (verb_phrase, noun_phrase)
                    pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Create PMI matrix (rows: VPs, columns: NPs), defaulting to 0.0
    pmi_matrix = pd.DataFrame(index=top_verbs, columns=top_nouns, data=0.0)

    # A pair is only present in pair_counts if it was counted at least once, which
    # also guarantees total_pairs, total_nouns and total_verbs are non-zero here.
    for (verb_phrase, noun_phrase), joint_count in pair_counts.items():
        p_joint = joint_count / total_pairs
        p_noun = noun_counts[noun_phrase] / total_nouns
        p_verb = verb_counts[verb_phrase] / total_verbs
        pmi_matrix.at[verb_phrase, noun_phrase] = math.log2(p_joint / (p_noun * p_verb))

    return pmi_matrix

# Small hand-made sample used to exercise compute_pmi_dataframe
data = {
    'PoS_lemmatized': [
        [('cat', 'NN'), ('run', 'VB'), ('fast', 'RB')],
        [('dog', 'NN'), ('jump', 'VB'), ('high', 'JJ')],
        [('bird', 'NN'), ('sing', 'VB'), ('beautifully', 'RB')],
    ]
}

# Create the sample DataFrame under its own name. Rebinding `df` here would replace
# the commentary DataFrame (and its 'Commentary' column) that
# retrieve_similar_commentaries needs.
sample_df = pd.DataFrame(data)

# Compute PMI matrix
pmi_df = compute_pmi_dataframe(sample_df)