In [1]:
import nltk
from nltk.corpus import brown
import math
from collections import Counter
import pandas as pd
import re

def clean_word(word):
    '''Normalise a token: lowercase it, trim surrounding whitespace and keep only ASCII letters.

    Returns the empty string when the token contains no letters at all
    (for example pure punctuation or digits).
    '''
    lowered = word.lower().strip()
    # Everything that is not a plain ASCII letter is deleted, not replaced.
    letters_only = re.sub(r'[^a-zA-Z]', '', lowered)
    return letters_only


def load_corpus_from_txt(filename):
    '''Load a text file and split it into sentences of word tokens.

    The text is split into sentences on '.', '!' and '?'; each sentence is
    then tokenised into runs of word characters (``\\w+``), so punctuation
    is discarded.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a non-empty list of token strings.
        Fragments without any word character (blank or punctuation-only)
        are skipped, so the result never contains an empty sentence.
    '''
    # Explicit encoding: the platform default differs between systems.
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()

    tokenized_sentences = []
    for sentence in re.split(r'[.!?]', text):
        tokens = re.findall(r'\b\w+\b', sentence)
        # Checking the tokens (rather than the stripped text) also rejects
        # fragments such as " -- " that would otherwise yield an empty list.
        if tokens:
            tokenized_sentences.append(tokens)

    return tokenized_sentences


def compute_pmi_ppmi(sentences, min_freq=10):
    '''Compute PMI and PPMI for adjacent word pairs (bigrams).

    Every token is normalised with ``clean_word``. Tokens that clean to the
    empty string (punctuation, digits) are dropped, so they are neither
    counted as a word nor paired with their neighbours; words separated only
    by such tokens therefore become adjacent.

    Args:
        sentences: Iterable of sentences, each an iterable of token strings.
        min_freq: Minimum unigram count a word needs for bigrams containing
            it to be scored.

    Returns:
        A pandas DataFrame with one row per scored bigram and the columns
        ``word1``, ``word2``, ``pmi``, ``ppmi`` and ``count``. The columns
        are present even when no bigram qualifies (empty frame).
    '''
    # Clean each sentence exactly once and keep only sentences that still
    # contain at least one word.
    cleaned_sentences = []
    for sentence in sentences:
        tokens = [token for token in map(clean_word, sentence) if token]
        if tokens:
            cleaned_sentences.append(tokens)

    word_counts = Counter()
    for tokens in cleaned_sentences:
        word_counts.update(tokens)
    total_words = sum(word_counts.values())

    # A sentence of n words has n - 1 adjacent positions. Because every kept
    # sentence is non-empty, this is the exact number of bigram positions.
    total_bigrams = total_words - len(cleaned_sentences)

    # Only words occurring at least min_freq times take part in bigrams.
    valid_words = {word for word, count in word_counts.items() if count >= min_freq}

    bigram_counts = Counter()
    for tokens in cleaned_sentences:
        for w1, w2 in zip(tokens, tokens[1:]):
            if w1 in valid_words and w2 in valid_words:
                bigram_counts[(w1, w2)] += 1  # number of word pair occurrences

    pmi_data = []
    for (w1, w2), bigram_count in bigram_counts.items():
        # A counted bigram implies total_bigrams >= 1, so no division by zero.
        p_w1 = word_counts[w1] / total_words
        p_w2 = word_counts[w2] / total_words
        p_bigram = bigram_count / total_bigrams

        pmi = math.log2(p_bigram / (p_w1 * p_w2))
        ppmi = max(pmi, 0.0)  # PPMI clips negative PMI values to zero

        pmi_data.append({"word1": w1, "word2": w2, "pmi": pmi, "ppmi": ppmi, "count": bigram_count})

    # Naming the columns keeps the frame sortable when pmi_data is empty.
    return pd.DataFrame(pmi_data, columns=["word1", "word2", "pmi", "ppmi", "count"])


In [2]:
def run_analysis(sentence=None, save_txt=False):
    '''Run the PMI/PPMI analysis on a corpus and print a summary.

    Args:
        sentence: Tokenised sentences to analyse (an iterable of token
            lists). When None, the sentences of the NLTK Brown corpus are
            used.
        save_txt: When True, the printed summary is also written to
            'pmi_ppmi_summary.txt' in the current working directory.

    Prints the top 20 PMI pairs, the bottom 20 PMI pairs and the top 20 PPMI
    pairs. If no word pair was scored, a short notice is printed instead and
    nothing is written.
    '''
    if sentence is None:
        sentence = brown.sents()

    df = compute_pmi_ppmi(sentence)

    # Without any scored pair there is nothing to rank; sorting a frame that
    # lacks the 'pmi' column would raise a KeyError.
    if df.empty:
        print("No word pairs met the frequency threshold; nothing to report.")
        return

    top_pmi = df.sort_values(by="pmi", ascending=False).head(20)
    top_ppmi = df.sort_values(by="ppmi", ascending=False).head(20)
    bottom_pmi = df.sort_values(by="pmi", ascending=True).head(20)

    output = [
        "Top 20 PMI pairs:",
        top_pmi.to_string(index=False),
        "Bottom 20 PMI pairs:",
        bottom_pmi.to_string(index=False),
        "Top 20 PPMI pairs:",
        top_ppmi.to_string(index=False),
    ]

    full_output = "\n\n".join(output)
    print(full_output)

    if save_txt:
        # Explicit encoding so the file is identical on every platform.
        with open("pmi_ppmi_summary.txt", "w", encoding="utf-8") as f:
            f.write(full_output)
        print(" Summary saved to 'pmi_ppmi_summary.txt' ")
 



In [None]:
# Run the analysis on the full Brown corpus (the default when no sentences are passed).
print("Analysis of the Entire Brown Corpus")
run_analysis(save_txt=False)
# We observe very high PMI values for proper-noun collocations such as ('viet', 'nam')
# and negative values for function-word pairs such as ('the', 'and'): even though each
# word occurs frequently on its own, the two are grammatically incorrect when used together.


In [None]:
# Run the same analysis on sentences loaded from a plain-text file.
print("Analysis for brown100.txt")
brown100_sentence = load_corpus_from_txt('../data/brown_100.txt')
run_analysis(sentence=brown100_sentence, save_txt=False)
# Rare word pairs may artificially inflate the PMI: for example, ('certain', 'questions') has a PMI of almost 14 with a count of just 1 in the corpus.
# This is a consequence of the small denominator value, even though the significance is low.
# We can conclude that PMI is sensitive to low-frequency data, which makes it unreliable for rare pairs.