In [12]:
import nltk
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()


In [6]:
from nltk.corpus import genesis
len(genesis.words())

315268

In [14]:
from nltk.collocations import BigramCollocationFinder
finder = BigramCollocationFinder.from_words(genesis.words("english-web.txt"))

In [15]:
## calculates best bigrams using pointwise mutual information
finder.nbest(bigram_measures.pmi, 10)

[('Allon', 'Bacuth'),
 ('Ashteroth', 'Karnaim'),
 ('Ben', 'Ammi'),
 ('En', 'Mishpat'),
 ('Jegar', 'Sahadutha'),
 ('Salt', 'Sea'),
 ('Whoever', 'sheds'),
 ('appoint', 'overseers'),
 ('aromatic', 'resin'),
 ('cutting', 'instrument')]

these are all highly correlated, but infrequent - filter out bigrams that appear less than 3 times

In [16]:
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

[('Beer', 'Lahai'),
 ('Lahai', 'Roi'),
 ('gray', 'hairs'),
 ('ewe', 'lambs'),
 ('Most', 'High'),
 ('many', 'colors'),
 ('burnt', 'offering'),
 ('Paddan', 'Aram'),
 ('east', 'wind'),
 ('living', 'creature')]

## applying to documents

In [62]:
from pathlib import Path
import yaml
from nltk.tokenize import word_tokenize

documents_dir = Path("documents") 

tokens = []
documents = {}

for file in documents_dir.glob("*.txt"):
    print(file)
    with open(file, "r") as f:
        content = f.read()
        meta, body = yaml.safe_load_all(content)
        words = word_tokenize(body)
        tokens.extend(words)
        documents[meta["title"].lower()] = [word for word in words if word.isalpha()]
        print(meta["title"])

documents/aptn_fournier_03.txt
Archeological Dig at Old Montreal Hospital on Hold by McGill University
documents/tribune_grewal_01.txt
mcgill-reports-nine-potential-grave-zones-at-new-vic-site-a-week-after-security-verbally-assaulted-mohawk-mothers
documents/mm_01.txt
Concern About Recent Actions
documents/aptn_fournier_02.txt
Mohawk Mothers Win Injunction Against McGill at David and Goliath Hearing
documents/gazette_tomesco.txt
its-empty-montreal-mohawk-womens-group-slams-popes-apology
documents/mm_03.txt
Mohawk Mothers Win Superior Court Judgement
documents/city_madocjones.txt
Mohawk Mothers Settlement at Allan Memorial Institute
documents/mm_02.txt
Historic Human Remains Detection Dogs Detect Scent of Human Remains on the Old Royal Victoria Hospital Site in Montreal
documents/gazette_dunlevy.txt
McGill's Royal Vic plans prompt call to check for unmarked graves
documents/aptn_fournier_01.txt
cadaver-dogs-sniff-out-potential-human-remains-near-old-royal-victoria-hospital-site
document

In [84]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents(list(documents.values()))

finder.apply_freq_filter(5)

print("raw frequencies")
for bigram in finder.nbest(bigram_measures.raw_freq, 10):
    print(bigram)
    
print("\n")
print("pointwise mutual information")
for bigram in finder.nbest(bigram_measures.pmi, 10):
    print(bigram)

print("\n")   
print("likelihood ratio")
for bigram in finder.nbest(bigram_measures.likelihood_ratio, 10):
    print(bigram)
    
print("\n")
print("Dice's coefficient")
for bigram in finder.nbest(bigram_measures.dice, 10):
    print(bigram)
    
print("\n")
print("phi square (pearsons r squared)")
for bigram in finder.nbest(bigram_measures.phi_sq, 10):
    print(bigram)
    

raw frequencies
('of', 'the')
('in', 'the')
('Mohawk', 'Mothers')
('on', 'the')
('and', 'the')
('the', 'site')
('the', 'Mohawk')
('to', 'the')
('by', 'the')
('that', 'the')


pointwise mutual information
('Missing', 'Children')
('Unmarked', 'Graves')
('Know', 'History')
('civil', 'suit')
('Ottawa', 'Valley')
('Rescue', 'Dog')
('Valley', 'Search')
('Doug', 'Mitchell')
('Health', 'Center')
('Turtle', 'Island')


likelihood ratio
('Mohawk', 'Mothers')
('Royal', 'Victoria')
('Victoria', 'Hospital')
('unmarked', 'graves')
('New', 'Vic')
('McGill', 'University')
('of', 'the')
('kehá', 'ka')
('Superior', 'Court')
('human', 'remains')


Dice's coefficient
('Attorney', 'General')
('Know', 'History')
('Missing', 'Children')
('Reconciliation', 'Commission')
('Rotinonshón', 'ni')
('Unmarked', 'Graves')
('Settlement', 'Agreement')
('Special', 'Interlocutor')
('New', 'Vic')
('Ottawa', 'Valley')


phi square (pearsons r squared)
('Attorney', 'General')
('Know', 'History')
('Missing', 'Children')
('Re

In [87]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_documents(list(documents.values()))

finder.apply_freq_filter(3)

print("raw frequencies")
for trigram in finder.nbest(trigram_measures.raw_freq, 10):
    print(trigram)
    
print("\n")
print("pointwise mutual information")
for trigram in finder.nbest(trigram_measures.pmi, 10):
    print(trigram)

print("\n")   
print("likelihood ratio")
for trigram in finder.nbest(trigram_measures.likelihood_ratio, 10):
    print(trigram)
    
print("\n")
print("mi like")
for trigram in finder.nbest(trigram_measures.mi_like, 10):
    print(trigram)
    
print("\n")
print("jacard index")
for trigram in finder.nbest(trigram_measures.jaccard, 10):
    print(trigram)
    

raw frequencies
('the', 'Mohawk', 'Mothers')
('Royal', 'Victoria', 'Hospital')
('the', 'New', 'Vic')
('the', 'Royal', 'Victoria')
('on', 'the', 'site')
('The', 'Mohawk', 'Mothers')
('New', 'Vic', 'project')
('Victoria', 'Hospital', 'site')
('former', 'Royal', 'Victoria')
('the', 'former', 'Royal')


pointwise mutual information
('Central', 'Intelligence', 'Agency')
('Ground', 'Penetrating', 'Radar')
('Karonhia', 'nó', 'ron')
('Human', 'Remains', 'Detection')
('Remains', 'Detection', 'Dogs')
('Burial', 'Sites', 'associated')
('Ottawa', 'Valley', 'Search')
('Rotinonshón', 'ni', 'confederacy')
('spokesperson', 'Cynthia', 'Lee')
('Societe', 'quebecoise', 'des')


likelihood ratio
('Royal', 'Victoria', 'Hospital')
('the', 'Mohawk', 'Mothers')
('The', 'Mohawk', 'Mothers')
('Kahnistensera', 'Mohawk', 'Mothers')
('kahnistensera', 'Mohawk', 'Mothers')
('Mohawk', 'Mothers', 'and')
('former', 'Royal', 'Victoria')
('Mohawk', 'Mothers', 'are')
('Mohawk', 'Mothers', 'kanien')
('Mohawk', 'Mothers', '

In [89]:
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
finder = nltk.collocations.QuadgramCollocationFinder.from_documents(list(documents.values()))

#finder.apply_freq_filter(3)

print("raw frequencies")
for fourgram in finder.nbest(fourgram_measures.raw_freq, 10):
    print(fourgram)
    
print("\n")
print("pointwise mutual information")
for fourgram in finder.nbest(fourgram_measures.pmi, 10):
    print(fourgram)

print("\n")   
print("likelihood ratio")
for fourgram in finder.nbest(fourgram_measures.likelihood_ratio, 10):
    print(fourgram)
    
print("\n")
print("mi like")
for fourgram in finder.nbest(fourgram_measures.mi_like, 10):
    print(fourgram)
    
print("\n")
print("jacard index")
for fourgram in finder.nbest(fourgram_measures.jaccard, 10):
    print(fourgram)


raw frequencies
('the', 'Royal', 'Victoria', 'Hospital')
('Royal', 'Victoria', 'Hospital', 'site')
('former', 'Royal', 'Victoria', 'Hospital')
('the', 'former', 'Royal', 'Victoria')
('the', 'New', 'Vic', 'project')
('of', 'the', 'former', 'Royal')
('the', 'site', 'of', 'the')
('the', 'Allan', 'Memorial', 'Institute')
('Kanien', 'keha', 'ka', 'Kahnistensera')
('of', 'the', 'Royal', 'Victoria')


pointwise mutual information
('Ad', 'Hoc', 'Advisory', 'Committee')
('Kontihwe', 'Ne', 'Iotiian', 'shon')
('Tionni', 'tio', 'tià', 'kon')
('documentary', 'Main', 'basse', 'sur')
('hour', 'wasted', 'changing', 'courtrooms')
('murderer', 'rapists', 'thief', 'colonizer')
('shastnsera', 'Kontihwe', 'Ne', 'Iotiian')
('wasted', 'changing', 'courtrooms', 'Though')
('IQ', 'tests', 'juvenile', 'courts')
('Ka', 'shastnsera', 'Kontihwe', 'Ne')


likelihood ratio
('former', 'Royal', 'Victoria', 'Hospital')
('Royal', 'Victoria', 'Hospital', 'site')
('of', 'the', 'Mohawk', 'Mothers')
('old', 'Royal', 'Victori