# Import Required Libraries
Import the necessary libraries: pandas, os, yaml, re, and nltk.

In [1]:
# Importing required libraries
# pandas for data manipulation and analysis
import pandas as pd
# os for interacting with the operating system
import os
# yaml for YAML parser and emitter for Python
import yaml
# re for locating the YAML frontmatter with a regular expression
import re
# nltk for natural language processing
import nltk

# Read .txt Files from Directory
Use os library to read all .txt files from the specified directory, excluding files in the 'court_documents' subdirectory.

In [3]:
# Root folder that holds the corpus
directory_path = 'data'

# Collected paths of every .txt document found below directory_path
txt_files = []

for root, dirs, files in os.walk(directory_path):
    # Prune 'court_documents' in place so os.walk never descends into it
    dirs[:] = [name for name in dirs if name != 'court_documents']

    # Keep only the .txt files found at this level, stored as full paths
    txt_files.extend(
        os.path.join(root, name) for name in files if name.endswith('.txt')
    )

# txt_files now lists every .txt file under directory_path, 'court_documents' excluded
for path in txt_files:
    print(path)

data/aptn_fournier_03.txt
data/tribune_grewal_01.txt
data/mm_01.txt
data/aptn_fournier_02.txt
data/gazette_tomesco.txt
data/mm_03.txt
data/city_madocjones.txt
data/mm_02.txt
data/gazette_dunlevy.txt
data/aptn_fournier_01.txt
data/gazette_magder.txt
data/ctv_harold.txt
data/gazette_petition.txt
data/mm_06.txt
data/mm_05.txt
data/mm_04.txt
data/ctv_lofaro.txt
data/tribune_grewal.txt
data/tribune_cason.txt
data/mm_petition.txt
data/city_henriques.txt
data/the_site.txt
data/mcgill_manfredi.txt
data/cultural_survival.txt
data/tribune_wexler.txt
data/the_mission.txt
data/ed_cable_01.txt
data/mm_sep_12.txt
data/the_stakeholders.txt
data/cbc_03.txt
data/the_challenge.txt
data/cbc_02.txt
data/cbc_01.txt
data/global_omalley.txt
data/globe_01.txt
data/city_rubertucci.txt
data/ed_cable.txt
data/cbc_lapierre.txt
data/daily_catlin.txt
data/global_jelowicki.txt
data/globe_amador.txt
data/global_carpenter.txt
data/ed_bonspiel.txt


# Extract Metadata and Tokens into a Corpus Dictionary
Use the function defined below to extract the YAML frontmatter from each .txt file and store the metadata, together with the tokenized text, in a dictionary keyed by file path.

In [4]:
def extract_data(txt_file):
    """Parse one corpus file into its YAML metadata plus NLTK word tokens.

    The first ``---`` ... ``---`` span found in the file is treated as YAML
    frontmatter; everything outside that span is tokenized with
    ``nltk.word_tokenize``.

    Parameters
    ----------
    txt_file : str
        Path of the text file to read.

    Returns
    -------
    dict
        ``{txt_file: document_dict}`` where ``document_dict`` is the parsed
        frontmatter mapping with an added ``'tokens'`` key holding the token
        list. An empty dict is returned when no frontmatter span is found.
    """
    # Explicit UTF-8: open() otherwise uses the platform's locale encoding,
    # which can mangle the accented characters present in the corpus.
    with open(txt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the yaml frontmatter (non-greedy, may span several lines)
    frontmatter = re.search(r'---(.*?)---', content, re.DOTALL)
    if not frontmatter:
        return {}

    # safe_load returns None for an empty frontmatter; fall back to an empty
    # mapping so the 'tokens' key can still be attached.
    document_dict = yaml.safe_load(frontmatter.group(1)) or {}

    # Cut out only the matched span; str.replace would also delete any later
    # text that happens to be identical to the frontmatter.
    body = content[:frontmatter.start()] + content[frontmatter.end():]
    document_dict['tokens'] = nltk.word_tokenize(body)

    return {txt_file: document_dict}

In [5]:
# Build the corpus: one entry per file, keyed by its path
corpus = {}
for path in txt_files:
    corpus.update(extract_data(path))

# Dump each document's path followed by its metadata fields
for document, fields in corpus.items():
    print(document)
    for key in fields:
        print(key, ':', fields[key])
    print('\n')

data/aptn_fournier_03.txt
title : Archeological Dig at Old Montreal Hospital on Hold by McGill University
author : Emelia Fournier
publisher : aptn news
URL : https://www.aptnnews.ca/national-news/archeological-dig-old-montreal-hospital-on-hold-mcgill-university/
summary : This article reports on the temporary halt of an archeological dig at an old Montreal hospital by McGill University.
tags : ['news', 'indigenous']
tokens : ['A', 'spokesperson', 'for', 'the', 'Mohawk', 'Mothers', ',', 'or', 'Kahnistensera', ',', 'says', 'the', 'group', 'feels', 'pushed', 'aside', 'in', 'the', 'search', 'for', 'unmarked', 'graves', 'on', 'a', 'site', 'owned', 'by', 'Société', 'Québécoise', 'des', 'Infrastructures', ',', 'or', 'SQI', '.', 'McGill', 'says', 'it', 'leases', 'part', 'of', 'the', 'property', '.', '“', 'The', 'process', 'can', 'no', 'longer', 'by', 'any', 'means', 'be', 'considered', 'Indigenous-led', ',', 'as', 'the', 'SQI', 'and', 'McGill', 'attempt', 'to', 'control', 'the', 'whole', 'pro

In [6]:
# Pool the lower-cased, purely alphabetic tokens of every document, in corpus order
all_tokens = []
for fields in corpus.values():
    all_tokens.extend(tok.lower() for tok in fields['tokens'] if tok.isalpha())

print(all_tokens[:100])
print("total tokens :", len(all_tokens))

['a', 'spokesperson', 'for', 'the', 'mohawk', 'mothers', 'or', 'kahnistensera', 'says', 'the', 'group', 'feels', 'pushed', 'aside', 'in', 'the', 'search', 'for', 'unmarked', 'graves', 'on', 'a', 'site', 'owned', 'by', 'société', 'québécoise', 'des', 'infrastructures', 'or', 'sqi', 'mcgill', 'says', 'it', 'leases', 'part', 'of', 'the', 'property', 'the', 'process', 'can', 'no', 'longer', 'by', 'any', 'means', 'be', 'considered', 'as', 'the', 'sqi', 'and', 'mcgill', 'attempt', 'to', 'control', 'the', 'whole', 'process', 'reducing', 'the', 'role', 'of', 'indigenous', 'people', 'to', 'performing', 'ceremonies', 'on', 'the', 'site', 'said', 'kahentinetha', 'one', 'of', 'the', 'mothers', 'who', 'added', 'that', 'they', 'feel', 'blindsided', 'by', 'the', 'communications', 'that', 'happened', 'without', 'consulting', 'them', 'quebec', 's', 'infrastructure', 'society', 'or', 'sqi', 'and', 'mcgill']
total tokens : 29564


## n-gram Analysis

In [8]:
from collections import Counter

In [9]:
def count_ngrams(tokens, n):
    """
    Count every contiguous n-gram in a list of tokens.

    Parameters
    ----------
    tokens : list
        Tokens to slide the window over.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    collections.Counter
        Maps each n-gram (a tuple of ``n`` tokens) to the number of times it
        occurs. Empty when ``tokens`` holds fewer than ``n`` items.
    """
    # Counter tallies the windows directly, so no manual
    # "seen before?" bookkeeping is needed.
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

In [10]:
# One Counter per n-gram size (1 through 5) over the whole corpus.
unigrams = count_ngrams(all_tokens, 1)
# Must be bound to `bigrams`: binding it to `unigrams` would overwrite the
# unigram counts and leave `bigrams` undefined for the cells below.
bigrams = count_ngrams(all_tokens, 2)
trigrams = count_ngrams(all_tokens, 3)
fourgrams = count_ngrams(all_tokens, 4)
fivegrams = count_ngrams(all_tokens, 5)

### most common n-grams

In [11]:
# Show the ten most frequent entries of the `unigrams` counter with their counts
unigrams.most_common(10)

[(('the',), 2467),
 (('of',), 972),
 (('to',), 914),
 (('and',), 819),
 (('a',), 588),
 (('that',), 523),
 (('in',), 520),
 (('on',), 329),
 (('for',), 267),
 (('is',), 265)]

In [12]:
# Show the ten most frequent entries of the `bigrams` counter with their counts
bigrams.most_common(10)

[(('of', 'the'), 314),
 (('in', 'the'), 167),
 (('mohawk', 'mothers'), 132),
 (('the', 'mohawk'), 123),
 (('on', 'the'), 123),
 (('and', 'the'), 111),
 (('the', 'site'), 106),
 (('to', 'the'), 92),
 (('by', 'the'), 82),
 (('that', 'the'), 82)]

In [13]:
# Show the ten most frequent entries of the `trigrams` counter with their counts
trigrams.most_common(10)

[(('the', 'mohawk', 'mothers'), 107),
 (('royal', 'victoria', 'hospital'), 67),
 (('the', 'new', 'vic'), 48),
 (('on', 'the', 'site'), 37),
 (('the', 'royal', 'victoria'), 36),
 (('new', 'vic', 'project'), 35),
 (('kanien', 'kehá', 'ka'), 30),
 (('victoria', 'hospital', 'site'), 25),
 (('the', 'settlement', 'agreement'), 23),
 (('former', 'royal', 'victoria'), 23)]

In [14]:
# Show the ten most frequent entries of the `fourgrams` counter with their counts
fourgrams.most_common(10)

[(('the', 'royal', 'victoria', 'hospital'), 32),
 (('the', 'new', 'vic', 'project'), 29),
 (('royal', 'victoria', 'hospital', 'site'), 25),
 (('the', 'former', 'royal', 'victoria'), 22),
 (('former', 'royal', 'victoria', 'hospital'), 22),
 (('société', 'québécoise', 'des', 'infrastructures'), 19),
 (('kanien', 'kehá', 'ka', 'kahnistensera'), 17),
 (('the', 'société', 'québécoise', 'des'), 16),
 (('the', 'allan', 'memorial', 'institute'), 15),
 (('of', 'the', 'former', 'royal'), 15)]

In [15]:
# Show the ten most frequent entries of the `fivegrams` counter with their counts
fivegrams.most_common(10)

[(('the', 'former', 'royal', 'victoria', 'hospital'), 21),
 (('the', 'société', 'québécoise', 'des', 'infrastructures'), 16),
 (('of', 'the', 'former', 'royal', 'victoria'), 15),
 (('société', 'québécoise', 'des', 'infrastructures', 'sqi'), 14),
 (('the', 'kanien', 'kehá', 'ka', 'kahnistensera'), 13),
 (('of', 'the', 'royal', 'victoria', 'hospital'), 13),
 (('the', 'kanien', 'keha', 'ka', 'kahnistensera'), 12),
 (('the', 'old', 'royal', 'victoria', 'hospital'), 12),
 (('the', 'royal', 'victoria', 'hospital', 'site'), 12),
 (('former', 'royal', 'victoria', 'hospital', 'site'), 10)]

# finding words that appear relatively more often in indigenous documents than non-indigenous ones

In [35]:
from nltk.tokenize import wordpunct_tokenize
from numpy import average
from collections import Counter

def list_tokens(input_list, n):
    """Return every contiguous run of ``n`` items from ``input_list`` as a list of tuples."""
    window_count = len(input_list) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(tuple(input_list[start:start + n]))
    return ngrams

In [92]:
def extract_tokens(txt_file):
    """Parse one corpus file into its YAML metadata plus word/punctuation tokens.

    The first ``---`` ... ``---`` span found in the file is treated as YAML
    frontmatter; everything outside that span is tokenized with
    ``nltk.wordpunct_tokenize``.

    Parameters
    ----------
    txt_file : str
        Path of the text file to read.

    Returns
    -------
    dict
        ``{txt_file: document_dict}`` where ``document_dict`` is the parsed
        frontmatter mapping with an added ``'tokens'`` key holding the token
        list. An empty dict is returned when no frontmatter span is found.
    """
    # Explicit UTF-8: open() otherwise uses the platform's locale encoding,
    # which can mangle the accented characters present in the corpus.
    with open(txt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the yaml frontmatter (non-greedy, may span several lines)
    frontmatter = re.search(r'---(.*?)---', content, re.DOTALL)
    if not frontmatter:
        return {}

    # safe_load returns None for an empty frontmatter; fall back to an empty
    # mapping so the 'tokens' key can still be attached.
    document_dict = yaml.safe_load(frontmatter.group(1)) or {}

    # Cut out only the matched span; str.replace would also delete any later
    # text that happens to be identical to the frontmatter.
    body = content[:frontmatter.start()] + content[frontmatter.end():]
    document_dict['tokens'] = nltk.wordpunct_tokenize(body)

    return {txt_file: document_dict}

In [93]:
# Rebuild the corpus from scratch using the wordpunct-based extractor
corpus = {}
for path in txt_files:
    corpus.update(extract_tokens(path))

In [94]:
# Union of every tag used by any document in the corpus
distinct_tags = set()
for fields in corpus.values():
    distinct_tags.update(fields['tags'])

# Print one tag per line
for label in distinct_tags:
    print(label)

news
corporate
mcgill
montreal
indigenous
student


In [95]:
## documents whose tags include 'indigenous', keyed by file path
indigenous_corpus = {
    document: fields
    for document, fields in corpus.items()
    if 'indigenous' in fields['tags']
}

# Show the path, author and tags of each selected document
for document, fields in indigenous_corpus.items():
    author = fields['author']
    tags = fields['tags']
    print(document,"\n", author, "\n", tags ,"\n")

# Pool the lower-cased alphabetic tokens of the selected documents
indigenous_tokens = []
for fields in indigenous_corpus.values():
    indigenous_tokens.extend(
        tok.lower() for tok in fields['tokens'] if tok.isalpha()
    )

data/aptn_fournier_03.txt 
 Emelia Fournier 
 ['news', 'indigenous'] 

data/mm_01.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/aptn_fournier_02.txt 
 emelia fournier 
 ['news', 'indigenous'] 

data/mm_03.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/mm_02.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/aptn_fournier_01.txt 
 Emelia Fournier 
 ['news', 'indigenous'] 

data/mm_06.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/mm_05.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/mm_04.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/mm_petition.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/the_site.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/cultural_survival.txt 
 Take Back Tekanontak Committee 
 ['news', 'indigenous'] 

data/the_mission.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/ed_cable_01.txt 
 Eve Cable 
 ['indigenous', 'news'] 

data/mm_sep_12.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/the_stakeholders.txt 
 Mohawk Mothers 
 ['indigenous'] 

data/the_challenge.txt 
 Mohawk Mothers 
 ['indig

In [96]:
# Documents whose tags do NOT include 'indigenous', keyed by file path
non_indigenous_corpus = {
    document: fields
    for document, fields in corpus.items()
    if 'indigenous' not in fields['tags']
}

# Show the path, author and tags of each selected document
for document, fields in non_indigenous_corpus.items():
    author = fields['author']
    tags = fields['tags']
    print(document,"\n", author, "\n", tags ,"\n")

# Pool the lower-cased alphabetic tokens of the selected documents
non_indigenous_tokens = []
for fields in non_indigenous_corpus.values():
    non_indigenous_tokens.extend(
        tok.lower() for tok in fields['tokens'] if tok.isalpha()
    )

data/tribune_grewal_01.txt 
 jasjot grewal 
 ['news', 'student'] 

data/gazette_tomesco.txt 
 frédéric tomesco 
 ['news'] 

data/city_madocjones.txt 
 gareth madoc-jones 
 ['news', 'corporate'] 

data/gazette_dunlevy.txt 
 T'Cha Dunlevy 
 ['news'] 

data/gazette_magder.txt 
 Jason Magder 
 ['news', 'montreal'] 

data/ctv_harold.txt 
 max harrold 
 ['news', 'corporate'] 

data/gazette_petition.txt 
 Montreal Gazette 
 ['news'] 

data/ctv_lofaro.txt 
 joe lofaro 
 ['news', 'corporate'] 

data/tribune_grewal.txt 
 jasjot_grewal 
 ['news', 'student'] 

data/tribune_cason.txt 
 Lily_Cason 
 ['news', 'student'] 

data/city_henriques.txt 
 brittany henriques 
 ['news', 'corporate'] 

data/mcgill_manfredi.txt 
 Christopher Manfredi 
 ['mcgill'] 

data/tribune_wexler.txt 
 benjamin wexler 
 ['news', 'student'] 

data/cbc_03.txt 
 cbc news 
 ['news', 'corporate'] 

data/cbc_02.txt 
 cbc 
 ['news', 'corporate'] 

data/cbc_01.txt 
 cbc 
 ['news', 'corporate'] 

data/global_omalley.txt 
 Olivia O'M

In [97]:
# Peek at the first ten tokens of each sub-corpus
for sample in (indigenous_tokens, non_indigenous_tokens):
    print(sample[:10])

['a', 'spokesperson', 'for', 'the', 'mohawk', 'mothers', 'or', 'kahnistensera', 'says', 'the']
['on', 'july', 'the', 'kanien', 'kehá', 'ka', 'kahnistensera', 'mohawk', 'mothers', 'were']


In [98]:
def ngram_frequencies(tokens, n):
    """
    Map each contiguous n-gram in ``tokens`` to its count divided by the token count.

    Parameters
    ----------
    tokens : list
        Tokens to slide the window over.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    dict
        ``{ngram_tuple: count / len(tokens)}``. The divisor is the number of
        tokens, not the number of n-grams, so for ``n > 1`` the values sum to
        slightly less than 1. Empty when ``tokens`` holds fewer than ``n`` items.
    """
    num_tokens = len(tokens)

    # Tally every window of n consecutive tokens
    ngram_counts = Counter(
        tuple(tokens[i:i + n]) for i in range(num_tokens - n + 1)
    )

    # Scale raw counts by the token total; the result name deliberately
    # differs from the function name to avoid shadowing it.
    return {ngram: count / num_tokens for ngram, count in ngram_counts.items()}

### ratio of a unigram's count in indigenous-tagged documents to its count in non-indigenous documents

In [99]:
# Tally unigrams separately for each sub-corpus
indigenous_unigrams = Counter(list_tokens(indigenous_tokens, 1))
non_indigenous_unigrams = Counter(list_tokens(non_indigenous_tokens, 1))

# Report the mean count per distinct unigram and the vocabulary size of each side
print("indigenous unigrams (average freq.):", average(list(indigenous_unigrams.values())), "n_unigrams:", len(indigenous_unigrams))
print("non indigenous unigrams (average freq.):", average(list(non_indigenous_unigrams.values())), "n_unigrams:", len(non_indigenous_unigrams))

# Raw count ratio (indigenous / non-indigenous) for unigrams seen on both sides.
# Unigrams absent from the non-indigenous side are left out, and the counts are
# not normalised by sub-corpus size.
indigenous_unigram_odds = {
    unigram: count / non_indigenous_unigrams[unigram]
    for unigram, count in indigenous_unigrams.items()
    if unigram in non_indigenous_unigrams
}

# Re-key from largest to smallest ratio, then keep the first twenty entries
ranked_unigrams = sorted(indigenous_unigram_odds.items(), key=lambda pair: pair[1], reverse=True)
indigenous_unigram_odds = dict(ranked_unigrams)
top_20_indigenous_unigram_odds = dict(ranked_unigrams[:20])
top_20_indigenous_unigram_odds

indigenous unigrams (average freq.): 5.635911994322214 n_unigrams: 2818
non indigenous unigrams (average freq.): 6.4644635581711185 n_unigrams: 2209


{('ke',): 21.0,
 ('life',): 18.0,
 ('institutions',): 16.0,
 ('ancestors',): 11.0,
 ('location',): 11.0,
 ('settler',): 11.0,
 ('historical',): 10.0,
 ('québec',): 10.0,
 ('zone',): 9.0,
 ('long',): 9.0,
 ('june',): 8.0,
 ('territory',): 8.0,
 ('considered',): 7.0,
 ('role',): 7.0,
 ('refused',): 7.0,
 ('term',): 7.0,
 ('caretakers',): 7.0,
 ('tfr',): 7.0,
 ('archival',): 7.0,
 ('september',): 7.0}

### ratio of a bigram's count in indigenous-tagged documents to its count in non-indigenous documents

In [100]:
# Tally bigrams separately for each sub-corpus
indigenous_bigrams = Counter(list_tokens(indigenous_tokens, 2))
non_indigenous_bigrams = Counter(list_tokens(non_indigenous_tokens, 2))

# Report the mean count per distinct bigram and the number of distinct bigrams on each side
print("indigenous bigrams (average freq.):", average(list(indigenous_bigrams.values())), "n_bigrams:", len(indigenous_bigrams))
print("non indigenous bigrams (average freq.):", average(list(non_indigenous_bigrams.values())), "n_bigrams:", len(non_indigenous_bigrams))

# Raw count ratio (indigenous / non-indigenous) for bigrams seen on both sides.
# Bigrams absent from the non-indigenous side are left out, and the counts are
# not normalised by sub-corpus size.
indigenous_bigram_odds = {
    bigram: count / non_indigenous_bigrams[bigram]
    for bigram, count in indigenous_bigrams.items()
    if bigram in non_indigenous_bigrams
}

# Re-key from largest to smallest ratio, then keep the first twenty entries
ranked_bigrams = sorted(indigenous_bigram_odds.items(), key=lambda pair: pair[1], reverse=True)
indigenous_bigram_odds = dict(ranked_bigrams)
top_20_indigenous_bigram_odds = dict(ranked_bigrams[:20])
top_20_indigenous_bigram_odds

indigenous bigrams (average freq.): 1.5566555577337777 n_bigrams: 10202
non indigenous bigrams (average freq.): 1.7012987012987013 n_bigrams: 8393


{('the', 'kahnistensera'): 14.0,
 ('the', 'public'): 11.0,
 ('following', 'the'): 9.0,
 ('indigenous', 'and'): 8.0,
 ('the', 'other'): 7.0,
 ('caretakers', 'of'): 7.0,
 ('government', 'of'): 7.0,
 ('ways', 'of'): 7.0,
 ('commitment', 'to'): 6.0,
 ('investigation', 'is'): 6.0,
 ('using', 'the'): 6.0,
 ('dog', 'teams'): 6.0,
 ('victims', 'of'): 5.5,
 ('the', 'settlement'): 5.25,
 ('indigenous', 'led'): 5.0,
 ('be', 'a'): 5.0,
 ('for', 'this'): 5.0,
 ('was', 'not'): 5.0,
 ('how', 'to'): 5.0,
 ('ensure', 'that'): 5.0}

### ratio of a trigram's count in indigenous-tagged documents to its count in non-indigenous documents

In [101]:
# Tally trigrams separately for each sub-corpus
indigenous_trigrams = Counter(list_tokens(indigenous_tokens, 3))
non_indigenous_trigrams = Counter(list_tokens(non_indigenous_tokens, 3))

# Report the mean count per distinct trigram and the number of distinct trigrams on each side
print("indigenous trigrams (average freq.):", average(list(indigenous_trigrams.values())), "n_trigrams:", len(indigenous_trigrams))
print("non indigenous trigrams (average freq.):", average(list(non_indigenous_trigrams.values())), "n_trigrams:", len(non_indigenous_trigrams))

# Raw count ratio (indigenous / non-indigenous) for trigrams seen on both sides.
# Trigrams absent from the non-indigenous side are left out, and the counts are
# not normalised by sub-corpus size.
indigenous_trigram_odds = {
    trigram: count / non_indigenous_trigrams[trigram]
    for trigram, count in indigenous_trigrams.items()
    if trigram in non_indigenous_trigrams
}

# Re-key from largest to smallest ratio, then keep the first twenty entries
ranked_trigrams = sorted(indigenous_trigram_odds.items(), key=lambda pair: pair[1], reverse=True)
indigenous_trigram_odds = dict(ranked_trigrams)
top_20_indigenous_trigram_odds = dict(ranked_trigrams[:20])
top_20_indigenous_trigram_odds

indigenous trigrams (average freq.): 1.1581941506819342 n_trigrams: 13711
non indigenous trigrams (average freq.): 1.2109235857857688 n_trigrams: 11791


{('of', 'the', 'new'): 9.0,
 ('by', 'the', 'mohawk'): 8.0,
 ('the', 'government', 'of'): 7.0,
 ('the', 'settlement', 'agreement'): 6.666666666666667,
 ('government', 'of', 'quebec'): 6.0,
 ('in', 'the', 'case'): 5.0,
 ('that', 'it', 'is'): 5.0,
 ('caretakers', 'of', 'the'): 5.0,
 ('of', 'indigenous', 'people'): 4.5,
 ('recommendations', 'of', 'the'): 4.0,
 ('settlement', 'agreement', 'was'): 4.0,
 ('to', 'ensure', 'that'): 4.0,
 ('been', 'engaged', 'in'): 4.0,
 ('engaged', 'in', 'a'): 4.0,
 ('hospital', 'site', 'until'): 4.0,
 ('site', 'until', 'a'): 4.0,
 ('victoria', 'hospital', 'and'): 4.0,
 ('the', 'heart', 'of'): 4.0,
 ('the', 'attorney', 'general'): 3.5,
 ('attorney', 'general', 'of'): 3.5}