### attempt 2

In [27]:
import spacy
import re
import yaml
from pathlib import Path

nlp = spacy.load("en_core_web_sm")

doc_dir = Path("documents")
corpus = []

for doc in doc_dir.iterdir():
    if doc.suffix == ".txt":
        print(f"Processing {doc}")
        with doc.open() as f:
            text = f.read()
        
        # Extract metadata from YAML frontmatter
        yaml_header = re.search(r'---\n(.*?)\n---', text, re.DOTALL)
        metadata = yaml.safe_load(yaml_header.group(1))
        
        # Remove YAML frontmatter from the document text
        text = re.sub(r'---\n.*?\n---', '', text, flags=re.DOTALL)
        
        doc = nlp(text)
        corpus.append({'metadata': metadata, 'text': doc})

print(corpus)

token_corpus = [document['text'] for document in corpus]

doc = token_corpus[0]
for token in doc:
    print(f"Token: {token.text}\nLemma: {token.lemma_}\nPOS: {token.pos_}\nTag: {token.tag_}\nDependency: {token.dep_}\nShape: {token.shape_}\nIs Alpha: {token.is_alpha}\nIs Stop: {token.is_stop}\n")


Processing documents/aptn_fournier_03.txt
Processing documents/tribune_grewal_01.txt
Processing documents/mm_01.txt
Processing documents/aptn_fournier_02.txt
Processing documents/gazette_tomesco.txt
Processing documents/mm_03.txt
Processing documents/city_madocjones.txt
Processing documents/mm_02.txt
Processing documents/gazette_dunlevy.txt
Processing documents/aptn_fournier_01.txt
Processing documents/gazette_magder.txt
Processing documents/ctv_harold.txt
Processing documents/gazette_petition.txt
Processing documents/mm_06.txt
Processing documents/mm_05.txt
Processing documents/mm_04.txt
Processing documents/ctv_lofaro.txt
Processing documents/tribune_grewal.txt
Processing documents/tribune_cason.txt
Processing documents/mm_petition.txt
Processing documents/city_henriques.txt
Processing documents/the_site.txt
Processing documents/mcgill_manfredi.txt
Processing documents/cultural_survival.txt
Processing documents/tribune_wexler.txt
Processing documents/the_mission.txt
Processing docume

In [28]:
for document in corpus:
    print(f"title : {document['metadata']['title']}")
    print(f"author : {document['metadata']['author']}")
    print(f"tags : {document['metadata']['tags']}")
    print(f"publisher : {document['metadata']['publisher']}")
    print(f"URL : {document['metadata']['URL']}")
    print(f"text : {document['text']}")

title : Archeological Dig at Old Montreal Hospital on Hold by McGill University
author : Emelia Fournier
tags : ['news', 'indigenous']
publisher : aptn news
URL : https://www.aptnnews.ca/national-news/archeological-dig-old-montreal-hospital-on-hold-mcgill-university/
text : 
A spokesperson for the Mohawk Mothers, or Kahnistensera, says the group feels pushed aside in the search for unmarked graves on a site owned by Société Québécoise des Infrastructures, or SQI. McGill says it leases part of the property.
“The process can no longer by any means be considered Indigenous-led, as the SQI and McGill attempt to control the whole process, reducing the role of Indigenous people to performing ceremonies on the site,” said Kahentinetha, one of the Mothers who added that they feel blindsided by the communications that happened without consulting them.
Quebec’s infrastructure society, or SQI, and McGill both put out statements on Aug. 3 saying nine potential gravesites were identified through gr

In [None]:
for token in token_corpus[0]:
    print(f"Token: {token.text}\nLemma: {token.lemma_}\nPOS: {token.pos_}\nTag: {token.tag_}\nDependency: {token.dep_}\nShape: {token.shape_}\nIs Alpha: {token.is_alpha}\nIs Stop: {token.is_stop}\n")

In [46]:
def print_cleaned_tokens(token_corpus):
    cleaned_tokens = []

    for token in token_corpus[1]:
        if token.is_alpha:
            cleaned_tokens.append(token)
        
    for token in cleaned_tokens:
        print(token.lemma_.lower())

print_cleaned_tokens(token_corpus)


on
july
the
ka
kahnistensera
mohawk
mothers
be
verbally
assault
by
security
personnel
on
mcgill
new
vic
project
site
a
week
later
on
mcgill
issue
a
comprehensive
press
release
about
the
ongoing
investigation
into
the
new
vic
project
site
announce
that
nine
anomaly
be
find
during
archaeological
work
however
the
mohawk
mothers
soon
release
their
own
statement
allege
that
mcgill
and
the
société
québécoise
des
infrastructure
sqi
have
fail
to
report
all
the
finding
on
the
site
and
isolate
the
mothers
from
the
investigation
the
mothers
have
be
present
for
the
archaeological
excavation
on
the
new
vic
site
alongside
archaeologist
and
security
personnel
hire
by
sqi
as
stipulate
by
their
settlement
agreement
with
mcgill
the
sqi
the
royal
victoria
hospital
rvh
the
city
of
montreal
and
the
attorney
general
of
canada
court
appoint
indigenous
cultural
monitor
who
be
responsible
for
perform
cultural
ceremony
and
ensure
that
excavation
be
complete
in
accordance
with
indigenous
protocol
have
also
be
pr

In [49]:
from collections import Counter

def noun_phrase_counter(token_corpus):
    noun_phrases = Counter()

    for doc in token_corpus:
        for np in doc.noun_chunks:
            noun_phrases[np.text] += 1

    return noun_phrases

for np, count in noun_phrase_counter(token_corpus).most_common(100):
    print(np, count)

that 164
it 160
we 156
they 121
the site 104
McGill 103
I 76
which 72
It 65
the Mohawk Mothers 63
We 63
who 57
this 53
unmarked graves 50
McGill University 47
them 45
what 45
Kahentinetha 43
us 41
he 40
the SQI 38
They 38
Kwetiio 38
she 37
you 36
Canada 33
SQI 31
court 31
the Mothers 29
the land 29
Montreal 27
Quebec 27
part 26
This 26
human remains 24
graves 23
The Mohawk Mothers 22
Mohawk Mothers 20
Indigenous people 19
He 19
the work 18
the New Vic project 18
The group 18
the Kahnistensera 17
the case 17
Indigenous children 17
the university 17
The SQI 16
ka Kahnistensera 16
the grounds 16
our children 16
Indigenous peoples 16
the Royal Victoria Hospital 16
Tiohtià 16
ke 16
work 15
place 15
children 15
evidence 15
Indigenous Peoples 15
the Allan Memorial Institute 15
the 1950s 15
people 15
Montréal 15
That 14
reconciliation 14
an interview 14
me 14
She 14
all 14
the city 14
the island 14
the panel 13
things 13
April 13
some 12
the investigation 12
the project 12
the agreement 12
the

## comparing noun phrases in documents with indigenous tag to those in documents without

a function should use the nlp objects in corpus["text"] (in the dictionary corpus from earlier). It should return a list of the most common noun phrases in the documents with the tag "indigenous" and the most common noun phrases in the documents without the tag "indigenous".

a comparison function should generate the relative odds of a noun phrase being in a document with the tag "indigenous" vs. a document without the tag "indigenous". It should return a list of the noun phrases with the highest relative odds. It should use a collections.Counter object to count the noun phrases and then either a Counter or a dict to store the relative odds.

to the greatest extent possible, the functions should use snippets of code from the spacy_corpus_analysis notebook.

In [54]:
#The code is defining two functions: `get_common_noun_phrases` and `get_relative_odds`. These functions are used to analyze a corpus of text and calculate the common noun phrases and their relative odds based on a given tag.
from collections import Counter
from spacy.tokens import Doc

#This code is adding a new attribute called "tags" to the `Doc` class in the `spacy.tokens` module. This attribute will be used to store the tags associated with each document in the corpus.
# Add a new attribute to the Doc class to store the tags
if not Doc.has_extension("tags"):
    Doc.set_extension("tags", default=[])

#The code is adding the tags from the metadata to the `Doc` objects in the corpus. It iterates over each document in the corpus and its corresponding metadata, and assigns the tags from the metadata to the `tags` attribute of the `Doc` object. This allows each document to have associated tags that can be used for further analysis.
# Add the tags from the metadata to the Doc objects
for doc, metadata in zip(token_corpus, corpus):
    doc._.tags = metadata['metadata']['tags']

# The `get_common_noun_phrases` function takes in a corpus of text and a tag as parameters. It initializes two counters, `noun_phrases_with_tag` and `noun_phrases_without_tag`, to keep track of the noun phrases that have the given tag and those that don't.
def get_common_noun_phrases(corpus, tag):
    noun_phrases_with_tag = Counter()
    noun_phrases_without_tag = Counter()

    for doc in corpus['text']:
        if tag in doc._.tags:
            noun_phrases_with_tag.update(np.text for np in doc.noun_chunks)
        else:
            noun_phrases_without_tag.update(np.text for np in doc.noun_chunks)

    return noun_phrases_with_tag.most_common(), noun_phrases_without_tag.most_common()

#The `get_relative_odds` function is calculating the relative odds of noun phrases with a given tag compared to noun phrases without that tag in a corpus of text.
def get_relative_odds(corpus, tag):
    noun_phrases_with_tag, noun_phrases_without_tag = get_common_noun_phrases(corpus, tag)

    relative_odds = Counter()

    for np, count in noun_phrases_with_tag.items():
        relative_odds[np] = count / (noun_phrases_without_tag[np] + 1e-7)

    return relative_odds.most_common()

In [55]:
get_relative_odds(corpus, 'indigenous')

TypeError: list indices must be integers or slices, not str