In [16]:
import nltk
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import re
import glob
from nltk.corpus import stopwords
import os
from nltk.collocations import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import altair as alt


In [17]:
stoplist = stopwords.words('english')
stoplist.extend([")", "(", ".", ",", "?", "could", "would", "“", "”", "’", ";", "!","much", "like", "one", "many", "though", "without", "upon"])

In [18]:
def read_and_tokenize(directory_name):
    files = os.listdir(directory_name)
    listoftokens = []
    for file in files:
        if file.endswith(".txt"):
            with open(os.path.join(directory_name, file), 'r') as f:
                text = f.read()
                alltext = re.sub("\n", " ", text)
                alltokens = nltk.word_tokenize(alltext)
                listoftokens.append(alltokens)
    return listoftokens
pagelist = read_and_tokenize("Endangered_animals")


In [19]:
def common_bigrams(tokenlist, display_limit=10):
    bigrams = nltk.ngrams(tokenlist, 2)
    bigramlist = list(bigrams)

    print("** Most frequent bigrams **")
    bigramfreq = nltk.FreqDist(bigramlist)
    for bigram, _ in bigramfreq.most_common(display_limit):
        print(f"{bigram[0]} {bigram[1]}")

    print("\n** Most frequent bigrams with no stop words **")
    count = 0
    for b, freq in bigramfreq.most_common(50):
        if b[0].lower() not in stoplist and b[1].lower() not in stoplist:
            if count < display_limit:
                print(f"{b[0]} {b[1]}")
                count += 1
            else:
                break


In [20]:
for each in pagelist:
    if len(each) > 1000:
        common_bigrams(each, display_limit=5)  # Only the top 5 bigrams will be printed


** Most frequent bigrams **
. The
of the
, and
wattled curassow
in the

** Most frequent bigrams with no stop words **
wattled curassow
C. globulosa
black plumage
** Most frequent bigrams **
. The
in the
of the
, the
, and

** Most frequent bigrams with no stop words **
queen conch
also used
conch shell
musical instrument
conch meat
** Most frequent bigrams **
fur seals
Galápagos fur
. The
of the
, and

** Most frequent bigrams with no stop words **
fur seals
Galápagos fur
fur seal
El Niño
Galápagos Islands
** Most frequent bigrams **
of the
. The
in the
, which
for the

** Most frequent bigrams with no stop words **
bird show
working group
Endangered Parrots
parrot species
Santa Marta
** Most frequent bigrams **
. The
, and
olive ridley
of the
in the

** Most frequent bigrams with no stop words **
olive ridley
ridley sea
sea turtle
olive ridleys
nesting females
** Most frequent bigrams **
African bush
. The
, and
bush elephant
. In

** Most frequent bigrams with no stop words **
Afric

In [21]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
def print_collocations(tokenlist):

    print("** Common Collocations **"),
    finder = BigramCollocationFinder.from_words(tokenlist, window_size = 5)
    finder.apply_freq_filter(2)
    # print(finder.nbest(bigram_measures.pmi, 10)),
    for c in finder.nbest(bigram_measures.pmi, 10):
        print(" ".join(c))
    print("\n")

In [22]:
for each in pagelist:
    if len(each) > 1000:
        print_collocations(each)

** Common Collocations **
A captive
Development Reserve
Mamirauá Development
Mamirauá Reserve
Mamirauá Sustainable
Sustainable Development
Sustainable Reserve
gallery forests
von Spix
yellow-knobbed daubentoni


** Common Collocations **
Lord Shiva
Turks Caicos
United States
closely related
consisting marinated
lime juice
marinated juice
marinated lime
organic matter
salads cooked


** Common Collocations **
infectious diseases
kg lb
weigh kg
19th century
Isabela Island
National Park
able recover
average weigh
environmental factors
grow m


** Common Collocations **
interested parties
: für
Santa Marta
close with
have been
members interested
same time
Apart from
Endangered Parrots
been held


** Common Collocations **
For example
named Chelonia
played role
practice allowing
raccoons coyotes
sexual maturity
sharks crocodiles
Species Convention
Eschscholtz 1829
days conditions


** Common Collocations **
Palaeoloxodon iolensis
licks visited
mineral licks
IUCN List
IUCN Red
Red List
Since

In [23]:
directoryname = "Endangered_animals"

text_files = glob.glob(directoryname + "/*.txt")
file_names = [Path(text).stem for text in text_files]


tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [24]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=file_names, columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

Unnamed: 0,document,term,tfidf
225521,African_bush_elephant,elephants,0.405020
225519,African_bush_elephant,elephant,0.366639
223898,African_bush_elephant,bulls,0.343386
223050,African_bush_elephant,african,0.302098
223918,African_bush_elephant,bush,0.275296
...,...,...,...
245427,White-collared_kite,flight,0.086623
244796,White-collared_kite,eastern,0.081682
243166,White-collared_kite,brazil,0.072960
243208,White-collared_kite,broad,0.072960


In [25]:
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)
top_tfidf[top_tfidf['term'].str.contains('animal')]

Unnamed: 0,document,term,tfidf
793933,Michoacan_pocket_gopher,animal,0.093273


In [26]:
top_tfidf[top_tfidf['document'].str.contains('Anacropora_spinosa')]

Unnamed: 0,document,term,tfidf
456833,Anacropora_spinosa,coral,0.517546
455342,Anacropora_spinosa,anacropora,0.300993
462835,Anacropora_spinosa,spinosa,0.300993
455884,Anacropora_spinosa,bleaching,0.200662
456834,Anacropora_spinosa,corallites,0.200662
461828,Anacropora_spinosa,reef,0.172515
455968,Anacropora_spinosa,branches,0.149791
463190,Anacropora_spinosa,susceptible,0.149791
460936,Anacropora_spinosa,pacific,0.127904
462484,Anacropora_spinosa,shallow,0.127904


In [27]:
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

(heatmap + text).properties(width = 600)