<a href="https://colab.research.google.com/github/jazzjastine/Natural_Language_Processing/blob/main/Mini_Corpus_Programming_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
!pip install pdfplumber gensim spacy
!python -m spacy download en_core_web_sm

import pdfplumber
import spacy
from gensim import corpora, models
from gensim.models.phrases import Phrases, Phraser

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [30]:
# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

In [31]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    all_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_text += ' ' + text
    return all_text

In [32]:
# Function to Preprocess Text with spaCy
def preprocess_text(document):
    # Parse the document with spaCy
    doc = nlp(document)
    # Filter out punctuation, stopwords, and lemmatize
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.like_num]

In [33]:
# Google drive folder
folder_path = '/content/drive/My Drive/NLP_PDFs'

# Example list of PDF filenames
pdf_filenames = ["sustainability-11-02738.pdf", "PIIS2405844019358414.pdf", "kundzewicz-et-al-2018-flood-risk-reduction-structural-measures-and-diverse-strategies.pdf", "Academic Emergency Medicine - 2008 - Lai - Strategies of Disaster Response in the Health Care System for Tropical Cyclones .pdf", "1-s2.0-S2225603221000138-main.pdf", "1-s2.0-S2225603219300475-main.pdf"]

In [48]:
# Extract and preprocess text from PDFs
processed_texts = []
for filename in pdf_filenames:
    pdf_path = f"{folder_path}/{filename}"
    text = extract_text_from_pdf(pdf_path)
    processed_text = preprocess_text(text)
    processed_texts.append(processed_text)

In [50]:
# 1. Preprocess and Vectorization using Gensim
dictionary = corpora.Dictionary(processed_texts)

In [65]:
# 2. Document to Bag-of-Words
corpus = [dictionary.doc2bow(text) for text in processed_texts]

In [66]:
# Storing your generated corpus
corpora.MmCorpus.serialize('disaster_corpus.mm', corpus)

In [67]:
# 3. Bag-of-Words to TF-IDF Representation
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

In [68]:
# 4. N-Gramming
bigram_phrases = Phrases(processed_texts, min_count=3, threshold=10)
bigram = Phraser(bigram_phrases)
texts_with_bigrams = [bigram[text] for text in processed_texts]

In [69]:
# Update dictionary and corpus to include bigrams
dictionary.add_documents(texts_with_bigrams)

In [70]:
# Filter out tokens that appear too frequently or too infrequently
dictionary.filter_extremes(no_below=2, no_above=0.9)

In [71]:
# Re-create corpus after filtering
corpus_filtered = [dictionary.doc2bow(text) for text in texts_with_bigrams]

In [74]:
print("First Document's Original Tokens:", processed_texts[0])
print("\nFirst Document's Bag-of-Words Representation:", corpus[0])
print("\nFirst Document's TF-IDF Representation:", list(corpus_tfidf)[0])
print("\nFirst Document with Bigrams:", texts_with_bigrams[0])
print("\nDocument to Bag-of-Words Representation after filtering:", corpus_filtered[0])
print("\nDictionary after filtering extremes and adding bigrams:", dictionary.token2id)


First Document's Bag-of-Words Representation: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (1