In [1]:
import pandas as pd
from utils import *
import os
import fitz
from pdf2image import convert_from_path
import pytesseract
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on further down:
# word_tokenize (punkt_tab), stopwords.words (stopwords) and WordNetLemmatizer (wordnet).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jerramclaughlin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerramclaughlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jerramclaughlin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Step 1: Change Working Directory

In [2]:
# Run the rest of the notebook from this policy's data folder.
# NOTE(review): absolute, machine-specific path — presumably the PDF paths stored in the
# spreadsheet are resolved relative to it; confirm and adjust for your environment.
os.chdir('/Users/jerramclaughlin/Policy-Language-Analysis/data/BP_AR_3514.1')

In [3]:
# Sanity check: show the directory the following cells will run from.
print(os.getcwd())

/Users/jerramclaughlin/Policy-Language-Analysis/data/BP_AR_3514.1


#### Step 2: Load Data

In [4]:
# Location of the Excel workbook that is read into `policy_pdfs` below.
# NOTE(review): absolute, machine-specific path — change it to match your environment.
policy_pdf_collection_doc_path = "/Users/jerramclaughlin/Policy-Language-Analysis/data/BP_AR_3514_1.xlsx" # insert pdf collection doc path here

# Read the workbook into a DataFrame (pandas' default sheet selection is used).
policy_pdfs = pd.read_excel(policy_pdf_collection_doc_path) 

#### Step 3: Extract Text from PDFs

In [5]:
# Change dtype from float to object for the text columns so they can hold extracted strings
# (presumably they load as float because they are empty in the workbook — confirm).
policy_pdfs[['AR3514.1: PDF Text', 'BP3514.1: PDF Text']] = policy_pdfs[['AR3514.1: PDF Text', 'BP3514.1: PDF Text']].astype(object)

In [6]:
# Extract text for the AR (3514.1) documents: reads the 'AR3514.1: Path to PDF' column and
# writes into 'AR3514.1: PDF Text'.
# NOTE(review): extract_and_update_pdf_texts comes from utils; its exact behavior is assumed
# from its name and arguments — confirm against utils.
policy_pdfs = extract_and_update_pdf_texts(policy_pdfs, 'AR3514.1: Path to PDF', 'AR3514.1: PDF Text')

# Check that text has been extracted for all relevant rows and PDFs
check_and_report_missing_texts(policy_pdfs, 'AR: 3514.1 Hazardous Substances', 'AR3514.1: Path to PDF', 'AR3514.1: PDF Text')

MuPDF error: format error: No default Layer config

All relevant rows have text extracted.


In [7]:
# Extract text for the BP (3514.1) documents: reads the 'BP3514.1: Path to PDF' column and
# writes into 'BP3514.1: PDF Text'.
# NOTE(review): extract_and_update_pdf_texts comes from utils; its exact behavior is assumed
# from its name and arguments — confirm against utils.
policy_pdfs = extract_and_update_pdf_texts(policy_pdfs, 'BP3514.1: Path to PDF', 'BP3514.1: PDF Text')

# Check that text has been extracted for all relevant rows and PDFs
check_and_report_missing_texts(policy_pdfs, 'BP: 3514.1 Hazardous Substances', 'BP3514.1: Path to PDF', 'BP3514.1: PDF Text')

MuPDF error: format error: No default Layer config

All relevant rows have text extracted.


#### Step 4: Clean Up Text

In [25]:
def clean_pdf_text(text):
    """
    Cleans up textual data by lowercasing all letters, removing punctuation, removing stop words,
    and lemmatizing.

    Processing order:
      1. Remove an "Education code ..." run (see the review note in the body).
      2. Remove the first "legal reference" and everything after it.
      3. Lowercase and tokenize with NLTK's word_tokenize.
      4. Strip punctuation from each token and drop tokens left empty.
      5. Drop English stop words, then lemmatize with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw text extracted from a policy PDF. None and float NaN (missing text)
        are also accepted and treated as empty text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; '' when the
        input is missing or nothing survives the cleaning.
    """
    # Missing text (None or float NaN, e.g. an unfilled DataFrame cell) has nothing to
    # clean; return '' rather than letting re.sub raise a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # NOTE(review): without re.MULTILINE/re.DOTALL, '$' only anchors at the end of the
    # whole string and '.' stops at a newline, so this only removes "Education code ..."
    # when it sits on the final line of the text. Confirm that is the intent.
    text = re.sub(r'Education code.*$', '', text, flags=re.IGNORECASE)
    # Remove 'legal reference' and everything after it (DOTALL lets '.' cross newlines)
    text = re.sub(r'legal reference.*', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = text.lower() # Lowercase text
    tokens = word_tokenize(text) # Tokenize text

    # Strip punctuation once per token, then drop tokens that were punctuation only
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in tokens)
    tokens = [token for token in stripped_tokens if token]

    stop_words = set(stopwords.words('english')) # Define stop words
    tokens = [word for word in tokens if word not in stop_words] # Remove stop words

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatize

    # Join tokens back into a single string
    return ' '.join(tokens)

#### Step 5: Save Cleaned Text to the cleaned_data Folder for Analysis

In [26]:
# clean_pdf_text only takes str, so cast the text columns to string first.
# Missing texts (NaN/None) are replaced with '' before the cast: a plain astype(str)
# would turn them into the literal words "nan"/"None", which would then be cleaned
# and stored as if they were policy text.
text_columns = ['BP3514.1: PDF Text', 'AR3514.1: PDF Text']
policy_pdfs[text_columns] = policy_pdfs[text_columns].fillna('').astype(str)

# apply function to all rows in text columns
for column in text_columns:
    policy_pdfs[column] = policy_pdfs[column].apply(clean_pdf_text)

In [27]:
# Save as csv in the cleaned_data folder.
# index must be the boolean False: the string 'False' is truthy, so pandas would still
# write the row index as an extra unnamed first column.
policy_pdfs.to_csv('/Users/jerramclaughlin/Policy-Language-Analysis/cleaned_data/BP_AR_3514.1.csv', index=False)