In [8]:
from docx import Document
import pandas as pd
from datetime import datetime
import os
import time

In [2]:
import glob
docx_files = glob.glob(os.path.join('..', 'data', 'raw', '*', '*.DOCX'))
print("Files to process: ", len(docx_files))

Files to process:  88


In [None]:
def clean_date(cleaned_date_str):
    parsed_date = datetime.strptime(cleaned_date_str, '%B %d, %Y')
    return parsed_date.strftime('%Y-%m-%d')

In [3]:
def extract_articles_to_dataframe(doc_path):
    # Start timer
    start_time = time.time()
    print("Processing document: ", doc_path)
    doc = Document(doc_path)
    count = 0
    articles = []
    while count < len(doc.paragraphs):
        #for para in doc.paragraphs:
        para = doc.paragraphs[count]
        count += 1
        if "Heading 1" in para.style.name: # @Guo: note that Docx files contain metadate on each paragraph, such as the paragraph style, not just the text. I leverage this to obtain the titles.
            article = {}
            correction_appended = False
            article['title'] = para.text.strip()
            article['publisher'] =  doc.paragraphs[count].text  
            raw_date = doc.paragraphs[count+1].text
            # Remove trailing weekday if present (e.g., "Tuesday")
            date_parts = raw_date.split()[:3]  # "November 19, 2024"
            cleaned_date_str = " ".join(date_parts).strip(",")
            try:
                article['date'] = clean_date(cleaned_date_str)
            except:
                if doc.paragraphs[count+2].text.strip("\n ") == "Correction Appended":
                    correction_appended = True
            #except ValueError:
             #   continue
            for i in range(3,9):
                text = doc.paragraphs[count+i].text
                if text[:8] == "Section:":
                    section = text[9:]
                    article["section"] = section
                if text[:7] == "Length:":
                    article["length"] = text[8:]
                if text == "Body":
                    body = []
                    new_count = count + i + 1
                    continue
            while text.strip().lower() != "end of document":
                text = doc.paragraphs[new_count].text
                body.append(text)
                new_count += 1
            article["body"] = "".join(body[:-1])
            if correction_appended:
                date_parts  = article["body"].split("Correction-Date: ")[-1].split()[:3]
                cleaned_date_str = " ".join(date_parts).strip(",")
                article['date'] = clean_date(cleaned_date_str)
            count = new_count
            article["correction_appended"] = correction_appended
            articles.append(article)
    print("Number of articles: ", len(articles))

    df = pd.DataFrame(articles)
    df["source_file"] = os.path.basename(doc_path)

    # End timer
    end_time = time.time()

    # Calculate the difference
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time / 60:.2f} minutes")
    return df

In [5]:
# temp
docx_files = docx_files[:5]

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Define a helper function to process a single file

# Use ThreadPoolExecutor to process files in parallel
with ThreadPoolExecutor() as executor:
    results = list(executor.map(extract_articles_to_dataframe, docx_files))
    
# Concatenate all resulting DataFrames
df_articles = pd.concat(results, ignore_index=True)

Processing document:  ../data/raw/Other publishers/Files (500) (21).DOCX
Processing document:  ../data/raw/Other publishers/Files (500) (2).DOCX
Processing document:  ../data/raw/Other publishers/Files (500) (23).DOCX
Processing document:  ../data/raw/Other publishers/Files (500) (27).DOCX
Processing document:  ../data/raw/Other publishers/Files (500) (4).DOCX


In [None]:
from pathlib import Path

folder = Path("../data/processed/")
folder.mkdir(parents=True, exist_ok=True)

df_articles.to_pickle("../data/processed/parsed_articles.pkl")

In [None]:


# check if "processed" folder exists. If not create it before saving the file

# number the script. e.g. 1 for parsing, 2 for cleaning.
