Parsing Code For NYT

In [None]:
import os
from docx import Document
import pandas as pd
from tqdm import tqdm
import re

folder_path = '/Users/jingguo/Desktop/NYT'

# Gather the ".DOCX" files in the folder (names starting with "~" are skipped)
# and order them by the integer that precedes the first "." / "-" in the name.
docx_files = sorted(
    (
        name for name in os.listdir(folder_path)
        if not name.startswith('~') and name.endswith('.DOCX')
    ),
    key=lambda name: int(name.partition('.')[0].partition('-')[0]),
)

# Helper to break text into article blocks
def extract_article_blocks(text):
    """Split *text* on "End of Document" and return the non-empty pieces.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped.
    """
    blocks = []
    for chunk in text.split("End of Document"):
        cleaned = chunk.strip()
        if cleaned:
            blocks.append(cleaned)
    return blocks

# A header line containing both a whole-word month name and a standalone
# four-digit year is treated as the article's date line.
_MONTH_RE = re.compile(
    r"\b(?:January|February|March|April|May|June|July|August"
    r"|September|October|November|December)\b"
)
_YEAR_RE = re.compile(r"\b\d{4}\b")


# Logic to extract metadata + body from each article block
def parse_article(text_block):
    """Extract title, date, section, length and body from one article block.

    The block is split into stripped, non-empty lines. The lines before the
    first line that is exactly "Body" are treated as the header (all lines
    when there is no "Body" line).

    Args:
        text_block: Text of a single article.

    Returns:
        dict with keys:
            "title":   the header line immediately before the first header
                       line containing "The New York Times", unless that line
                       is "user name" / "no headline in original"
                       (case-insensitive); otherwise "No headline".
            "date":    the first header line after the publication line that
                       has a month name and a four-digit year; if there is
                       none, the first line anywhere containing four
                       consecutive digits; otherwise None.
            "section": first line starting with "section" (any case) or None.
            "length":  first line starting with "length" (any case) or None.
            "body":    lines between "Body" and the next "Graphic" line (or
                       the end of the block), joined with newlines; "" when
                       there is no "Body" line.
    """
    lines = [line.strip() for line in text_block.split("\n") if line.strip()]

    body_idx = lines.index("Body") if "Body" in lines else None
    # Restrict title/date detection to the header so that a mention of the
    # paper (or of a year) inside the article text cannot be mistaken for
    # metadata.
    header = lines if body_idx is None else lines[:body_idx]

    #  Title before "The New York Times"
    marker_idx = next(
        (i for i, line in enumerate(header) if "The New York Times" in line),
        None,
    )
    title = None
    if marker_idx is not None and marker_idx > 0:
        candidate = header[marker_idx - 1]
        if candidate.lower() not in ("user name", "no headline in original"):
            title = candidate
    title = title or "No headline"

    #  Date: look after the publication line first, so a headline such as
    #  "Class of 2024 ..." is not returned as the date.
    after_marker = header if marker_idx is None else header[marker_idx + 1:]
    date = next(
        (
            line for line in after_marker
            if _MONTH_RE.search(line) and _YEAR_RE.search(line)
        ),
        None,
    )
    if date is None:
        # Previous rule, kept as a last resort for dates without a month name.
        date = next((line for line in lines if re.search(r"\d{4}", line)), None)

    #  Section and Length
    section = next((line for line in lines if line.lower().startswith("section")), None)
    length = next((line for line in lines if line.lower().startswith("length")), None)

    #  Body text
    body = ""
    if body_idx is not None:
        body_start = body_idx + 1
        try:
            # Only a "Graphic" line *after* "Body" ends the body.
            body_end = lines.index("Graphic", body_start)
        except ValueError:
            body_end = len(lines)
        # NOTE(review): without a "Graphic" line everything up to the end of
        # the block is kept; presumably that can include trailing metadata
        # lines - confirm against the exported files.
        body = "\n".join(lines[body_start:body_end])

    return {
        "title": title,
        "date": date,
        "section": section,
        "length": length,
        "body": body
    }

#  Parse all files: one record per article, tagged with the file it came from
records = []
for file in tqdm(docx_files, desc="Parsing DOCX files"):
    doc = Document(os.path.join(folder_path, file))
    full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
    records.extend(
        {**parse_article(block), "source_file": file}
        for block in extract_article_blocks(full_text)
    )

#  Convert to DataFrame and save as CSV next to the source files
output_path = os.path.join(folder_path, "nyt_1980_2024_parsed_clean.csv")
df = pd.DataFrame(records)
df.to_csv(output_path, index=False)

print(f"✅ Saved to: {output_path}")


Parsing Code For Other Publishers

In [None]:
import os
from docx import Document
import pandas as pd
from tqdm import tqdm

# === Setup ===
folder_path = '/Users/jingguo/Desktop/OPT/NLP/NYT/Archive'
output_csv = os.path.join(folder_path, 'publisher_articles_parsed.csv')

# === File list ===
# ".DOCX" files (names starting with "~" are skipped), ordered by the integer
# found between the first "(" and the following ")" of the part of the name
# before its first ".".
docx_files = sorted(
    (
        name for name in os.listdir(folder_path)
        if not name.startswith('~') and name.endswith('.DOCX')
    ),
    key=lambda name: int(name.partition('.')[0].split('(')[1].partition(')')[0]),
)

def extract_article_blocks(text):
    """Return the stripped, non-empty pieces of *text* split on "End of Document"."""
    stripped_pieces = map(str.strip, text.split("End of Document"))
    # filter(None, ...) drops the pieces that are empty after stripping.
    return list(filter(None, stripped_pieces))

def parse_article(text_block):
    """Extract title, date, section, length and body from one article block.

    The block is split into stripped, non-empty lines. The lines before the
    first line that is exactly "Body" are treated as the header (all lines
    when there is no "Body" line).

    Args:
        text_block: Text of a single article.

    Returns:
        dict with keys:
            "title":   the header line immediately before the first header
                       line containing "The New York Times", unless that line
                       is "user name" / "no headline in original"
                       (case-insensitive); otherwise "No headline".
            "date":    the first header line after the publication line (or
                       the first header line at all, when there is no
                       publication line) that has a whole-word month name and
                       a four-digit year; if there is none, the first line
                       anywhere containing a month name as a substring;
                       otherwise None.
            "section": first line starting with "section" (any case) or None.
            "length":  first line starting with "length" (any case) or None.
            "body":    lines between "Body" and the next "Graphic" line (or
                       the end of the block), joined with newlines; "" when
                       there is no "Body" line.
    """
    # Imported here because this cell's import list does not include ``re``.
    import re

    months = [
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December"
    ]
    month_pattern = r"\b(?:" + "|".join(months) + r")\b"
    year_pattern = r"\b\d{4}\b"

    lines = [line.strip() for line in text_block.split("\n") if line.strip()]

    body_idx = lines.index("Body") if "Body" in lines else None
    # Restrict title/date detection to the header so that article text that
    # happens to mention the paper or a month cannot be mistaken for metadata.
    header = lines if body_idx is None else lines[:body_idx]

    # Title line before "The New York Times" or fallback
    # NOTE(review): the marker is still "The New York Times"; for other
    # publishers it presumably never appears in the header, so the title
    # falls back to "No headline" - confirm whether that is intended.
    marker_idx = next(
        (i for i, line in enumerate(header) if "The New York Times" in line),
        None,
    )
    title = None
    if marker_idx is not None and marker_idx > 0:
        candidate = header[marker_idx - 1]
        if candidate.lower() not in ("user name", "no headline in original"):
            title = candidate
    title = title or "No headline"

    # Date line: require a whole-word month plus a year so that headlines
    # such as "Mayor ..." or "Marching ..." are not returned as the date.
    after_marker = header if marker_idx is None else header[marker_idx + 1:]
    date = next(
        (
            line for line in after_marker
            if re.search(month_pattern, line) and re.search(year_pattern, line)
        ),
        None,
    )
    if date is None:
        # Previous rule, kept as a last resort for date lines without a year.
        date = next(
            (line for line in lines if any(month in line for month in months)),
            None,
        )

    # Section and Length lines
    section = next((line for line in lines if line.lower().startswith("section")), None)
    length = next((line for line in lines if line.lower().startswith("length")), None)

    # Body extraction
    body = ""
    if body_idx is not None:
        body_start = body_idx + 1
        try:
            # Only a "Graphic" line *after* "Body" ends the body.
            body_end = lines.index("Graphic", body_start)
        except ValueError:
            body_end = len(lines)
        body = "\n".join(lines[body_start:body_end])

    return {
        "title": title,
        "date": date,
        "section": section,
        "length": length,
        "body": body
    }

# === Parse all files ===
# One record per article, tagged with the file it came from.
records = []
for file in tqdm(docx_files, desc="Parsing publisher DOCX files"):
    doc = Document(os.path.join(folder_path, file))
    full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
    records.extend(
        {**parse_article(block), "source_file": file}
        for block in extract_article_blocks(full_text)
    )

# === Save output ===
df = pd.DataFrame(records)
article_count = len(df)
print(f"✅ Total articles parsed: {article_count}")
df.to_csv(output_csv, index=False)
print(f"📁 Saved to: {output_csv}")


Merging

In [None]:
import pandas as pd

# === 1. File paths ===
nyt_path = "/Users/jingguo/Desktop/OPT/NLP/NYT/nyt_1980_2024_articles_block_parsed.csv"
other_path = "/Users/jingguo/Desktop/OPT/NLP/NYT/Archive/publisher_articles_parsed.csv"
output_path = "/Users/jingguo/Desktop/OPT/NLP/NYT/combined_articles_1980_2024.csv"

# === 2. Load datasets ===
df_nyt = pd.read_csv(nyt_path)
df_other = pd.read_csv(other_path)

# Tag every row with the group it came from so it survives the merge
for frame, group_label in ((df_nyt, "NYT"), (df_other, "Other Publishers")):
    frame["source_group"] = group_label

# === 3. Merge ===
# Stack the two tables row-wise and renumber the index from 0.
frames = [df_nyt, df_other]
df_combined = pd.concat(frames, ignore_index=True)

# === 4. Save ===
df_combined.to_csv(output_path, index=False)
print(f"✅ Combined CSV saved to: {output_path}")
print(f"🔢 Total articles: {len(df_combined)}")
