In [7]:
import sys; sys.path.insert(0, "../")
from typing import List
import re
import os
import shutil

import pandas as pd
import deba

from lib.dropbox import sync_local_to_dropbox
from lib.pdf import subset_pdf


In [None]:
# minutes/export/src/subset-pdfs.R


def create_pdf_index(hrgs: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    """Export one source document per selected hearing document and index them.

    Hearings are kept when they have a non-null ``hrg_acc_uid`` or when their
    ``hrg_type`` is ``"police"`` or ``"unknown"``. Each distinct
    (docid, fileid, doc_pg_from, doc_pg_to) combination is left-joined to the
    ``meta`` rows whose ``filetype`` is ``"word"`` or ``"pdf"``, then written
    under ``MINUTES_LOCAL_ROOT/pdfs``: word files are copied whole, everything
    else goes through ``subset_pdf`` with the page range.

    Args:
        hrgs: frame with columns ``docid``, ``fileid``, ``doc_pg_from``,
            ``doc_pg_to``, ``hrg_acc_uid`` and ``hrg_type``.
        meta: frame with columns ``fileid``, ``filepath`` and ``filetype``.

    Returns:
        Frame with columns ``docid``, ``page_count`` and ``pdfname`` (the
        output path relative to ``MINUTES_LOCAL_ROOT``).
    """
    meta = meta.loc[
        meta.filetype.isin(["word", "pdf"]), ["fileid", "filepath", "filetype"]
    ]
    wanted = hrgs.hrg_acc_uid.notna() | hrgs.hrg_type.isin(["police", "unknown"])
    # NOTE(review): with how="left", a hearing whose fileid has no word/pdf
    # entry in meta keeps a NaN filepath/filetype and reaches subset_pdf below.
    # Confirm whether such rows can occur.
    driver = (
        hrgs.loc[wanted, ["docid", "fileid", "doc_pg_from", "doc_pg_to"]]
        .drop_duplicates()
        .merge(meta, on="fileid", how="left")
    )
    # Lower-cased extension of the source file (text after the last dot).
    driver["ext"] = driver.filepath.str.replace(
        r".+\.(\w+)$", r"\1", regex=True
    ).str.lower()
    # Built per row without DataFrame.apply: apply defaults to axis=0 and would
    # hand the lambda whole columns instead of rows.
    driver["pdfname"] = [
        os.path.join("pdfs", "%s.%s" % (docid, ext))
        for docid, ext in zip(driver.docid, driver.ext)
    ]
    # NOTE(review): if doc_pg_from/doc_pg_to are both inclusive this is one
    # page short of the real count - confirm against subset_pdf's convention.
    driver["page_count"] = driver.doc_pg_to - driver.doc_pg_from

    # Make sure the output folder exists before any file is written into it.
    os.makedirs(os.path.join(MINUTES_LOCAL_ROOT, "pdfs"), exist_ok=True)
    for _, row in driver.iterrows():
        new_filepath = os.path.join(MINUTES_LOCAL_ROOT, row.pdfname)
        if row.filetype == "word":
            shutil.copy(row.filepath, new_filepath)
        else:
            subset_pdf(row.filepath, new_filepath, row.doc_pg_from, row.doc_pg_to)
    return driver[["docid", "page_count", "pdfname"]]

In [None]:
# minutes/export/src/docs2txt.R


def create_txt_index(docs: pd.DataFrame) -> pd.DataFrame:
    """Write one combined text file per document and return an index of them.

    Pages are ordered by ``docid``, ``fileid`` and ``pageno``; the ``text`` of
    all pages of a document is joined with newlines, runs of three or more
    newlines (optionally separated by whitespace) are collapsed to one blank
    line, and the stripped result is written as UTF-8 to
    ``MINUTES_LOCAL_ROOT/txt/<docid>.txt``.

    Args:
        docs: frame with columns ``docid``, ``fileid``, ``pageno`` and ``text``.

    Returns:
        Frame with one row per document and columns ``docid`` and ``txtname``
        (the output path relative to ``MINUTES_LOCAL_ROOT``). An empty input
        yields an empty frame with the same two columns.
    """
    newlines_re = re.compile(r"(?:\n\s*){3,}")
    ordered = docs.sort_values(by=["docid", "fileid", "pageno"])

    records = []
    # Iterate over the groups explicitly: groupby(...).agg(func) calls func
    # once per column, so it never sees the whole page frame of a document.
    for docid, rows in ordered.groupby("docid"):
        txtname = os.path.join("txt", "%s.txt" % docid)
        txtpath = os.path.join(MINUTES_LOCAL_ROOT, txtname)
        os.makedirs(os.path.dirname(txtpath), exist_ok=True)
        # Pattern.sub takes (replacement, string) in that order.
        text = newlines_re.sub("\n\n", rows.text.str.cat(sep="\n")).strip()
        with open(txtpath, "w", encoding="utf-8") as f:
            f.write(text)
        records.append({"docid": docid, "txtname": txtname})

    return pd.DataFrame(records, columns=["docid", "txtname"])

In [None]:
# minutes/export/src/dropbox-up.R

# Second positional argument given to sync_local_to_dropbox.
# Presumably the destination folder on Dropbox - confirm against lib.dropbox.
MINUTES_DB_ROOT = "/PPACT/meeting-minutes-extraction/export"
# Local root that the "pdfs/..." and "txt/..." output names are joined onto, and
# the first positional argument given to sync_local_to_dropbox.
# The functions in the cells above read this name when they are called, so this
# cell has to be executed before any of them runs.
MINUTES_LOCAL_ROOT = deba.data("minutes_documents")


def synced_to_dropbox(
    df: pd.DataFrame, file_paths: List[str], dry_run: bool = True
) -> pd.DataFrame:
    """Sync files to Dropbox and attach the sync results to ``df``.

    Args:
        df: frame with a ``docid`` column; its rows must line up one-to-one,
            in order, with ``file_paths``.
        file_paths: paths handed to ``sync_local_to_dropbox`` together with
            ``MINUTES_LOCAL_ROOT`` and ``MINUTES_DB_ROOT``.
        dry_run: forwarded to ``sync_local_to_dropbox``. Defaults to ``True``,
            the value that used to be hard-coded here.

    Returns:
        ``df`` with the added columns ``db_path``, ``db_id`` and
        ``db_content_hash``, indexed by ``docid``.
    """
    # Presumably sync_local_to_dropbox returns one (path, id, content hash)
    # record per entry of file_paths, in the same order - TODO confirm.
    sync_results = pd.DataFrame(
        sync_local_to_dropbox(
            MINUTES_LOCAL_ROOT, MINUTES_DB_ROOT, file_paths, dry_run=dry_run
        ),
        columns=["db_path", "db_id", "db_content_hash"],
        # Reuse df's index so the join below pairs rows by position.
        index=df.index,
    )
    return df.join(sync_results).set_index("docid")


def combine_pdfs_txts(pdfs: pd.DataFrame, txts: pd.DataFrame) -> pd.DataFrame:
    """Sync the PDF and text exports and inner-join the two results by docid.

    ``pdfs`` must provide a ``pdfname`` column and ``txts`` a ``txtname``
    column; both are passed through ``synced_to_dropbox``, which indexes its
    result by ``docid``. Only documents present on both sides are kept.
    """
    pdf_side = synced_to_dropbox(pdfs, pdfs["pdfname"].tolist())
    txt_side = synced_to_dropbox(txts, txts["txtname"].tolist())
    # NOTE(review): these are suffixes, so a column shared by both sides comes
    # out as e.g. "db_pathpdf_" / "db_pathtxt_". Confirm these are the names
    # the consumers of this frame expect.
    return pdf_side.join(txt_side, how="inner", lsuffix="pdf_", rsuffix="txt_")

In [None]:
# minutes/export/src/make-output.R


def create_hearings(hrg: pd.DataFrame) -> pd.DataFrame:
    """Build the hearings table: renamed columns, full text, and a title.

    The ``mtg_*`` / ``hrg*`` columns are renamed to their output names,
    ``hrg_head`` is prepended to ``hrg_text`` (newline separated), and a
    ``title`` of the form ``"Appeal hearing: <accused> on <year>-<month>-<day>"``
    is added. A missing accused, or a date with any missing part, is shown as
    ``"(unknown)"``. Rows whose ``hrg_type`` is ``"fire"`` are dropped.

    Args:
        hrg: frame with columns ``docid``, ``hrg_type``, ``mtg_year``,
            ``mtg_month``, ``mtg_day``, ``mtg_dt_source``, ``hrgno``,
            ``hrg_accused``, ``hrg_acc_uid``, ``hrg_head``, ``hrg_text`` and
            ``agency``. It is not modified.

    Returns:
        Frame indexed by ``docid`` (also kept as a column) with the columns
        ``docid``, ``hrg_type``, ``year``, ``month``, ``day``, ``dt_source``,
        ``hrg_no``, ``accused``, ``matched_uid``, ``hrg_text``, ``title`` and
        ``agency``.
    """

    def format_date_part(value) -> str:
        # A numeric column that contains NaN is stored as float; render 2020.0
        # as "2020" rather than "2020.0".
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)

    def create_title(row: pd.Series) -> str:
        accused = "(unknown)" if pd.isna(row["accused"]) else row["accused"]
        date_parts = [row["year"], row["month"], row["day"]]
        if any(pd.isna(part) for part in date_parts):
            date = "(unknown)"
        else:
            date = "-".join(format_date_part(part) for part in date_parts)
        return "Appeal hearing: {hrg_accused} on {date}".format(
            hrg_accused=accused, date=date
        )

    # rename() returns a new frame, so the assignments below leave the
    # caller's ``hrg`` untouched. Renaming first also means the title is built
    # from the same column names that are returned.
    df = hrg.rename(
        columns={
            "mtg_year": "year",
            "mtg_month": "month",
            "mtg_day": "day",
            "mtg_dt_source": "dt_source",
            "hrgno": "hrg_no",
            "hrg_accused": "accused",
            "hrg_acc_uid": "matched_uid",
        }
    )
    df["hrg_text"] = df["hrg_head"].str.cat(df["hrg_text"], sep="\n")
    # axis=1 applies create_title to each row; the default axis=0 would pass columns.
    df["title"] = df.apply(create_title, axis=1, result_type="reduce")
    return df.loc[
        df["hrg_type"] != "fire",
        [
            "docid",
            "hrg_type",
            "year",
            "month",
            "day",
            "dt_source",
            "hrg_no",
            "accused",
            "matched_uid",
            "hrg_text",
            "title",
            "agency",
        ],
    ].set_index("docid", drop=False)


def create_output(ind: pd.DataFrame, hearings: pd.DataFrame) -> pd.DataFrame:
    """Inner-join the document index with the hearings on their indexes.

    Only rows that have a non-null ``matched_uid`` or a non-null ``agency``
    are returned.
    """
    merged = ind.join(hearings, how="inner")
    has_matched_uid = merged["matched_uid"].notna()
    has_agency = merged["agency"].notna()
    return merged.loc[has_matched_uid | has_agency]