In [1]:
import numpy as np
import pandas as pd

import os
# Environment settings, applied before any pipeline module is imported.
# NOTE(review): presumably read by the src.pipelines code — confirm which module consumes them.
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "PROJECT_ID": "602280418311",
    }
)

In [2]:
# Documents whose project previews are loaded and compared in this notebook.
FILE_NAMES = [
    "210331 233 Randolph 74 Solar I EPC Agreement executed.pdf",
    "EPC - Blue Sky - Felicita Town Center.pdf",
    "EPC - Emerald Garden - Mt Kimble.pdf",
    "EPC Agreement - Bullrock - Lakeville.pdf",
    "EPC Agreement - GLD - Canton.pdf",
    "EPC Agreement - Novel - Bartel.pdf",
    "EPC Agreement - RSP - Mt Hope.pdf",
    "EPC Agreement - Shine - DuQuoin.pdf",
    "EPC Agreement - SunRaise - Enterprise.pdf",
    "EPC Agreement - SunRaise - Pequawket Trail Baldwin.pdf",
]

In [3]:
from src.pipelines.term_extraction.pipeline_config import EPCPipelineConfig
from src.pipelines.term_extraction.utils import get_project_preview

# Load the project preview of every file in FILE_NAMES, keep its
# "Key Items" / "Legal Terms" columns, and tag each row with the source file.
pipeline_config = EPCPipelineConfig(use_gcs_storage=True)

per_file_terms = []
for file_name in FILE_NAMES:
    print(f"File: {file_name}")

    correct_project_preview = get_project_preview(
        pipeline_config.get_project_previews_path(),
        file_name,
    )
    tagged_terms = correct_project_preview[["Key Items", "Legal Terms"]].assign(file_name=file_name)
    per_file_terms.append(tagged_terms)

# Stack the per-file frames; each frame keeps its own index labels.
legal_terms = pd.concat(per_file_terms)

In [4]:
# Write the stacked terms to disk, ordered by key item and then by file name.
(
    legal_terms
    .sort_values(by=["Key Items", "file_name"])
    .to_csv("legal_terms.csv", index=False)
)

In [5]:
# Display the terms ordered by key item, then by file name.
legal_terms.sort_values(by=["Key Items", "file_name"])

In [6]:
# len() of every non-null "Legal Terms" entry: plot the distribution in
# 20 bins, then print the average.
term_lengths = legal_terms["Legal Terms"].dropna().apply(len)
term_lengths.hist(bins=20)
print(term_lengths.mean())

In [7]:
# Occurrences of each "Key Items" value, turned back into a DataFrame.
key_items_counts = (
    legal_terms["Key Items"]
    .value_counts()
    .reset_index()
)

In [8]:
# Disabled alternative: keep only the key item(s) whose count equals the maximum.
# NOTE(review): it expects a "count" column in key_items_counts; the column names
# produced by value_counts().reset_index() depend on the pandas version — confirm
# before re-enabling.
# key_items_counts[key_items_counts["count"] == key_items_counts["count"].max()]
# Display the full counts table.
key_items_counts

In [9]:
# Per key item, count the non-null values of the remaining columns, order the
# result from most to least frequent, and give the count columns readable names.
# NOTE(review): "Number of files" is a row count of file_name; it equals the
# number of distinct files only if a key item appears once per file — confirm.
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(["file_name", "Legal Terms"], ascending=False)
    .rename(columns={"file_name": "Number of files", "Legal Terms": "Number of examples"})
)

# Outer-join with the configured terms and definitions so key items present on
# only one side are kept, then show the least-covered key items first.
(
    legal_terms_with_counts
    .merge(pipeline_config.get_terms_and_definitions(), on="Key Items", how="outer")
    .sort_values("Number of files")
)

In [14]:
# Rebuild the per-key-item counts under the original column names
# ("Legal Terms", "file_name"), most frequent first, and display them.
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(by=["file_name", "Legal Terms"], ascending=False)
)
legal_terms_with_counts

In [15]:
# Per file, count the non-null "Key Items" and "Legal Terms" values; highest counts first.
(
    legal_terms
    .groupby("file_name")
    .count()
    .sort_values(by=["Legal Terms", "Key Items"], ascending=False)
)

In [38]:
# Wide layout of the terms: key items as rows, files as columns, saved as CSV.
# NOTE(review): the pass-through aggfunc presumably expects at most one entry
# per (key item, file) pair — confirm against the previews.
terms_by_file = legal_terms.pivot_table(
    index="Key Items",
    columns="file_name",
    values="Legal Terms",
    aggfunc=lambda entries: entries,
)
terms_by_file.to_csv("legal_terms_pivot.csv")

In [16]:
# Fetch the terms and definitions from the pipeline config and keep them under
# a name of their own; the bare name on the last line displays the result.
terms_and_definitions = pipeline_config.get_terms_and_definitions()
terms_and_definitions

In [49]:
# Per-key-item counts with "Key Items" as a regular column, left-joined with
# the term definitions so every counted key item is kept.
(
    legal_terms
    .groupby("Key Items")
    .count()
    .reset_index()
    .merge(terms_and_definitions, on="Key Items", how="left")
)

In [50]:
# Attach the definitions to the per-key-item counts and save the result.
counts_with_definitions = legal_terms_with_counts.merge(
    terms_and_definitions,
    on="Key Items",
    how="left",
)
counts_with_definitions.to_csv("terms-definitions-counts.csv", index=False)

In [53]:
# Extend the saved counts/definitions sheet with one extra column per example
# document, holding that document's "Legal Terms" for each key item.
terms_and_definitions_counts = pd.read_csv("terms-definitions-counts.csv")

few_shots_examples = [
    "Site Green - Emerald Garden - Cape Fear.pdf",
    "Site Lease - Novel - Bartel (ES).pdf",
    "Site Lease- SunRaise - Plympton.pdf",
]

# The previews location comes from the pipeline config, as in the loading cell;
# the previously used PROJECT_PREVIEWS_PATH is not defined by any cell of this
# notebook and raised NameError on a fresh kernel.
# NOTE(review): confirm this path also holds the previews of the example files.
project_previews_path = pipeline_config.get_project_previews_path()

for file_name in few_shots_examples:
    correct_project_preview = get_project_preview(project_previews_path, file_name)
    # Left merge keeps every key item already in the sheet. When the incoming
    # "Legal Terms" column collides with an existing one, the incoming column
    # gets the per-file suffix and the existing column keeps its name.
    terms_and_definitions_counts = terms_and_definitions_counts.merge(
        correct_project_preview[["Key Items", "Legal Terms"]],
        how="left",
        on="Key Items",
        suffixes=("", f"_example_{file_name}"),
    )

terms_and_definitions_counts.to_csv("terms-definitions-examples.csv", index=False)
# Last expression of the cell, so the final table is displayed.
terms_and_definitions_counts