In [1]:
import numpy as np
import pandas as pd

In [2]:
# CSV file names of the site-lease project previews analysed in this notebook.
# Each entry is handed to get_project_preview() by the loading cell further down.
# NOTE(review): spacing/wording is irregular (e.g. "Site Lease- ...", "Site Green - ...");
# presumably these match the stored files exactly — confirm before normalising any of them.
FILE_NAMES = [
    "Carmen.Site Lease.Blue Sky.csv",
    "Caroline_SiteLease_Updated Ex 2_2018.12.31.csv",
    "Site Green - Emerald Garden - Cape Fear.csv",
    "Site Lease - Blue Sky- Felicita Town Center.csv",
    "Site Lease - Bullrock - Lakeville.csv",
    "Site Lease - Emerald Garden - Marshfield Mass.csv",
    "Site Lease - Emerald Garden - Mt Kimble.csv",
    "Site Lease - GLD - Canton (ES).csv",
    "Site Lease - GLD - Hi Lo Biddy.csv",
    "Site Lease - Neighborhood Power - Mt Hope.csv",
    "Site Lease - Novel - Bartel (ES).csv",
    "Site Lease - Novel - Shelly.csv",
    "Site Lease - NPC - Williams Acres.csv",
    "Site Lease - Shine - DuQuoin.csv",
    "Site Lease - Shine - John A Logan (ES).csv",
    "Site Lease - SunRaise - Enterprise Ave. Gardiner.csv",
    "Site Lease - SunRaise - Happy Hollow (ES).csv",
    "Site Lease - SunRaise - Pequawket.csv",
    "Site Lease- SunRaise - Nutting Ridge.csv",
    "Site Lease- SunRaise - Plympton.csv",
]

In [4]:
from src.pipelines.term_extraction.pipeline_config import SiteLeasePipelineConfig
from src.pipelines.term_extraction.utils import get_project_preview

# Load the reference project preview of every lease file and stack the
# "Key Items" / "Legal Terms" columns into one long frame, tagging each row
# with the file it came from.
pipeline_config = SiteLeasePipelineConfig(use_gcs_storage=False)

preview_frames = []
for file_name in FILE_NAMES:
    print(f"File: {file_name}")

    previews_path = pipeline_config.get_project_previews_path()
    preview = get_project_preview(previews_path, file_name)

    term_columns = preview[["Key Items", "Legal Terms"]]
    preview_frames.append(term_columns.assign(file_name=file_name))

# Original per-file row indices are kept as-is (no ignore_index).
legal_terms = pd.concat(preview_frames)

In [85]:
# Browse every extracted row ordered by key item, then by source file.
legal_terms.sort_values(by=["Key Items", "file_name"])

In [86]:
# Distribution of len() over the non-missing "Legal Terms" entries, followed by the mean.
legal_term_lengths = legal_terms["Legal Terms"].dropna().apply(len)
legal_term_lengths.hist(bins=20)
print(legal_term_lengths.mean())

In [87]:
# Number of rows per key item, turned from a Series into a regular frame.
key_item_frequencies = legal_terms["Key Items"].value_counts()
key_items_counts = key_item_frequencies.reset_index()

In [88]:
# Show the full key-item frequency table.
# Disabled alternative (next line): keep only the key item(s) with the highest frequency.
# NOTE(review): it refers to a "count" column — presumably the name that
# value_counts().reset_index() yields on the installed pandas; confirm before re-enabling.
# key_items_counts[key_items_counts["count"] == key_items_counts["count"].max()]
key_items_counts

In [94]:
# Per key item: number of rows it appears in ("Number of files" — presumably one row per
# file; confirm) and number of non-null legal terms ("Number of examples"), then joined
# with the configured term definitions.
counts_per_key_item = legal_terms.groupby("Key Items").count()
counts_per_key_item = counts_per_key_item.sort_values(by=["file_name", "Legal Terms"], ascending=False)
legal_terms_with_counts = counts_per_key_item.rename(
    columns={"file_name": "Number of files", "Legal Terms": "Number of examples"}
)

# how="outer" keeps key items that exist on only one side of the join.
(
    legal_terms_with_counts
    .merge(pipeline_config.get_terms_and_definitions(), on="Key Items", how="outer")
    .sort_values(by="Number of files")
)

In [82]:
# NOTE(review): this recomputes the per-key-item counts WITHOUT the
# "Number of files" / "Number of examples" renaming applied in the preceding cell, so on a
# top-to-bottom run it replaces that renamed frame. Confirm which version later cells expect.
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(by=["file_name", "Legal Terms"], ascending=False)
)
legal_terms_with_counts

In [37]:
# Per source file: non-null counts of key items and legal terms, highest counts first.
(
    legal_terms
    .groupby("file_name")
    .count()
    .sort_values(by=["Legal Terms", "Key Items"], ascending=False)
)

In [38]:
# Export a key-item x file matrix holding the legal-term text of each pair.
# The identity aggfunc passes each cell's value through instead of aggregating it.
# NOTE(review): this presumably relies on every (Key Items, file_name) pair occurring
# at most once — confirm against the previews.
legal_terms_pivot = legal_terms.pivot_table(
    index="Key Items",
    columns="file_name",
    values="Legal Terms",
    aggfunc=lambda cell_values: cell_values,
)
legal_terms_pivot.to_csv("legal_terms_pivot.csv")

In [48]:
# Load the term definitions and display them.
# NOTE(review): this imports from `src.pipeline.utils`, while the other project imports in
# this notebook come from `src.pipelines.term_extraction` and another cell obtains the
# definitions via `pipeline_config.get_terms_and_definitions()` — confirm this module path
# is still valid and returns the same table.
from src.pipeline.utils import get_terms_and_definitions

terms_and_definitions = get_terms_and_definitions()
terms_and_definitions

In [49]:
# Attach definitions to the per-key-item counts; key items without a definition are kept.
key_item_counts = legal_terms.groupby("Key Items").count().reset_index()
pd.merge(key_item_counts, terms_and_definitions, how="left", on="Key Items")

In [50]:
# Save the per-key-item counts together with their definitions; the few-shot cell
# reads this file back.
counts_with_definitions = legal_terms_with_counts.merge(
    terms_and_definitions, how="left", on="Key Items"
)
counts_with_definitions.to_csv("terms-definitions-counts.csv", index=False)

In [53]:
# Build the few-shot reference table: the saved counts/definitions per key item plus one
# column of example legal terms for each hand-picked lease, then save and display it.
terms_and_definitions_counts = pd.read_csv("terms-definitions-counts.csv")

# NOTE(review): these names end in ".pdf" whereas FILE_NAMES uses ".csv" —
# presumably get_project_preview accepts both; confirm.
few_shots_examples = [
    "Site Green - Emerald Garden - Cape Fear.pdf",
    "Site Lease - Novel - Bartel (ES).pdf",
    "Site Lease- SunRaise - Plympton.pdf",
]

for file_name in few_shots_examples:
    # Resolve the previews location from pipeline_config, as the loading cell does
    # (no PROJECT_PREVIEWS_PATH constant is defined in this notebook).
    correct_project_preview = get_project_preview(pipeline_config.get_project_previews_path(), file_name)

    # Name the example column explicitly so every file gets its own
    # "Legal Terms_example_<file>" column, regardless of whether the counts table already
    # carries a "Legal Terms" column for merge() suffixes to disambiguate against.
    example_terms = correct_project_preview[["Key Items", "Legal Terms"]].rename(
        columns={"Legal Terms": f"Legal Terms_example_{file_name}"}
    )
    terms_and_definitions_counts = terms_and_definitions_counts.merge(
        example_terms, how="left", on="Key Items"
    )

terms_and_definitions_counts.to_csv("terms-definitions-examples.csv", index=False)

# Keep the frame as the cell's last expression so it is actually rendered.
terms_and_definitions_counts