In [1]:
import numpy as np
import pandas as pd

In [2]:
# Project-preview CSV files whose key items / legal terms are analysed below.
# NOTE(review): several names spell "Agreeement" with three e's. They are kept
# verbatim because they are used as lookup names - presumably they match the
# files on disk; confirm before "correcting" them.
FILE_NAMES = [
    "Blue Sky.Interconnection Agreeement.Carmen.csv",
    "Blue Sky.Interconnection Agreement.Felicita Town Centre.csv",
    "GLD.Interconnection Agreeement.Hi Lo Biddy.csv",
    "GLD.Interconnection Agreeement.Weathersfield.csv",
    "GLD.Interconnection Agreement.Canton.csv",
    "Interconnection Agreement - Emerald Green - Cape Fear.csv",
    "Interconnection Agreement - Emerald Green - Mt Kimble BLDG A.csv",
    "Interconnection Agreement - Emerald Green - Mt Kimble BLDG B.csv",
    "Novel.Interconnection Agreement.Caroline.csv",
    "SunRaise.Interconnection Agreeement.Lake St.csv",
    "SunRaise.Interconnection Agreement.Nutting Ridge.csv",
]

In [3]:
from src.pipelines.term_extraction.pipeline_config import InterconnectionAgreementPipelineConfig
from src.pipelines.term_extraction.utils import get_project_preview

# Build one long frame with the "Key Items" / "Legal Terms" columns of every
# project preview, tagging each row with the file it was read from.
pipeline_config = InterconnectionAgreementPipelineConfig(use_gcs_storage=False)

per_file_legal_terms = []
for file_name in FILE_NAMES:
    print(f"File: {file_name}")
    correct_project_preview = get_project_preview(
        pipeline_config.get_project_previews_path(),
        file_name,
    )
    key_items_and_terms = correct_project_preview[["Key Items", "Legal Terms"]]
    per_file_legal_terms.append(key_items_and_terms.assign(file_name=file_name))

# Row indices of the individual previews are kept as-is (no ignore_index).
legal_terms = pd.concat(per_file_legal_terms)

In [64]:
# Histogram and mean of len() over the non-null "Legal Terms" entries.
legal_term_lengths = legal_terms["Legal Terms"].dropna().apply(len)
legal_term_lengths.hist(bins=20)
print(legal_term_lengths.mean())

In [65]:
# Write all collected rows to disk, ordered by key item and then by source file.
legal_terms_sorted = legal_terms.sort_values(by=["Key Items", "file_name"])
legal_terms_sorted.to_csv("legal_terms.csv", index=False)

In [66]:
# Display the collected rows ordered by key item, then by source file.
legal_terms.sort_values(by=["Key Items", "file_name"])

In [67]:
# How often each key item occurs across all files, as a frame rather than a Series.
key_items_counts = (
    legal_terms["Key Items"]
    .value_counts()
    .reset_index()
)

In [68]:
# Display every key item with its occurrence count.
# Disabled alternative - show only the key item(s) with the highest count:
# key_items_counts[key_items_counts["count"] == key_items_counts["count"].max()]
key_items_counts

In [70]:
# Non-null counts per key item, most frequent first, with the count columns
# given readable names.
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(["file_name", "Legal Terms"], ascending=False)
    .rename(columns={"file_name": "Number of files", "Legal Terms": "Number of examples"})
)

# Outer merge so key items present on only one side still show up.
counts_with_definitions = legal_terms_with_counts.merge(
    pipeline_config.get_terms_and_definitions(),
    on="Key Items",
    how="outer",
)
counts_with_definitions.sort_values(by="Number of files")

In [60]:
# Occurrence counts joined with the configured terms and definitions
# (outer merge), least frequent first.
key_item_counts_with_definitions = key_items_counts.merge(
    pipeline_config.get_terms_and_definitions(),
    how="outer",
    on="Key Items",
)
key_item_counts_with_definitions.sort_values(by="count")

In [30]:
# Non-null counts of every column per key item, largest counts first.
# Note: this reassigns legal_terms_with_counts with the original column names.
rows_by_key_item = legal_terms.groupby("Key Items")
legal_terms_with_counts = rows_by_key_item.count().sort_values(
    by=["file_name", "Legal Terms"],
    ascending=False,
)
legal_terms_with_counts

In [31]:
# Non-null counts per source file, files with the most legal terms first.
per_file_counts = legal_terms.groupby("file_name").count()
per_file_counts.sort_values(by=["Legal Terms", "Key Items"], ascending=False)

In [10]:
# Wide view: one row per key item, one column per file, cells hold the legal term.
# "first" replaces the former identity lambda, which is not a reducer: it only
# worked while every (key item, file) pair had exactly one row and raised
# "Must produce aggregated value" otherwise. With "first", a pair that occurs
# more than once shows its first non-null legal term.
legal_terms.pivot_table(
    index="Key Items",
    columns="file_name",
    values="Legal Terms",
    aggfunc="first",
)

In [11]:
from src.pipeline.utils import get_terms_and_definitions

# Load the terms/definitions table through the pipeline config, as the count
# cells earlier in this notebook do, instead of a hard-coded absolute path
# under one user's home directory (which cannot run on any other machine).
# NOTE(review): this assumes the config resolves to the same
# interconnection-agreement terms-definitions.csv the old path pointed at -
# confirm. The get_terms_and_definitions import above is then unused by this cell.
terms_and_definitions = pipeline_config.get_terms_and_definitions()
terms_and_definitions

In [12]:
# Per-key-item counts with "Key Items" restored as a column, left-joined to
# the terms and definitions.
key_item_counts = legal_terms.groupby("Key Items").count().reset_index()
key_item_counts.merge(terms_and_definitions, on="Key Items", how="left")

In [13]:
# Save the per-key-item counts together with their definitions (left merge).
counts_and_definitions = legal_terms_with_counts.merge(
    terms_and_definitions,
    on="Key Items",
    how="left",
)
counts_and_definitions.to_csv("terms-definitions-counts.csv", index=False)

In [14]:
# Extend the saved counts/definitions table with one extra column per few-shot
# example document, holding that document's legal term for each key item.
terms_and_definitions_counts = pd.read_csv("terms-definitions-counts.csv")

few_shots_examples = [
    "Site Green - Emerald Garden - Cape Fear.pdf",
    "Site Lease - Novel - Bartel (ES).pdf",
    "Site Lease- SunRaise - Plympton.pdf",
]

# NOTE(review): PROJECT_PREVIEWS_PATH is not defined in the visible cells, so
# this cell fails on a fresh kernel unless it is defined elsewhere. The example
# names look like site-lease documents, so the path may differ from
# pipeline_config.get_project_previews_path() - confirm before replacing it.
for file_name in few_shots_examples:
    correct_project_preview = get_project_preview(PROJECT_PREVIEWS_PATH, file_name)

    # Name the example column explicitly. Merge suffixes are only applied when
    # both sides share a column name, so relying on them made the resulting
    # column name depend on whether the CSV still had a "Legal Terms" column.
    example_column = f"Legal Terms_example_{file_name}"
    example_terms = correct_project_preview[["Key Items", "Legal Terms"]].rename(
        columns={"Legal Terms": example_column}
    )

    terms_and_definitions_counts = terms_and_definitions_counts.merge(
        example_terms, how="left", on="Key Items"
    )

terms_and_definitions_counts.to_csv("terms-definitions-examples.csv", index=False)
# Last expression of the cell, so the resulting table is actually displayed.
terms_and_definitions_counts