# Prepare the dataframe for LLM task

In [None]:
import duckdb
import pandas as pd

from bs4 import BeautifulSoup


# Toggle between the developer's local DuckDB file and the production one.
# dev_mode = True
dev_mode = False
if dev_mode:
    # DEV (user specific): absolute path on one developer's machine
    database = "/home/heiler/development/projects/ascii/research-space/src/pipelines/ascii/ascii_dbt/ascii_pipeline.duckdb"
    prefix = "ascii_dev"
else:
    # prod
    database = "/data/raid5/data/ascii/mastered-data/ascii_pipeline.duckdb"
    prefix = "ascii"
# NOTE(review): `prefix` is assigned here but not referenced in the visible
# cells; the SQL below hard-codes the `ascii.` / `ascii_ref_clean.` schemas, so
# switching dev_mode only changes which database file is opened -- confirm
# whether the queries were meant to use `prefix`.

# Open the selected database read-only; every query in this notebook goes
# through this single connection.
con = duckdb.connect(
    database=database,
    read_only=True,
)

In [None]:
%store -r df_extr_text

In [None]:
from langdetect import detect, LangDetectException


def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Args:
        text: The text to classify. Values that are not strings (for example
            ``None`` or a float NaN from a missing cell) are accepted and
            treated as undetectable.

    Returns:
        The value returned by ``langdetect.detect``, or the placeholder
        ``"unknown"`` when ``text`` is not a non-blank string or when
        detection fails.
    """
    # Missing or blank cells carry no signal. Short-circuit so that a single
    # non-string value cannot abort a whole ``Series.apply`` with an exception
    # that the handler below does not catch.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        # NOTE(review): per the langdetect README, results for short or
        # ambiguous texts can vary between runs unless
        # ``DetectorFactory.seed`` is set -- confirm whether that matters here.
        return detect(text)
    except LangDetectException:
        # Return a placeholder if language detection fails
        return "unknown"


# Apply the language detection function to the 'extr_text' column; the result
# is stored in place as a new 'language' column on df_extr_text
df_extr_text["language"] = df_extr_text["extr_text"].apply(detect_language)

# Create a count table for the languages (includes the "unknown" placeholder
# that detect_language returns when detection fails)
language_counts = df_extr_text["language"].value_counts()

# Display the language count table (last expression of the cell)
language_counts

In [None]:
query = """
    SELECT * 
    FROM READ_CSV('/data/raid5/data/ascii/mastered-data/reference-data/data_raw_direct_source_drop/joshua/georgetown/provision.csv', HEADER=TRUE);
    """
provision = con.execute(query).fetchdf()

In [None]:
import rich

rich.print(provision.iloc[6:11, 0:4].to_latex(index=False))

In [None]:
provision

In [None]:
query = """
    SELECT input_id, input_name, type, stage_name, stage_id, description
    FROM READ_CSV('/data/raid5/data/ascii/mastered-data/reference-data/data_raw_direct_source_drop/joshua/georgetown/inputs.csv', HEADER=TRUE);
    """
input_desc = con.execute(query).fetchdf()

In [None]:
input_desc.tail(30)

In [None]:
print(input_desc.iloc[3:8, :5].to_latex(index=False))

In [None]:
query = """
SELECT
    acsr.ascii_id_company,
    csv.provider_id,
    csv.provider_name,
    csv.provided_name,
    csv.provided_id
    
FROM
    READ_CSV('/data/raid5/data/ascii/mastered-data/reference-data/data_raw_direct_source_drop/joshua/georgetown/provision.csv', HEADER=TRUE) AS csv
JOIN
    ascii.company_source_rel AS acsr
ON
    csv.provider_id = acsr.id_number
WHERE
    acsr.id_number_type = 'georgetown_id';
"""

provision = con.execute(query).fetchdf()

In [None]:
provision["ascii_id_company"].nunique()

In [None]:
provision

In [None]:
provision[provision["provider_name"] == "TSMC"]  # example

In [None]:
362 / 232

In [None]:
len(provision)

In [None]:
len(provision) / 79

In [None]:
provision["provided_name"].value_counts().mean()

## Data selection

The inputs come in three types: material_resource, tool_resource and process. The process type is further split into 3 stages, namely Design, Fabrication, and Assembly, Testing, Packaging (ATP).

I just want to predict these stages.

And why did I do this? Because, for instance, TSMC would otherwise just have ATP, Fabrication and photomask, and then just photomask, and who would classify TSMC as photomask?

So now assign the class for these 5 categories (the 2 resource types plus the 3 process stages).

In [None]:
prov_in = pd.merge(
    provision, input_desc, how="left", left_on="provided_id", right_on="input_id"
)

In [None]:
def assign_class(row):
    """Choose the class label for one merged provision/input row.

    The label is the first non-missing value among ``stage_name`` and
    ``type``, in that order; if both are missing, ``provided_name`` is used.

    Args:
        row: A mapping-like row (e.g. a pandas Series) exposing the keys
            ``stage_name``, ``type`` and ``provided_name``.

    Returns:
        The selected value, returned unchanged.
    """
    # Walk the preferred columns in priority order and stop at the first hit
    for column in ("stage_name", "type"):
        candidate = row[column]
        if pd.notna(candidate):
            return candidate
    # Neither a stage nor a type is known: fall back to the raw provided name
    return row["provided_name"]


prov_in["class"] = prov_in.apply(assign_class, axis=1)

In [None]:
print(prov_in["class"].value_counts().to_latex(index=True))

In [None]:
prov_in = prov_in[
    ["ascii_id_company", "provider_name", "provided_name", "provided_id", "class"]
]

In [None]:
prov_in.head(5)

In [None]:
# Collapse to one row per company, collecting the distinct provided names, ids
# and classes of that company into lists.
# ``Series.unique`` returns values in order of first appearance, so the lists
# are reproducible for a given ``prov_in``. The previous ``list(set(x))``
# could reorder string entries from one interpreter session to the next
# (str hashing is randomized per process), which made the pickled data and the
# LaTeX tables differ between runs.
provision_transformed = (
    prov_in.groupby(["ascii_id_company", "provider_name"])
    .agg(
        {
            "provided_name": lambda x: x.unique().tolist(),
            "provided_id": lambda x: x.unique().tolist(),
            "class": lambda x: x.unique().tolist(),
        }
    )
    .reset_index()
)

In [None]:
# check new distribution

# Flatten the list of classes from all rows into a single list
all_classes = provision_transformed["class"].explode()

# Use value_counts() on the flattened list to get the distribution
class_distribution = all_classes.value_counts()

# Display the class distribution
print(class_distribution)

In [None]:
print(class_distribution.to_latex())

## Merge Georgetown with Orbis

In [None]:
provision_transformed.head(2)

In [None]:
print(provision_transformed.iloc[:10][["provider_name", "class"]].to_latex(index=False))

In [None]:
con.query("")

In [None]:
con.query("pragma table_info(ascii_ref_clean.orbis_company_trade_description)")

What looks promising here is products_services and trade_description_english.

In [None]:
con.query("pragma table_info(ascii_ref_clean.orbis_company_overview)")

OK, what looks promising here is main_products_and_services, main_activity, full_overview and primary_business_line.

In [None]:
# get these columns for our companies

query = """
SELECT 
    acsr.ascii_id_company,
    ov.main_products_and_services, 
    ov.main_activity, 
    ov.full_overview, 
    ov.primary_business_line
FROM 
    ascii.company_source_rel AS acsr
LEFT JOIN
    ascii_ref_clean.orbis_company_overview ov
ON
    acsr.ascii_id_company = ov.ascii_id_company
WHERE
    acsr.id_number_type = 'georgetown_id';
"""
overview = con.execute(query).fetchdf()

In [None]:
overview

In [None]:
# get these columns for our companies

query = """
SELECT 
    acsr.ascii_id_company,
    trade.products_services, 
    trade.trade_description_english
FROM 
    ascii.company_source_rel AS acsr
LEFT JOIN
    ascii_ref_clean.orbis_company_trade_description trade
ON
    acsr.ascii_id_company = trade.ascii_id_company
WHERE
    acsr.id_number_type = 'georgetown_id';
"""
trade = con.execute(query).fetchdf()

In [None]:
trade

In [None]:
orbis = pd.merge(trade, overview, on="ascii_id_company")

Now there are quite a few companies that don't have any description in Orbis. Drop those.

In [None]:
orbis = orbis[
    [
        "ascii_id_company",
        "main_products_and_services",
        "full_overview",
        "primary_business_line",
        "trade_description_english",
    ]
]

In [None]:
# Keep rows with at least 1 non-NA value in columns other than 'ascii_id_company',
# i.e. drop only the companies that have no Orbis description at all.
# (With `subset` restricted to the description columns, the former `thresh=2`
# demanded two filled descriptions and also dropped companies that had exactly one.)
orbis = orbis.dropna(how="all", subset=orbis.columns.difference(["ascii_id_company"]))

In [None]:
len(orbis)

In [None]:
# now merge orbis with georgetown
gt_orb = pd.merge(provision_transformed, orbis, on="ascii_id_company")

In [None]:
gt_orb.head(3)

In [None]:
# put all description into one string: each row's four Orbis fields are
# rendered as quoted 'field_name':'value' pairs joined by ", "
# NOTE: str.format inserts the string form of whatever the cell holds, so a
# missing field is written out as text (e.g. 'None' or 'nan') rather than
# being skipped, and it counts towards any length computed from this string.
gt_orb["orbis_description"] = gt_orb.apply(
    lambda row: "'main_products_and_services':'{}', 'full_overview':'{}', 'primary_business_line':'{}', 'trade_description_english':'{}'".format(
        row["main_products_and_services"],
        row["full_overview"],
        row["primary_business_line"],
        row["trade_description_english"],
    ),
    axis=1,
)

# Drop the original columns; their content now lives in 'orbis_description'
gt_orb = gt_orb.drop(
    columns=[
        "main_products_and_services",
        "full_overview",
        "primary_business_line",
        "trade_description_english",
    ]
)

In [None]:
gt_orb.iloc[1, 5]

In [None]:
# Record how many characters each combined description has
gt_orb["description_length"] = gt_orb["orbis_description"].str.len()

# Order the rows from longest to shortest description and renumber them.
# reset_index() is called without drop=True, so the pre-sort index is kept
# as an extra column named "index".
gt_orb = gt_orb.sort_values("description_length", ascending=False).reset_index()

In [None]:
gt_orb = gt_orb.rename({"orbis_description": "text"}, axis=1)

In [None]:
print(gt_orb.iloc[39]["text"])

In [None]:
gt_orb["description_length"].max()

In [None]:
gt_orb[["provider_name", "text", "class"]].iloc[5:10].to_latex()

In [None]:
# truncate for latex


# Define a function to truncate text
def truncate_text(text, max_length=40):
    """Shorten ``text`` to at most ``max_length`` characters.

    Args:
        text: The string to shorten.
        max_length: Maximum length of the returned string (default 40).

    Returns:
        ``text`` unchanged when it already fits. Otherwise the leading
        characters followed by ``"..."`` so that the total length equals
        ``max_length``. When ``max_length`` is smaller than the ellipsis
        itself, the text is simply cut to ``max_length`` characters
        (an empty string for values of 0 or below).
    """
    ellipsis = "..."
    if len(text) <= max_length:
        return text
    if max_length < len(ellipsis):
        # No room for the ellipsis: a negative slice bound would count from the
        # end of the string and return more characters than allowed.
        return text[: max(max_length, 0)]
    return text[: max_length - len(ellipsis)] + ellipsis


# Cap every string cell at 40 characters; non-string cells pass through unchanged
gt_orb_truncated = gt_orb.applymap(
    lambda cell: cell if not isinstance(cell, str) else truncate_text(cell, 40)
)

# Render rows 5-9 of the name/text/class columns as a LaTeX table (no index column)
latex_table = gt_orb_truncated.iloc[5:10][["provider_name", "text", "class"]].to_latex(
    index=False
)

print(latex_table)

In [None]:
# store as file for .py
import pickle

with open(
    "/home/zelle/development/projects/ascii/reference-data/data_raw_direct_source_drop/joshua/llm_data/gt_orb.pickle",
    "wb",
) as f:
    pickle.dump(gt_orb, f)

The description lengths are quite reasonable.

### Question

Now we could use this directly or use only the companies found also in commoncrawl...

## Merge Georgetown with CC

In [None]:
# merge with CC data
df_cc = pd.merge(df_extr_text, provision_transformed, on="ascii_id_company")

In [None]:
df_cc.head(10)

In [None]:
len(df_extr_text)

In [None]:
%store df_cc
%store input_desc

### Check how much gpt-4 would cost for CC

In [None]:
%store -r df_cc

In [None]:
import pandas as pd

df_cc["extr_text"].str.len().sum()

In [None]:
100000 * 0.01

In [None]:
# rename the extr text to text for standardization
df_cc.rename({"extr_text": "text"}, axis=1, inplace=True)

In [None]:
# store as file for .py
import pickle

with open(
    "/home/zelle/development/projects/ascii/reference-data/data_raw_direct_source_drop/joshua/llm_data/df_cc.pickle",
    "wb",
) as f:
    pickle.dump(df_cc, f)

with open(
    "/home/zelle/development/projects/ascii/reference-data/data_raw_direct_source_drop/joshua/llm_data/input_desc.pickle",
    "wb",
) as f:
    pickle.dump(input_desc, f)

In [None]:
print(df_cc.iloc[35, 1])