In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Load the raw sources and join them into the working frames `df` and `inputs`.
# NOTE(review): every path in this cell is an absolute, machine-specific location;
# the notebook will not run elsewhere until these are made configurable.

path = "/data/raid5/data/ascii/mastered-data/reference-data/data_raw_old/orbis_georgetown/orbis_georgetown.parquet"

df1 = pd.read_parquet(path)  # to be updated/ cleaned with sql

# The first CSV column is used as the index of df2.
df2 = pd.read_csv(
    "/home/zelle/development/projects/ascii/my_dev/georgetown_orbis/ml_data_orbisgt.csv",
    index_col=0,
)  # to be updated

# Inner join on "georgetown_name": only rows whose name occurs in both frames survive,
# and only three columns are taken from df1.
# NOTE(review): if "georgetown_name" is not unique in df1 the join multiplies rows — confirm.
df = pd.merge(
    df2,
    df1[["georgetown_name", "website_address", "bv_d_id_number"]],
    on="georgetown_name",
    how="inner",
)

input_path = "/home/zelle/development/projects/ascii/reference-data/data_raw_direct_source_drop/georgetown/inputs.csv"

# Table of inputs; later cells read its "input_name" and "input_id" columns.
inputs = pd.read_csv(input_path)

### Data Exploration



In [None]:
# Preview the first five rows of the `inputs` table.
inputs.head(5)

In [None]:
# Number of rows per `provided_name` value (value_counts sorts most frequent first).
frequency = df["provided_name"].value_counts()

# Distinct values of the `type` column. Printed explicitly because a bare
# expression that is not the last line of a cell is evaluated but never displayed.
print(df["type"].unique())

# Print every name/count pair so the listing is not truncated by the pandas repr.
for index, value in frequency.items():
    print(f"{index}: {value}")

### Consolidation of classes

We see that the classes are very sparsely populated: 80 classes for only 174 companies. Let's consolidate the classes, first trying a clustering algorithm.

In [None]:
# Raw input names, taken from the `input_name` column as an array.
input_names = inputs["input_name"].values

# Normalised copies for vectorisation: lower-cased, colons removed,
# hyphens turned into spaces.
input_names_cleaned = list(
    map(
        lambda raw: raw.lower().replace(":", "").replace("-", " "),
        input_names,
    )
)
# NOTE(review): the cell displays the raw names, not the cleaned ones — confirm intended.
input_names

In [None]:
# TF-IDF representation of the cleaned input names, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(input_names_cleaned)  # turn input names into vectors

In [None]:
# Number of desired clusters
n_clusters = 8

# n_init is pinned explicitly: its default differs between scikit-learn releases
# (10 in older versions, "auto" in newer ones), so leaving it implicit makes the
# cluster assignment depend on the installed version and raises a FutureWarning
# on some releases. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Store the k-means label of every input in the lower-case "cluster" column.
# NOTE: the name `clusters` is rebound to the manual cluster dict in a later cell.
inputs["cluster"] = clusters

In [None]:
# List the input names that k-means placed in each cluster, one cluster at a time.
for cluster_idx in range(n_clusters):
    members = inputs.loc[inputs["cluster"] == cluster_idx, "input_name"]
    print(f"Cluster {cluster_idx}:")
    print(members.values, "\n")

However, it seems more sensible to do the grouping manually or with the help of GPT-4.
This is the manual clustering produced with GPT-4:

In [None]:
# Manual consolidation of the input names into eight broader clusters.
# Keys are the cluster labels; values are the exact `input_name` strings assigned
# to that cluster. Lookup is by exact string match, so spelling, case and
# punctuation here must match the `inputs` table.
# NOTE: this rebinds `clusters`, which an earlier cell used for the k-means labels.
clusters = {
    "Logic Chip Design & Software": [
        "Logic chip design: Advanced CPUs",
        "Logic chip design: Discrete GPUs",
        "Logic chip design: FPGAs",
        "Logic chip design: AI ASICs",
        "Electronic design automation software",
        "Core intellectual property",
        "Finished logic chip",
    ],
    "Material & Wafer Fabrication": [
        "Crystal growing furnaces",
        "Crystal machining tools",
        "Wafer",
        "Wafer bonding and aligning tools",
        "Wafer handlers",
        "Photomask handlers",
        "Wafer and photomask handling",
        "Ion implanters",
        "Electronic gases",
        "Wet chemicals",
    ],
    "Lithography, Photomasks & Imaging": [
        "Advanced photolithography equipment",
        "EUV scanners",
        "ArF scanners",
        "ArF immersion scanners",
        "Photolithography",
        "Maskless lithography equipment",
        "Electron-beam lithography tools",
        "Laser lithography tools",
        "Photoresists",
        "Resist processing tools",
        "Advanced photomasks",
        "Photomask inspection and repair tools",
    ],
    "Deposition, Layering & Thermal Processing": [
        "Deposition",
        "Deposition tools",
        "Plasma CVD tools",
        "Low-pressure CVD tools",
        "High-temperature CVD tools",
        "Atomic layer deposition tools",
        "Physical vapor deposition tools",
        "Tube-based diffusion and deposition tools",
        "Electrochemical coating tools",
        "Chemical vapor deposition tools",
        "Deposition materials",
        "Rapid thermal processing tools",
    ],
    "Etching, Cleaning & Surface Preparation": [
        "Etch and clean",
        "Dry etching and cleaning tools",
        "Wet etching and cleaning tools",
        "Conductor etching tools",
        "Dielectric etching tools",
        "Etching and cleaning tools",
    ],
    "Planarization, Inspection & Metrology": [
        "Chemical mechanical planarization",
        "Chemical mechanical planarization tools",
        "Process control",
        "Process monitoring equipment",
        "Wafer inspection equipment",
        "Wafer level packaging inspection tools",
        "Film and wafer measuring tools",
        "Critical dimensions measurement tools",
        "Defect inspection tools",
        "Assembly inspection tools",
        "CMP materials",
    ],
    "Assembly, Packaging & Interconnects": [
        "Assembly and packaging",
        "Dicing tools",
        "Bonding tools",
        "Die attaching tools",
        "Wire bonding tools",
        "Advanced interconnect tools",
        "Packaging tools",
        "Integrated assembly tools",
        "Handlers and probes",
        "Lead frames",
        "Bond wires",
        "Ceramic packages",
        "Substrates",
        "Encapsulation resins",
        "Die attach materials",
        "Packaging materials",
    ],
    "Testing & Quality Assurance": [
        "Testing",
        "SoC test equipment",
        "Burn-in test equipment",
        "Linear and discrete testing tools",
        "General-purpose microscopy tools",
    ],
}


# Function to assign cluster based on product name
def assign_cluster(product_name):
    """Return the label of the manual cluster that lists ``product_name``.

    The module-level ``clusters`` mapping is searched in insertion order and the
    first cluster whose product list contains an exact match wins.

    Args:
        product_name: Input name to look up.

    Returns:
        The matching cluster label, or ``"Uncategorized"`` when no list contains
        the name.
    """
    matching_labels = (
        label for label, members in clusters.items() if product_name in members
    )
    # Fallback category, should ideally be empty
    return next(matching_labels, "Uncategorized")


# Label every input with its manual cluster. The result goes into the capitalised
# "Cluster" column, which is distinct from the lower-case "cluster" column that
# holds the k-means labels.
inputs["Cluster"] = inputs["input_name"].apply(assign_cluster)

# Verify the updated clustering: an "Uncategorized" entry in these counts means
# some input_name is missing from the manual mapping.
print(inputs["Cluster"].value_counts())

In [None]:
# check the distribution now:

# inputs[inputs['Cluster']=='Assembly, Packaging & Interconnects']['input_name'].value_counts()

In [None]:
# Show which columns are available on the merged company frame.
df.columns

In [None]:
def add_cluster_information(df, inputs):
    """Return a copy of ``df`` with a ``Cluster`` column looked up by name.

    Each row's ``provided_name`` is matched against ``inputs["input_name"]`` with
    a left join, so every row of ``df`` is kept and rows without a match get a
    missing ``Cluster``. Neither argument is modified.

    Args:
        df: Frame with a ``provided_name`` column.
        inputs: Frame with ``input_name`` and ``Cluster`` columns.

    Returns:
        A new DataFrame with the columns of ``df`` followed by ``Cluster``.
    """
    cluster_lookup = inputs[["input_name", "Cluster"]]
    joined = df.merge(
        cluster_lookup,
        how="left",
        left_on="provided_name",
        right_on="input_name",
    )
    # The join key from `inputs` duplicates `provided_name`, so it is discarded.
    return joined.drop(columns="input_name")


# Apply the function to add 'Cluster' column to 'df' (matching on the input name).
# NOTE: `df_with_clusters` is rebound in a later cell by a merge on the input ids.
df_with_clusters = add_cluster_information(df, inputs)

# Check the first few rows to verify the 'Cluster' column is added
df_with_clusters.head()

# To count how many companies are in each cluster
# cluster_counts = df_with_clusters['Cluster'].value_counts()
# print(cluster_counts)

The problem is that many companies may fall into more than one category.

In [None]:
# Turn the comma-separated 'provided_id' string into one row per id.
# Splitting on the bare comma and stripping afterwards handles "a, b", "a,b" and
# "a ,  b" alike; splitting on ", " would leave "a,b" as a single unsplit id.
df_expanded = df.assign(provided_id=df["provided_id"].str.split(",")).explode(
    "provided_id"
)

# Remove the whitespace that surrounded the separators.
df_expanded["provided_id"] = df_expanded["provided_id"].str.strip()
df_expanded

In [None]:
# Attach the manual cluster to every (company, provided_id) row.
# Assumes 'input_id' in 'inputs' is the key that 'provided_id' in 'df_expanded' refers to;
# the left join keeps ids without a match and leaves their 'Cluster' missing.
cluster_by_input_id = inputs[["input_id", "Cluster"]]
df_with_clusters = df_expanded.merge(
    cluster_by_input_id,
    how="left",
    left_on="provided_id",
    right_on="input_id",
)
df_with_clusters

In [None]:
def _join_non_null(values, sep):
    """Join the non-null entries of a Series into one string (empty if none remain)."""
    return sep.join(values.dropna().astype(str))


# Collapse back to one row per 'provider_id': ids become a ", "-separated string
# and clusters a "; "-separated string.
# Nulls are dropped before joining because the preceding left join leaves
# 'Cluster' missing for unmatched ids, and str.join raises TypeError on NaN.
df_grouped = (
    df_with_clusters.groupby("provider_id")
    .agg(
        {
            "provided_id": lambda ids: _join_non_null(ids, ", "),
            "Cluster": lambda labels: _join_non_null(labels, "; "),
        }
    )
    .reset_index()
)

In [None]:
# Attach the per-provider cluster string to the company frame.
# A "Cluster" column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce Cluster_x / Cluster_y and every later
# df["Cluster"] lookup would fail.
df = df.drop(columns="Cluster", errors="ignore").merge(
    df_grouped[["provider_id", "Cluster"]], how="inner", on="provider_id"
)

In [None]:
# check how many companies have multiple clusters:

# Split the 'Cluster' column on the semicolon, then explode so each cluster has its own row
df_exploded = df.assign(Cluster=df["Cluster"].str.split("; ")).explode("Cluster")

# Group by 'georgetown_name' (the company name, not 'provider_id') and count the
# distinct clusters per company
cluster_counts = df_exploded.groupby("georgetown_name")["Cluster"].nunique()

# Sort the counts in decreasing order
cluster_counts_sorted = cluster_counts.sort_values(ascending=False)
cluster_counts_sorted

### Removing multi cluster companies?
Now the question is whether I should simply remove companies that have more than one (or more than two) clusters. Maybe just run the classification problem first and then decide, depending on how well the models perform.



In [None]:
# take only companies with up to MAX_CLUSTERS_PER_COMPANY clusters
MAX_CLUSTERS_PER_COMPANY = 2
subset = cluster_counts_sorted[cluster_counts_sorted <= MAX_CLUSTERS_PER_COMPANY]

# Both `subset` and `cluster_counts_sorted` hold one entry per company name, so
# their ratio is the share of companies kept. len(df) counts rows instead, which
# only matches if every company has exactly one row.
n_companies = len(cluster_counts_sorted)
share_kept = len(subset) / n_companies if n_companies else 0.0

print("companies remaining:", len(subset))
print(f"percentage kept: {share_kept:.1%}")

In [None]:
# check frequency of each cluster among the companies kept in `subset`

# Filter 'df' to include only firms with at most 2 clusters
# (`subset` is indexed by 'georgetown_name', hence the isin on its index)
df_filtered = df[df["georgetown_name"].isin(subset.index)]

# Split the 'Cluster' column on semicolons, then explode into separate rows
df_clusters_exploded = df_filtered.assign(
    Cluster=df_filtered["Cluster"].str.split("; ")
).explode("Cluster")

# Perform value count on the 'Cluster' column
# (counts rows after the explode, so a company with two clusters is counted once in each)
cluster_value_counts = df_clusters_exploded["Cluster"].value_counts()

# Display the value count frequency for the Cluster column
cluster_value_counts

In [None]:
#### TEST: kick out small classes:

# Reset to the full frame: from here on the "at most 2 clusters" filter from the
# previous cell is no longer applied. The filter further down in this cell also
# starts from `df`, so this assignment is itself overwritten there.
# NOTE(review): confirm that discarding the <=2-cluster filter is intended.
df_filtered = df
# Define the classes you want to drop
classes_to_drop = [
    "Testing & Quality Assurance",
    "Etching, Cleaning & Surface Preparation",
    "Deposition, Layering & Thermal Processing",
]


# Function to check if any of the classes to drop are in the company's cluster
def has_class_to_drop(cluster_string):
    """Tell whether a company's cluster string names any class marked for removal.

    Args:
        cluster_string: Cluster labels joined with ``"; "``.

    Returns:
        True if at least one label is listed in the module-level
        ``classes_to_drop``, otherwise False.
    """
    company_classes = set(cluster_string.split("; "))
    return not company_classes.isdisjoint(classes_to_drop)


# Keep only the companies whose cluster string mentions none of the dropped classes.
drop_mask = df["Cluster"].apply(has_class_to_drop)
df_filtered = df[~drop_mask]

# Re-check the distribution: one row per (company, cluster) pair ...
split_clusters = df_filtered["Cluster"].str.split("; ")
df_clusters_exploded = df_filtered.assign(Cluster=split_clusters).explode("Cluster")

# ... then the number of rows per cluster label.
cluster_value_counts = df_clusters_exploded["Cluster"].value_counts()

# Show the resulting frequencies.
cluster_value_counts

### Unbalanced data?
OK, so now I have this new distribution. It doesn't seem too bad, yet the assembly class still looks quite big; I might want to split it up. The last two classes are also very small.

And/or maybe I can try oversampling techniques for the actual ML task.

Maybe I will just run the model and then adjust these things once I have an immediate feedback loop showing how the model performs under the different fixes.

### Final data preparation steps:
- drop unnecessary columns
- put text into one column
- in the next notebook

In [None]:
# Promote the filtered frame to `df`. This rebinds the name that earlier cells
# read from, so re-running those cells afterwards operates on the filtered data.
df = df_filtered

df

In [None]:
# Persist `df` with IPython's %store magic so another notebook can restore it via `%store -r df`.
%store df