In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Data preparation for classification task

In [None]:
# Restore the variable `df` from IPython's %store database. It must have been
# stored there beforehand (presumably by an earlier notebook — TODO confirm).
%store -r df

In [None]:
# List the available columns before choosing which ones to keep.
df.columns

### column selection

We keep only `georgetown_name` as the identifier, rename `Cluster` to `consolidated_input` (the variable to predict), and merge the text columns into a single column called `firm_descript`, which we will use to predict the input. These are the columns we found most appropriate. The NACE code description, for instance, would probably be identical for many firms and would therefore add no variation to the text.

In [None]:
# Free-text columns that are merged into the single 'firm_descript' feature.
# Defined once so the selection and the concatenation cannot drift apart.
text_columns = [
    "description_and_history",
    "full_overview",
    "trade_description_english",
    "products_services",
    "main_products_and_services",
]

# Keep the identifier, the target and the text columns; the target column
# 'Cluster' is renamed to 'consolidated_input'.
df_selected = df[["georgetown_name", "Cluster", *text_columns]].rename(
    columns={"Cluster": "consolidated_input"}
)

# Concatenate the text columns row by row into 'firm_descript'. Missing values
# are skipped, and the remaining values are cast to str so that a stray
# non-string entry (e.g. a number) cannot make " ".join raise a TypeError.
df_selected["firm_descript"] = df_selected[text_columns].apply(
    lambda row: " ".join(row.dropna().astype(str)), axis=1
)

# From here on only the identifier, the target and the merged text are needed.
# NOTE: this rebinds `df`, so the cell cannot be re-run without restoring `df` first.
df = df_selected[["georgetown_name", "consolidated_input", "firm_descript"]]

In [None]:
# Preview the first rows of the reduced frame.
df.head()

Now prepare the consolidated input for classification: every class becomes its own column holding a binary value that indicates whether or not a company belongs to that class.

In [None]:
# Work on an explicit copy so that the column assignment below cannot trigger
# a SettingWithCopyWarning.
df = df.copy()


def _split_labels(raw):
    """Turn one ';'-separated class string into a list of clean class names.

    Surrounding whitespace is stripped and empty pieces are dropped, so
    "A; B", "A;B" and "A ; B;" all give ["A", "B"]. Anything that is not a
    string (e.g. a missing value) gives an empty list, i.e. no class, instead
    of raising an AttributeError.
    """
    if not isinstance(raw, str):
        return []
    return [label.strip() for label in raw.split(";") if label.strip()]


# Replace every class string in 'consolidated_input' by its list of labels
df["consolidated_input"] = df["consolidated_input"].apply(_split_labels)

# Encode the label lists as one binary indicator column per class
mlb = MultiLabelBinarizer()
df_labels = pd.DataFrame(
    mlb.fit_transform(df["consolidated_input"]), columns=mlb.classes_, index=df.index
)

# Swap the list column for the binary label columns. Rows stay aligned because
# df_labels was built on df's own index.
df = pd.concat([df.drop(columns="consolidated_input"), df_labels], axis=1)

In [None]:
# Display the frame, which now carries one binary column per class.
df

### Text preprocessing

We use the Natural Language Toolkit (NLTK) library to prepare the text.


In [None]:
# Download the NLTK resources used below: the tokenizer models, the stopword
# list and the WordNet data for the lemmatizer. Recent NLTK releases load the
# tokenizer from "punkt_tab" instead of "punkt", so both are requested to keep
# word_tokenize working across versions.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# English stopwords as a set for fast membership tests, and a shared lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one description for vectorization.

    The text is lowercased and tokenized; tokens that are not purely
    alphabetic or that are in `stop_words` are discarded, and the remaining
    tokens are lemmatized. Returns the lemmas joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers and stopwords
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


# Run every raw description through preprocess_text and keep the result in a
# new column next to the original text.
df["firm_descript_processed"] = df["firm_descript"].map(preprocess_text)

In [None]:
# df.columns # reorder columns
# df = df[['georgetown_name', 'firm_descript', 'firm_descript_processed',
#        'Assembly, Packaging & Interconnects',
#        'Deposition, Layering & Thermal Processing',
#        'Etching, Cleaning & Surface Preparation',
#        'Lithography, Photomasks & Imaging', 'Logic Chip Design & Software',
#        'Material & Wafer Fabrication', 'Planarization, Inspection & Metrology',
#        'Testing & Quality Assurance']]

In [None]:
# Show the first row before and after preprocessing. `.iloc[0]` selects by
# position, so this works even when the index has no label 0.
print("unprocessed: ", df["firm_descript"].iloc[0])
print("processed: ", df["firm_descript_processed"].iloc[0])

### Vectorization
Now turn the prepared text into numbers that a mathematical model can work with. We first try a simple TF-IDF vectorization.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the preprocessed descriptions and encode each
# description as one row of the feature matrix X_tfidf.
# NOTE(review): the vectorizer is fitted on all rows, before the train/test
# split performed further down, so its statistics also see the held-out
# descriptions; consider fitting on the training texts only.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df["firm_descript_processed"])

## Actual Machine Learning Task

Now that the data is prepared we can test the first models.

In [None]:
from sklearn.model_selection import train_test_split

# Every column that is neither the identifier nor a text column is a label
label_columns = df.columns.difference(
    ["georgetown_name", "firm_descript", "firm_descript_processed"]
)
Y = df.loc[:, label_columns]

# Hold out 20% of the companies for evaluation; the fixed seed keeps the
# partition reproducible.
split_parts = train_test_split(X_tfidf, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

### Logistic Regression
Let's try one of the simplest models: logistic regression. Unfortunately, it doesn't seem to work at all, since it predicts 0 companies correctly.
We used logistic regression wrapped in a multi-output classifier.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier

# Wrap logistic regression in MultiOutputClassifier for multi-label classification
model = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=-1)
model.fit(X_train, Y_train)

# Predicted label matrix for the held-out companies
Y_pred = model.predict(X_test)

# Evaluation. `target_names` labels each report row with its class name instead
# of a bare column index. `zero_division=0` keeps the default scores for labels
# without predictions but silences the undefined-metric warnings.
print(
    classification_report(
        Y_test,
        Y_pred,
        target_names=[str(name) for name in Y_test.columns],
        zero_division=0,
    )
)

### Problem
Some classes are so sparsely populated that the model doesn't work properly. As a first step we try simply dropping the classes with few examples, and then look for better approaches.



### Naive Bayes

Since this didn't work at all, let's try the Naive Bayes model, wrapped so that it can also do multi-label prediction.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Split the dataset. Same arguments and seed as the earlier split, so this
# reproduces the same partition and keeps the cell runnable on its own.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tfidf, Y, test_size=0.2, random_state=42
)

# MultinomialNB needs non-negative features (TF-IDF should be fine).
# Wrap it in MultiOutputClassifier for multi-label classification.
model_nb = MultiOutputClassifier(MultinomialNB(), n_jobs=-1)
model_nb.fit(X_train, Y_train)

# Predicted label matrix for the held-out companies
Y_pred_nb = model_nb.predict(X_test)

# Evaluation. `zero_division=0` scores a label that is never predicted as 0
# instead of a flattering 1.0, which also matches the default used for the
# logistic-regression report. `target_names` labels the rows with class names.
print(
    classification_report(
        Y_test,
        Y_pred_nb,
        target_names=[str(name) for name in Y_test.columns],
        zero_division=0,
    )
)

### Result

It looks like the model is very conservative in its predictions. Let's look at the actual probabilities to get more information; maybe we just have to tweak the threshold above which a company is classified as positive.

In [None]:
# Get the predicted probabilities for the held-out companies. The result is
# iterated label by label further down, each entry being a 2-D array of class
# probabilities for one label.
Y_pred_probs = model_nb.predict_proba(X_test)
# Display the raw probabilities to judge what a sensible threshold would be
Y_pred_probs

In [None]:
# Now let's try a different threshold, chosen by looking at the predicted
# probabilities. Since they are quite low, we choose a low threshold of 0.2.

import numpy as np

# Probability at or above which a company is assigned to a class
threshold = 0.2

# One column of 0/1 predictions per label, same shape as the true label matrix
Y_pred_custom = np.zeros(Y_test.shape, dtype=int)

# Each entry of Y_pred_probs belongs to the classifier fitted for one label
for i, (estimator, probs) in enumerate(zip(model_nb.estimators_, Y_pred_probs)):
    # Find the positive-class column through classes_ rather than assuming it
    # is column 1: a label with a single class in the training split has only
    # one probability column, and probs[:, 1] would raise an IndexError.
    positive_columns = np.flatnonzero(estimator.classes_ == 1)
    if positive_columns.size == 0:
        # The label was never positive in training: keep the all-zero column
        continue
    Y_pred_custom[:, i] = (probs[:, positive_columns[0]] >= threshold).astype(int)

# Evaluation. `zero_division=0` scores a label that is never predicted as 0
# instead of a flattering 1.0; `target_names` labels the rows with class names.
print(
    classification_report(
        Y_test,
        Y_pred_custom,
        target_names=[str(name) for name in Y_test.columns],
        zero_division=0,
    )
)

### Outcome of Naive Bayes

OK, now we at least have some results. But the problem remains that there are so few data points that each class appears only on the order of 10 times in the test dataset.

This seems to confirm that we cannot follow this approach of training a new model from scratch — at least not with a classical machine learning model.

What other options are there?
- try an LLM via an API
- try fine-tuning an LLM
- just use an embedding model and then calculate the similarity between documents, that is, between the company descriptions and the `consolidated_input` descriptions

In [None]:
# Hand the prepared data over to a new notebook: bind the frame to the name
# `df1` (same object as `df`, not a copy) and persist it with IPython's %store.
df1 = df
%store df1