In [41]:
import numpy as np
import pandas as pd 
import torch
from transformers import pipeline
from sklearn.metrics import accuracy_score

In [42]:
# Prefer the GPU when PyTorch reports one is available; otherwise use the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
# Read the CSV stored at the Kaggle input path into `df`
df = pd.read_csv(filepath_or_buffer="/kaggle/input/preprocessedcompany/mapped_company")

In [46]:
# Draw a random subset of 500 rows; the fixed seed keeps the draw repeatable
sampled_df = df.sample(random_state=42, n=500)

In [47]:
# Keep only the text column and the ground-truth label column.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
sampled_df = sampled_df[['NAME_CLEANED', 'INDUSTRY_GROUP']].copy()
sampled_df.head()

Unnamed: 0,NAME_CLEANED,INDUSTRY_GROUP
15491963,bc decorators,Construction & Real Estate
7238198,bms baustelleneinrichtungen detlef wenzig,Other
13339616,dynamic masonry,Construction & Real Estate
6791797,ingeytec oficina,Construction & Real Estate
15360571,johnson bros,Retail & Consumer Goods


In [48]:
# Distinct ground-truth groups present in the sample, in order of first appearance;
# these are the labels the classifier will choose between
candidate_labels = pd.unique(sampled_df['INDUSTRY_GROUP'])
candidate_labels

array(['Construction & Real Estate', 'Other', 'Retail & Consumer Goods',
       'Technology & IT', 'Media, Entertainment & Arts', 'Energy',
       'Food & Hospitality', 'Legal', 'Education & Training',
       'Government & Non-Profit', 'Resources', 'Business Services',
       'Manufacturing & Engineering', 'Finance',
       'Transportation & Logistics', 'Healthcare & Wellness',
       'Environmental Services & Agriculture'], dtype=object)

In [49]:
# Build the zero-shot classification pipeline on the device selected earlier
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)



In [50]:
# Function to make predictions and calculate accuracy
def predict_and_evaluate(df, classifier, target, text, candidate_labels):
    """Predict a label for every row's text and score the predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``text`` and ``target`` columns. It is left
        untouched; predictions are attached to a new frame.
    classifier : callable
        Invoked once per row as ``classifier(value, candidate_labels=...)``.
        Its result must support ``result['labels'][0]``, which is taken as
        the predicted label.
    target : str
        Name of the column holding the ground-truth labels.
    text : str
        Name of the column holding the text to classify.
    candidate_labels : str or iterable of str
        Labels offered to the classifier. A string is forwarded unchanged;
        any other iterable (list, tuple, NumPy array, Series) is converted
        to a plain list first.

    Returns
    -------
    tuple
        ``(scored_df, accuracy)`` where ``scored_df`` is a copy of ``df``
        with an added ``'PREDICTED_GROUP'`` column and ``accuracy`` is the
        value returned by ``accuracy_score`` for ``target`` versus
        ``'PREDICTED_GROUP'``.
    """
    # Hand the classifier a plain list regardless of the container the caller
    # used; a string is passed through as-is rather than split into characters.
    if isinstance(candidate_labels, str):
        labels = candidate_labels
    else:
        labels = list(candidate_labels)

    # First entry of 'labels' is the classifier's top-ranked label
    predictions = [
        classifier(name, candidate_labels=labels)['labels'][0]
        for name in df[text]
    ]

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # (and no SettingWithCopyWarning is raised when `df` is a slice).
    scored_df = df.assign(PREDICTED_GROUP=predictions)

    # Calculate accuracy
    accuracy = accuracy_score(scored_df[target], scored_df['PREDICTED_GROUP'])
    return scored_df, accuracy


In [51]:
# Classify the sampled company names and score them against the true groups.
# NOTE: this rebinds `df`, which until now referred to the full dataset loaded
# from CSV; after this cell it refers to the frame returned by the function.
df, accuracy = predict_and_evaluate(
    sampled_df,
    classifier,
    target='INDUSTRY_GROUP',
    text='NAME_CLEANED',
    candidate_labels=candidate_labels,
)

In [56]:
# Display the first 30 rows of the result frame
df.iloc[:30]

Unnamed: 0,NAME_CLEANED,INDUSTRY_GROUP,PREDICTED_GROUP
15491963,bc decorators,Construction & Real Estate,Business Services
7238198,bms baustelleneinrichtungen detlef wenzig,Other,Business Services
13339616,dynamic masonry,Construction & Real Estate,Legal
6791797,ingeytec oficina,Construction & Real Estate,Energy
15360571,johnson bros,Retail & Consumer Goods,Legal
6322572,itechx,Technology & IT,Technology & IT
2401577,droneascent,"Media, Entertainment & Arts",Technology & IT
7005558,smart drilling solutions,Energy,Resources
14114035,rei das bicas,Construction & Real Estate,Resources
15500313,white subways,Food & Hospitality,Other


In [55]:
# Report the accuracy as a percentage with three decimal places
print(f"\nOverall Accuracy: {accuracy:.3%}")


Overall Accuracy: 24.200%
