In [None]:
from openai import OpenAI
import os
import pandas as pd
import math
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json

In [None]:
# Fetch the NLTK data used by the cleaning cells: WordNet for the lemmatizer,
# the English stopword list, and the Punkt models behind word_tokenize.
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are requested to keep the notebook working across versions.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Product Cleaning

In [None]:
# Load the pickled product table. Unpickling can execute arbitrary code, so
# this file must come from a trusted source.
product_table = pd.read_pickle('product_table.pkl')

In [None]:
# Unique product names; collecting them in a set drops repeated names.
products_to_clean = {name for name in product_table['product_name']}

In [None]:
# Remove empty products (names that are blank or whitespace-only)
products_to_clean = set(filter(lambda name: name.strip(), products_to_clean))

In [None]:
# Strip parenthesised text from product names
_PARENTHESISED_GROUP = re.compile(r'\([^)]*\)')


def remove_words_in_parentheses(product):
    """Return `product` with every '(...)' group removed.

    Whitespace around a removed group is left untouched, and an opening
    parenthesis without a closing one is kept as is.
    """
    return _PARENTHESISED_GROUP.sub('', product)
# Apply the parenthesis removal to every product name (still a set afterwards)
products_to_clean = set(map(remove_words_in_parentheses, products_to_clean))

In [None]:
# Split product names on '/' and keep only the part before the first one.
# str.split returns the whole string as its first element when there is no
# '/', so names without a slash are simply stripped.
cleaned_products = {
    product.split('/')[0].strip()
    for product in products_to_clean
}

In [None]:
# Convert every product name to lowercase
cleaned_products = set(map(str.lower, cleaned_products))

In [None]:
# Lemmatizing products
# The lemmatizer and the stopword set do not depend on the input text, so they
# are created once on first use and shared by every call instead of being
# rebuilt per product.
_LEMMATIZE_RESOURCES = {}


def _get_lemmatize_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _LEMMATIZE_RESOURCES:
        _LEMMATIZE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        # assuming English stopwords
        _LEMMATIZE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _LEMMATIZE_RESOURCES['lemmatizer'], _LEMMATIZE_RESOURCES['stop_words']


def lemmatize_text(text):
    """Normalize a product name into a space-separated string of lemmas.

    The text is lowercased and tokenized with `word_tokenize`; tokens that are
    not purely alphanumeric (punctuation, hyphenated words, ...) or that are
    English stopwords are dropped; the remaining tokens are lemmatized with
    `WordNetLemmatizer` and joined with single spaces.

    Args:
        text: The product name to clean.

    Returns:
        The cleaned text; an empty string when no token survives the filter.
    """
    lemmatizer, stop_words = _get_lemmatize_resources()

    # Tokenize the lowercased text into words
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    kept_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

    # Lemmatize the surviving tokens and join them back into one string
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)
# Lemmatize every cleaned product name; the result is a list, so names that
# collapse to the same text appear more than once.
cleaned_texts = list(map(lemmatize_text, cleaned_products))

In [None]:
# Number of cleaned product texts produced by the lemmatization step
len(cleaned_texts)

In [None]:
# Persist the cleaned texts as a JSON array in clean_products.json
with open('clean_products.json', 'w') as out_file:
    out_file.write(json.dumps(cleaned_texts))

In [None]:
import matplotlib.pyplot as plt


# Product Clustering

In [None]:
# Reload the cleaned product texts from clean_products.json
with open('clean_products.json', 'r') as in_file:
    cleaned_texts = json.loads(in_file.read())

In [None]:
# Keep only short texts for clustering. Blank strings (names whose tokens were
# all removed during cleaning) and repeated texts add nothing to the prompt
# and only cost tokens, so they are dropped; dict.fromkeys removes duplicates
# while keeping first-seen order.
MAX_TEXT_LENGTH = 50  # characters
texts_to_cluster = [
    item
    for item in dict.fromkeys(cleaned_texts)
    if item and len(item) <= MAX_TEXT_LENGTH
]

In [None]:
# Number of texts that will be sent for clustering
len(texts_to_cluster)

In [None]:
# Never store the API key in the notebook: it is read from the environment
# (set OPENAI_API_KEY before starting Jupyter). The key that used to be
# hard-coded in this cell must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
#API call for clustering
def _split_on_line_boundaries(content, max_chars):
    """Split `content` into pieces of at most `max_chars` characters, cutting between lines.

    A single line longer than `max_chars` is the only thing that gets cut mid-line.
    """
    chunks = []
    current_lines = []
    current_len = 0
    for line in content.splitlines(keepends=True):
        # A line that cannot fit in any chunk has to be hard-split.
        while len(line) > max_chars:
            if current_lines:
                chunks.append(''.join(current_lines))
                current_lines, current_len = [], 0
            chunks.append(line[:max_chars])
            line = line[max_chars:]
        if current_len + len(line) > max_chars:
            chunks.append(''.join(current_lines))
            current_lines, current_len = [], 0
        current_lines.append(line)
        current_len += len(line)
    if current_lines:
        chunks.append(''.join(current_lines))
    return chunks


def industry_cluster_extractor(content, max_chars_per_request=30000):
    """Ask the chat model for cluster names covering the given business areas.

    `content` is split into chunks of at most `max_chars_per_request`
    characters (a character budget, not a token count). Chunks are cut
    between lines so no item is split across two requests, and the per-chunk
    answers are joined with newlines so the last name of one answer cannot
    fuse with the first name of the next.

    Args:
        content: Text listing the business areas / industries, one per line.
        max_chars_per_request: Maximum number of characters sent per request.

    Returns:
        The model answers joined with newlines; an empty string when
        `content` holds no non-whitespace text (no request is made then).

    Raises:
        ValueError: If `max_chars_per_request` is not positive.
    """
    if max_chars_per_request <= 0:
        raise ValueError("max_chars_per_request must be positive")

    role = """You are given a list of business areas or industries.

            Generate names for clusters that the business areas or industries belong to.

            Cluster based on the semantic meaning of the business areas or industries. 

            Output a list of semantically coherent and distinct cluster names.

            Do not concentrate on the operational model (i.e., import, export, manufacturing, etc.).

            Concentrate on the sector/industry of the business activity.

            Do not add bullet points, numbering, or any other text formatting.
            """

    # Function to make API request and get completion
    def make_api_request(content_part):
        chat_completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": role},
                {"role": "user", "content": content_part}
            ]
        )
        # The SDK types `content` as optional; treat a missing answer as empty.
        return chat_completion.choices[0].message.content or ''

    outputs = []
    for content_part in _split_on_line_boundaries(content, max_chars_per_request):
        # Separator-only chunks carry nothing to cluster.
        if not content_part.strip():
            continue
        outputs.append(make_api_request(content_part))

    return '\n'.join(outputs)

In [None]:
# Join the texts with a blank line between them and request cluster names
# (industry_cluster_extractor sends the text to the OpenAI API).
industry_str = "\n\n".join(texts_to_cluster)
industry_clusters = industry_cluster_extractor(industry_str)

In [None]:
# Remove every '- ' sequence from the answer, then take one stripped name per line
industry_cluster_names = [
    line.strip()
    for line in industry_clusters.replace('- ', '').splitlines()
]
# Unique, non-empty cluster names
api_clusters = set(filter(None, industry_cluster_names))

In [None]:
# Set iteration order for strings varies between kernel sessions, so sort the
# names to get the same list on every run.
api_clusters = sorted(api_clusters)

In [None]:
# Fixed list of cluster names passed to the labeling step below
prod_clusters = [
    "Transportation and Logistics",
    "Construction and Infrastructure",
    "Hospitality and Tourism",
    "Food and Beverage",
    "Technology and IT Services",
    "Healthcare and Medical Equipment",
    "Legal and Consulting Services",
    "Arts and Crafts",
    "Education and Training",
    "Real Estate and Property Management",
    "Retail and Consumer Goods",
    "Manufacturing and Industrial Equipment",
    "Financial and Insurance Services",
    "Marketing and Advertising",
    "Personal Care and Beauty Services",
    "Agriculture and Farming",
    "Environmental and Safety Services",
    "Furniture and Interior Design",
    "Energy and Utilities",
    "Entertainment and Event Management",
    "Engineering and Technical Services",
    "Fashion and Apparel",
    "Printing and Publishing",
    "Childcare and Educational Services",
]

# Product Labeling

In [None]:
# Unique raw product names, taken straight from the table (the cleaning steps
# above are not applied here)
products = set(product_table['product_name'])

In [None]:
# List form of the unique product names
product_list = list(products)

In [None]:
# Number of unique product names to label
len(product_list)

In [None]:
#API call for labeling
def industry_clusterer(products, clusters): 
    """Ask the chat model which of `clusters` a single product belongs to.

    Args:
        products: Text of the product / business area to classify; sent as the
            user message.
        clusters: The candidate cluster names; interpolated into the system
            prompt with their default string formatting.

    Returns:
        The model's answer with surrounding whitespace removed, so labels can
        be compared to the cluster names directly. An empty string is returned
        when the response carries no text content.
    """
    role = f"""You are given a list of business areas or industry clusters below. 
    {clusters}
    I will give you a single list of business areas or industry. Determine which of the business areas or industry clusters the given business areas or industry belongs to.
    Do not return any text other than cluster. Do not leave any labels empty and do not add any additional text of code formatting markers.
    """

    chat_completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": role},
            {"role": "user", "content": products},
        ],
    )
    # The SDK types `content` as optional; normalise a missing answer to ''.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()

In [None]:
# Labels collected so far, keyed by product name. Re-running this cell resets it.
labeled_products = {}


def label_products(products, clusters):
    """Label each product with a cluster and record it in `labeled_products`.

    Products that already have an entry in `labeled_products` are skipped, so
    calling the function again (or resuming after an interruption) does not
    repeat API requests for work that is already done.

    Args:
        products: Iterable of product names to label.
        clusters: Candidate cluster names, passed on to `industry_clusterer`.

    Returns:
        The module-level `labeled_products` dict mapping product -> cluster.
    """
    for product in products:
        if product in labeled_products:
            # Already labeled by an earlier call; avoid a duplicate request.
            continue
        cluster = industry_clusterer(product, clusters)
        labeled_products[product] = cluster
        print(f'{product} added to dict as {cluster}')

    print("Labeling process completed.")
    print(f"Total activities labeled: {len(labeled_products)}")
    return labeled_products
# Run the labeling over the full product list with the fixed cluster names
labeled_products = label_products(product_list, prod_clusters)

In [None]:
# NOTE(review): same call as the last line of the previous cell; confirm that
# a second labeling pass is intended.
labeled_products = label_products(product_list, prod_clusters)

In [None]:
# Display the product -> cluster mapping
labeled_products