In [None]:
from openai import OpenAI
import os
import pandas as pd
import math
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json
# Fetch the NLTK data used by the cleaning cells: WordNet (lemmatizer),
# the stop-word lists, and the Punkt tokenizer models.
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are requested to work across versions.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Activity cleaning

In [None]:
# Load the pickled activity table directly from its path
# (NOTE: only unpickle files that come from a trusted source).
activity_table = pd.read_pickle('activity_table.pkl')

In [None]:
# Collect the distinct activity names that will be cleaned before clustering
activities_to_clean = {name for name in activity_table['activity_name']}

In [None]:
# Removing empty activities. Non-string entries (e.g. NaN for a missing name)
# are dropped as well: they have no .strip() and cannot be cleaned as text.
activities_to_clean = {
    activity
    for activity in activities_to_clean
    if isinstance(activity, str) and activity.strip()
}

In [None]:
# Removing words in ()
def remove_words_in_parentheses(activity):
    """Return ``activity`` with every ``(...)`` group deleted.

    A group runs from an opening parenthesis to the next closing one, so
    nested parentheses are only removed up to the first ``)``. Text around a
    removed group is left untouched, including any surrounding spaces.
    """
    parenthesised = re.compile(r'\([^)]*\)')
    return parenthesised.sub('', activity)
# Strip parenthesised text from every activity; the set collapses duplicates
activities_to_clean = set(map(remove_words_in_parentheses, activities_to_clean))

In [None]:
# Keep only the part of each activity before the first '/', trimmed of
# surrounding whitespace (activities without a '/' are simply trimmed)
cleaned_activities = {
    activity.partition('/')[0].strip() for activity in activities_to_clean
}

In [None]:
# Normalise every activity to lowercase
cleaned_activities = set(map(str.lower, cleaned_activities))

In [None]:
# Lemmatizing activities
# Built once and reused: creating the lemmatizer and re-reading the stop-word
# list on every call is loop-invariant work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))  # assuming English stopwords


def lemmatize_text(text):
    """Return ``text`` lowercased, filtered and lemmatized as one string.

    The text is tokenized with ``word_tokenize``; English stop words and any
    token that is not purely alphanumeric (punctuation, but also tokens that
    contain a hyphen or other symbol) are dropped; the remaining tokens are
    lemmatized and joined with single spaces.

    Args:
        text: The activity description to clean.

    Returns:
        The cleaned string, which is empty when no token survives filtering.
    """
    # Tokenize the lowercased text into words
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in _STOP_WORDS
    ]

    # Lemmatize and join back into a single cleaned string
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in kept_tokens)
# Lemmatize in sorted order so results are reproducible (set iteration order
# can differ between kernel sessions), and drop activities that became empty
# during cleaning (e.g. only stop words or only parenthesised text).
cleaned_texts = [
    cleaned
    for cleaned in map(lemmatize_text, sorted(cleaned_activities))
    if cleaned
]

In [None]:
# Finding activities that are word-order permutations of each other and
# keeping one instance of each
def keep_one_anagram(activities):
    """Collapse activities that contain the same words in a different order.

    Two activities belong to the same group when their whitespace-separated
    words, sorted, are equal. The first activity seen for each group is kept.

    Args:
        activities: Iterable of activity strings.

    Returns:
        A list with one activity per group, ordered by first appearance.
    """
    first_by_signature = {}
    for phrase in activities:
        # Sorted words form an order-independent signature of the phrase
        signature = ' '.join(sorted(phrase.split()))
        if signature not in first_by_signature:
            first_by_signature[signature] = phrase
    return list(first_by_signature.values())
# One representative per group of word-order permutations
clean_activities = keep_one_anagram(activities=cleaned_texts)

In [None]:
# Persist the cleaned activities as a JSON array
with open('clean_activities.json', 'w') as out_file:
    out_file.write(json.dumps(clean_activities))

# Clustering with OpenAI API

In [None]:
# The key is read from the environment instead of being written into the
# notebook: a hardcoded value leaks the secret when the notebook is shared and
# overwrites an OPENAI_API_KEY that was already configured correctly.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
client = OpenAI(api_key = os.environ["OPENAI_API_KEY"])

In [None]:
#API call for clustering
def industry_cluster_extractor(content, model="gpt-4o"):
    """Ask the chat model to propose cluster names for a list of activities.

    Args:
        content: Text listing the business areas/industries; sent as the
            user message.
        model: Chat model name passed to the API. Defaults to "gpt-4o".

    Returns:
        The text of the first completion choice, or an empty string when the
        API returns no text content, so callers can always treat the result
        as a string.
    """
    role = """You are given a list of business areas or industries.

            Generate names for clusters that the business areas or industries belong to.

            Cluster based on the semantic meaning of the business areas or industries. 

            Output a list of semantically coherent and distinct cluster names.

            Do not concentrate on the operational model (i.e., import, export, manufacturing, etc.).

            Concentrate on the sector/industry of the business activity.

            Do not add bullet points, numbering, or any other text formatting.
            """

    chat_completion = client.chat.completions.create(
        model = model,
        messages = [{"role": "system", "content": role},
                    {"role": "user", "content": content}],
    )

    output = chat_completion.choices[0].message.content

    # message.content may be None (no text returned); normalise to a string
    return output if output is not None else ""

In [None]:
# Send all cleaned activities in one request, separated by blank lines
industry_str = "\n\n".join(clean_activities)
industry_clusters = industry_cluster_extractor(industry_str)
# Remove only a leading "- " bullet from each line; a blanket
# replace('- ', '') would also delete dashes that are part of a cluster name.
industry_cluster_names = [
    re.sub(r'^\s*-\s+', '', industry).strip()
    for industry in industry_clusters.splitlines()
]
# Drop blank lines and duplicates
api_clusters = {item for item in industry_cluster_names if item}

In [None]:
# Convert to a list in sorted order: iterating a set of strings can yield a
# different order in each kernel session, which would change the prompt.
api_clusters = sorted(api_clusters)

In [None]:
# Fixed list of cluster names. This assignment replaces whatever value
# api_clusters held before this cell ran.
api_clusters = [
    'Education and Training Services',
    'Healthcare and Medical Services',
    'Manufacturing (General)',
    'Agriculture and Farming',
    'Retail and E-commerce',
    'Food and Beverage Production',
    'Hospitality and Tourism',
    'Automotive and Transportation Services',
    'Construction and Real Estate',
    'Technology and IT Services',
    'Energy and Utilities',
    'Financial Services',
    'Arts, Crafts, and Entertainment',
    'Environmental Services',
    'Consulting and Business Services',
    'Logistics and Supply Chain',
    'Legal and Security Services',
    'Media and Communication',
    'Social and Community Services',
    'Beauty and Personal Care Services',
    'Cleaning and Maintenance Services',
    'Sports and Recreation',
    'Government and Public Services',
]


# Cluster Labeling

In [None]:
# Distinct raw activity names, first as a set and then as a list
unique_activities = {name for name in activity_table['activity_name']}
activities = [*unique_activities]

In [None]:
#API call for labeling
def industry_clusterer(activities, clusters, model="gpt-4o-mini"):
    """Ask the chat model which cluster a given activity belongs to.

    Args:
        activities: The activity (or activities) to classify; formatted into
            the user message with an f-string.
        clusters: The candidate clusters; formatted into the system prompt
            with an f-string.
        model: Chat model name passed to the API. Defaults to "gpt-4o-mini".

    Returns:
        The text of the first completion choice with surrounding whitespace
        removed, so a stray newline or space does not create a distinct
        label. If the API returns no text content, that value (None) is
        returned unchanged.
    """
    role = f"""You are given a list of business areas or industry clusters below. 
    {clusters}
    I will give you a single list of business areas or industry. Determine which of the business areas or industry clusters the given business areas or industry belongs to.
    Do not return any text other than cluster. Do not leave any labels empty and do not add any additional text of code formatting markers.
    """
    content = f'{activities}'

    chat_completion = client.chat.completions.create(
        model = model,
        messages = [{"role": "system", "content": role},
                    {"role": "user", "content": content}]
    )
    label = chat_completion.choices[0].message.content
    return label.strip() if isinstance(label, str) else label

In [None]:
# Module-level store: labels gathered so far remain available in the kernel
# even if the labeling loop is interrupted part-way through.
labeled_activities = {}


def label_activities(activities, clusters):
    """Label each activity with a cluster and return the mapping.

    Calls ``industry_clusterer`` once per activity and records the result in
    the module-level ``labeled_activities`` dict, which is reset at the start
    of every call.

    Args:
        activities: Iterable of activity names to label.
        clusters: Candidate clusters passed through to ``industry_clusterer``.

    Returns:
        The ``labeled_activities`` dict mapping each activity to its cluster.
    """
    # `global` must be its own statement; combining it with an assignment
    # (`global x = {}`) is a SyntaxError.
    global labeled_activities
    labeled_activities = {}
    for activity in activities:
        cluster = industry_clusterer(activity, clusters)
        labeled_activities[activity] = cluster
        print(f'{activity} added to dict as {cluster}')

    print("Labeling process completed.")
    print(f"Total activities labeled: {len(labeled_activities)}")
    return labeled_activities

In [None]:
# `act_clusters` is not defined anywhere in this notebook (NameError on a
# fresh kernel); the cluster list built above is `api_clusters`.
labeled_activities = label_activities(activities, api_clusters)


In [None]:
# Save the activity -> cluster mapping as indented JSON
with open('labeled_act.json', 'w') as out_file:
    out_file.write(json.dumps(labeled_activities, indent=4))

In [None]:
# Reload the saved activity -> cluster mapping from disk
with open('labeled_act.json', 'r') as in_file:
    labeled_activities = json.loads(in_file.read())

In [None]:
# Look up each row's activity name in the label mapping and store the result
# in a new 'cluster' column
cluster_by_row = activity_table['activity_name'].map(labeled_activities)
activity_table['cluster'] = cluster_by_row

In [None]:
# Write the labelled table to 'activity_table.pkl' — the same path the table
# was loaded from, so the original file is overwritten
activity_table.to_pickle(path='activity_table.pkl')