In [None]:
from openai import OpenAI
import os
import pandas as pd
import math
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json
nltk.download('wordnet')          
nltk.download('stopwords')       
nltk.download('punkt') 

# Activity cleaning

In [2]:
with open('activity_table.pkl', 'rb') as f:
    activity_table = pd.read_pickle(f)

In [None]:
activity_table

In [24]:
#Extracting activities to clean and prepare for clustering
activities_to_clean = set(activity_table['activity_name'])

In [25]:
#removing empty actvities
activities_to_clean = {activity for activity in activities_to_clean if activity.strip()}

In [26]:
#Removing words in ()
def remove_words_in_parentheses(activity):
    return re.sub(r'\([^)]*\)', '', activity)
activities_to_clean = {remove_words_in_parentheses(activity) for activity in activities_to_clean}

In [27]:
# Spliting activities and keep everything before '/'
cleaned_activities = set()
for activity in activities_to_clean:
    if '/' in activity:
        cleaned_activity = activity.split('/')[0].strip()
    else:
        cleaned_activity = activity.strip()
    cleaned_activities.add(cleaned_activity)

In [29]:
#Converting to lowercase
cleaned_activities = {activity.lower() for activity in cleaned_activities}

In [32]:
#Lemmitizing activities
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))  # assuming English stopwords

    # Tokenize the text into words
    tokens = word_tokenize(text.lower())  # lowercase tokens
    
    # Remove stopwords and punctuation
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a cleaned text string
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text
cleaned_texts = [lemmatize_text(text) for text in cleaned_activities]

In [46]:
# Finding anagrams and keeping one instance of anagrams
def keep_one_anagram(activities):
    anagram_map = defaultdict(list)

    # Create a dictionary where keys are sorted activities and values are the original activities
    for activity in activities:
        sorted_activity = ' '.join(sorted(activity.split()))
        anagram_map[sorted_activity].append(activity)

    # List to store unique activities (one from each set of anagrams)
    unique_activities = []

    # Keep one instance of each set of anagrams
    for sorted_activity, orig_activities in anagram_map.items():
        unique_activities.append(orig_activities[0])  

    return unique_activities
clean_activities = keep_one_anagram(cleaned_texts)

In [47]:
len(clean_activities)

2337

In [48]:
clean_activities

['gynecology center',
 'power equipment installation',
 'cosmetic shop',
 'organizing child event',
 'import thread yarn',
 'hearing restoration',
 'online shop organic food',
 'online shop medical clothing',
 'organic certification',
 'gas equipment installation',
 'cultural center',
 'mobile phone operator',
 'household repair',
 'import specialty vehicle',
 'airline',
 'infrared heating system heating equipment installation',
 'sewing accessory',
 'import laminate',
 'import men clothing',
 'manufacture heating system',
 'mobile car service',
 'holiday home resort',
 'manufacture asphalt',
 'teacher website',
 'import sandwich panel',
 'online table reservation',
 'manufacture dry mortar',
 'import equipment',
 'export spring drinking water',
 'floor floor covering shop',
 'import body shaping machine',
 'painting drawing course',
 'import body massager',
 'manufacture bottle stopper cap closure',
 'information technology training center',
 'production wedding cake',
 'management sy

In [51]:
with open('clean_activities.json', 'w') as file:
    json.dump(clean_activities, file)

# Clustering with OpenAI API

In [3]:
os.environ["OPENAI_API_KEY"] = "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"
client = OpenAI(api_key = os.environ["OPENAI_API_KEY"])

In [54]:
#API call for clustering
def industry_cluster_extractor(content):
    role = """You are given a list of business areas or industries.

            Generate names for clusters that the business areas or industries belong to.

            Cluster based on the semantic meaning of the business areas or industries. 

            Output a list of semantically coherent and distinct cluster names.

            Do not concentrate on the operational model (i.e., import, export, manufacturing, etc.).

            Concentrate on the sector/industry of the business activity.

            Do not add bullet points, numbering, or any other text formatting.
            """
    
    # Calculate the number of parts to split based on token limit (30,000 per request)
    max_tokens_per_request = 30000
    num_parts = math.ceil(len(content) / max_tokens_per_request)
    
    # Function to make API request and get completion
    def make_api_request(content_part):
        chat_completion = client.chat.completions.create(
            model="gpt-4o",  
            messages=[
                {"role": "system", "content": role},
                {"role": "user", "content": content_part}
            ]
        )
        return chat_completion.choices[0].message.content

    
    outputs = []
    for i in range(num_parts):
        start_idx = i * max_tokens_per_request
        end_idx = (i + 1) * max_tokens_per_request
        content_part = content[start_idx:end_idx]
        output_part = make_api_request(content_part)
        outputs.append(output_part)

    joined_output = ''.join(outputs)

    return joined_output

In [65]:
industry_str = "\n\n".join(clean_activities)
industry_clusters = industry_cluster_extractor(industry_str)
industry_cluster_names = [industry.strip() for industry in industry_clusters.replace('- ', '').splitlines()]
api_clusters = {item for item in industry_cluster_names if item}

In [75]:
api_clusters = list(api_clusters)

In [76]:
with open('clusters.json', 'w') as json_file:
    json.dump(api_clusters, json_file, indent=4)

# Cluster Labeling

In [4]:
with open('clusters.json', 'r') as file:
    clusters = json.load(file)

In [5]:
len(clusters)

60

In [6]:
unique_activities = set(activity_table['activity_name'])
activities = list(unique_activities)

In [7]:
len(activities)

2415

In [8]:
#API call for labeling
def industry_clusterer(activities, clusters): 
    role = f"""You are given a list of business areas or industry clusters below. 
    {clusters}
    I will give you a single list of business areas or industry. Determine which of the business areas or industry clusters the given business areas or industry belongs to.
    Return a python dictionary with given list as a key and the cluster as a value. Return only one label that is the closest for each given activity list. \
    Do not return any text other than dictionary. Do not leave any labels empty and do not add any additional text of code formatting markers like \.   Always return both activity and label.
    Example of output:
        {{"['Consulting: Real Estate Transactions', 'Appraisal: Real Estate']": "Real Estate Services"}} 
    """
    content = f'{activities}'
        
    chat_completion =client.chat.completions.create(
        model = "gpt-4",
        messages = [{"role": "system", "content": role},
                    {"role": "user", "content": content}]
    )
    output = chat_completion.choices[0].message.content
    return output

In [9]:
result = {}
errors = []

In [None]:
for activity in activities:
    try:
        output = industry_clusterer(activity, clusters)
        json_out = json.loads(output)
        result.update(json_out)
        print(f'Added output {output} to dictionary')
    except json.JSONDecodeError as e:
        eval_out = eval(output)
        result.update(eval_out)
        continue
    except Exception as e:
        print(f'Error for {output}: {e}')
        errors.append(output)
        continue
print('Labeling is completed')   

In [15]:
with open('labeled_act.json', 'w') as json_file:   #488/2400
    json.dump(result, json_file, indent=4)