In [16]:
from openai import OpenAI
import os
import pandas as pd
import math
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json
# Fetch the NLTK resources used by the cleaning step below: WordNet for the
# lemmatizer, the stop word lists, and the Punkt models behind word_tokenize.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('wordnet')          
nltk.download('stopwords')       
nltk.download('punkt') 

[nltk_data] Downloading package wordnet to /Users/copa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/copa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/copa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Activity cleaning

In [25]:
# Load the pickled activity table directly from its path.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
activity_table = pd.read_pickle('activity_table.pkl')

In [None]:
activity_table

In [24]:
# Collect the distinct activity names that will be cleaned and then clustered.
activities_to_clean = {name for name in activity_table['activity_name']}

In [25]:
# Drop activities that are empty or consist only of whitespace.
activities_to_clean = set(filter(lambda name: name.strip(), activities_to_clean))

In [26]:
# Strip parenthesised qualifiers such as "(IMPORTER)" from an activity name.
def remove_words_in_parentheses(activity):
    """Return `activity` with every "(...)" group removed.

    Only the parentheses and their contents are deleted; surrounding
    whitespace is left untouched, and a "(" without a closing ")" is kept.
    """
    parenthesised = re.compile(r'\([^)]*\)')
    return parenthesised.sub('', activity)
# Apply the parenthesis removal to every activity; identical results collapse in the set.
activities_to_clean = set(map(remove_words_in_parentheses, activities_to_clean))

In [27]:
# Keep only the text before the first '/' of each activity, trimmed of
# surrounding whitespace (activities without a '/' are simply trimmed).
cleaned_activities = {
    activity.split('/')[0].strip() for activity in activities_to_clean
}

In [29]:
# Normalise every activity to lowercase.
cleaned_activities = set(map(str.lower, cleaned_activities))

In [32]:
# Lemmatizing activities
def lemmatize_text(text):
    """Normalise `text` into a space-separated string of lemmatized tokens.

    The text is lowercased and tokenized with NLTK's `word_tokenize`. Tokens
    that are English stop words, or that are not purely alphanumeric, are
    discarded; the remaining tokens are lemmatized with `WordNetLemmatizer`
    and joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for word in word_tokenize(text.lower()):
        # str.isalnum() is False for any token containing a non-alphanumeric
        # character, so punctuation tokens are skipped here.
        # NOTE(review): this would also skip hyphenated words if the tokenizer
        # emits them as a single token — confirm that is intended.
        if not word.isalnum() or word in english_stop_words:
            continue
        lemmas.append(wordnet.lemmatize(word))

    return ' '.join(lemmas)
# Lemmatize every cleaned activity (one output string per activity).
cleaned_texts = list(map(lemmatize_text, cleaned_activities))

In [46]:
# Collapse activities that contain the same words in a different order.
def keep_one_anagram(activities):
    """Return one representative per group of word-order variants.

    Two activities belong to the same group when their whitespace-separated
    words, sorted, are equal. The first activity seen in each group is kept,
    and groups appear in the order their first member was encountered.
    """
    first_seen = {}
    for phrase in activities:
        # Sorting the words gives a key that is independent of word order.
        signature = ' '.join(sorted(phrase.split()))
        first_seen.setdefault(signature, phrase)
    return list(first_seen.values())
# Deduplicate word-order variants. cleaned_texts was built by iterating a set,
# so which variant is kept (and the list order) may differ between interpreter runs.
clean_activities = keep_one_anagram(cleaned_texts)

In [47]:
len(clean_activities)

2337

In [48]:
clean_activities

['gynecology center',
 'power equipment installation',
 'cosmetic shop',
 'organizing child event',
 'import thread yarn',
 'hearing restoration',
 'online shop organic food',
 'online shop medical clothing',
 'organic certification',
 'gas equipment installation',
 'cultural center',
 'mobile phone operator',
 'household repair',
 'import specialty vehicle',
 'airline',
 'infrared heating system heating equipment installation',
 'sewing accessory',
 'import laminate',
 'import men clothing',
 'manufacture heating system',
 'mobile car service',
 'holiday home resort',
 'manufacture asphalt',
 'teacher website',
 'import sandwich panel',
 'online table reservation',
 'manufacture dry mortar',
 'import equipment',
 'export spring drinking water',
 'floor floor covering shop',
 'import body shaping machine',
 'painting drawing course',
 'import body massager',
 'manufacture bottle stopper cap closure',
 'information technology training center',
 'production wedding cake',
 'management sy

In [51]:
# Persist the deduplicated activity list as JSON.
with open('clean_activities.json', 'w') as file:
    file.write(json.dumps(clean_activities))

# Clustering with OpenAI API

In [4]:
# Read the API key from the environment instead of embedding it in the notebook.
# SECURITY: a literal key was previously hard-coded in this cell; treat that key
# as leaked and revoke it.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [54]:
# API call for clustering
def _split_on_line_boundaries(text, max_chars):
    """Split `text` into chunks of at most `max_chars` characters.

    Chunks end on line boundaries so that no entry is cut in half; only a
    single line longer than `max_chars` is hard-split. Concatenating the
    chunks yields `text` again.
    """
    chunks = []
    current = ''
    for line in text.splitlines(keepends=True):
        if current and len(current) + len(line) > max_chars:
            chunks.append(current)
            current = ''
        # A single over-long line cannot fit in any chunk: hard-split it.
        while len(line) > max_chars:
            chunks.append(line[:max_chars])
            line = line[max_chars:]
        current += line
    if current:
        chunks.append(current)
    return chunks


def industry_cluster_extractor(content, max_chars_per_request=30000, api_client=None):
    """Ask the chat model for cluster names covering the activities in `content`.

    `content` is split into chunks of at most `max_chars_per_request`
    characters, each chunk is sent as a separate request, and the responses
    are joined with newlines.

    Args:
        content: Newline-separated business areas / industries.
        max_chars_per_request: Maximum chunk size. The limit is measured in
            characters (not tokens), as a simple stand-in for the model's
            token limit.
        api_client: Object exposing `chat.completions.create`. Defaults to
            the module-level `client`.

    Returns:
        The model responses joined by newlines; an empty string when
        `content` is empty.

    Raises:
        ValueError: If `max_chars_per_request` is smaller than 1.
    """
    role = """You are given a list of business areas or industries.

            Generate names for clusters that the business areas or industries belong to.

            Cluster based on the semantic meaning of the business areas or industries. 

            Output a list of semantically coherent and distinct cluster names.

            Do not concentrate on the operational model (i.e., import, export, manufacturing, etc.).

            Concentrate on the sector/industry of the business activity.

            Do not add bullet points, numbering, or any other text formatting.
            """

    if max_chars_per_request < 1:
        raise ValueError("max_chars_per_request must be a positive integer")
    if api_client is None:
        api_client = client  # module-level OpenAI client

    def make_api_request(content_part):
        """Send one chunk to the model and return the text of its reply."""
        chat_completion = api_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": role},
                {"role": "user", "content": content_part}
            ]
        )
        # The reply text may be None; normalise it so the join below cannot fail.
        return chat_completion.choices[0].message.content or ''

    # Chunks respect line boundaries, so no activity is sent to the model in two halves.
    outputs = [
        make_api_request(content_part)
        for content_part in _split_on_line_boundaries(content, max_chars_per_request)
    ]

    # Join with a newline: with an empty separator the last cluster name of one
    # response would be fused with the first name of the next response.
    return '\n'.join(outputs)

In [65]:
# Send the activities (separated by blank lines) to the model for cluster names.
industry_str = "\n\n".join(clean_activities)
industry_clusters = industry_cluster_extractor(industry_str)

# Remove "- " bullet markers, trim each returned line and drop blank ones.
debulleted = industry_clusters.replace('- ', '')
industry_cluster_names = list(map(str.strip, debulleted.splitlines()))
api_clusters = set(filter(None, industry_cluster_names))

In [75]:
# JSON cannot serialise a set, so turn the cluster names into a list.
api_clusters = [*api_clusters]

In [76]:
# Save the generated cluster names as pretty-printed JSON.
with open('clusters.json', 'w') as json_file:
    json_file.write(json.dumps(api_clusters, indent=4))

# Cluster Labeling

In [4]:
import json

In [1]:
# Cluster labels offered to the model when labeling each activity below.
# NOTE(review): presumably curated from the names saved in clusters.json — confirm.
act_clusters = ['Education and Training Services',
'Healthcare and Medical Services',
'Manufacturing (General)',
'Agriculture and Farming',
'Retail and E-commerce',
'Food and Beverage Production',
'Hospitality and Tourism',
'Automotive and Transportation Services',
'Construction and Real Estate',
'Technology and IT Services',
'Energy and Utilities',
'Financial Services',
'Arts, Crafts, and Entertainment',
'Environmental Services',
'Consulting and Business Services',
'Logistics and Supply Chain',
'Legal and Security Services',
'Media and Communication',
'Social and Community Services',
'Beauty and Personal Care Services',
'Cleaning and Maintenance Services',
'Sports and Recreation',
'Government and Public Services']


In [5]:
# Distinct raw activity names, as a list, ready to be labeled one at a time.
unique_activities = {name for name in activity_table['activity_name']}
activities = [*unique_activities]

In [6]:
len(activities)

2415

In [10]:
# API call for labeling
def industry_clusterer(activities, clusters):
    """Ask the chat model which of `clusters` the given activity belongs to.

    `clusters` is interpolated into the system prompt and `activities` is
    sent, formatted as a string, as the user message. The text of the model's
    first choice is returned unchanged.
    """
    role = f"""You are given a list of business areas or industry clusters below. 
    {clusters}
    I will give you a single list of business areas or industry. Determine which of the business areas or industry clusters the given business areas or industry belongs to.
    Do not return any text other than cluster. Do not leave any labels empty and do not add any additional text of code formatting markers.
    """
    messages = [
        {"role": "system", "content": role},
        {"role": "user", "content": f'{activities}'},
    ]
    response = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    return response.choices[0].message.content

In [11]:
def label_activities(activities, clusters):
    """Label each activity with a cluster and return the mapping.

    Calls `industry_clusterer` once per item of `activities`, printing a
    progress line for each one and a two-line summary at the end.

    Returns:
        dict mapping each activity to the label returned for it.
    """
    labeled_activities = {}

    for activity in activities:
        cluster = labeled_activities[activity] = industry_clusterer(activity, clusters)
        print(f'{activity} added to dict as {cluster}')

    print(
        "Labeling process completed.",
        f"Total activities labeled: {len(labeled_activities)}",
        sep="\n",
    )
    return labeled_activities

In [12]:
# Label every raw activity name; this issues one chat-completion request per activity.
labeled_activities = label_activities(activities, act_clusters)


ROAD CONSTRUCTION added to dict as Construction and Real Estate
IMPORT OF SELF-ADHESIVE TAPES / STICKY TAPES (IMPORTER) added to dict as Manufacturing (General)
ONLINE SHOP: KIDS/CHILDREN'S GOODS added to dict as Retail and E-commerce
ALCOHOLIC BEVERAGES/DRINKS (RESELLER) added to dict as Food and Beverage Production
CHEMICAL TREATMENT OF PLANTS AND TREES added to dict as Agriculture and Farming
POSTAGE STAMPS / POSTAGE STAMP SHOP added to dict as Retail and E-commerce
IMPORT OF SHOE COVERS / SHOE COVERS (IMPORTER) added to dict as Retail and E-commerce
CONSULTING: USE OF SERVER SYSTEMS added to dict as Consulting and Business Services
DIGITAL EQUIPMENT REPAIR AND SERVICE added to dict as Technology and IT Services
IMPORT OF TABLEWARE, DINNER SETS / DISHWARE, DINNERWARE (IMPORTER) added to dict as Retail and E-commerce
LAUNDRY added to dict as Cleaning and Maintenance Services
DANCING CLOTHES, SHOES AND ACCESSORIES SHOP added to dict as Retail and E-commerce
IMPORT OF AIR CONDITIONERS 

In [13]:
labeled_activities

{'ROAD CONSTRUCTION': 'Construction and Real Estate',
 'IMPORT OF SELF-ADHESIVE TAPES / STICKY TAPES (IMPORTER)': 'Manufacturing (General)',
 "ONLINE SHOP: KIDS/CHILDREN'S GOODS": 'Retail and E-commerce',
 'ALCOHOLIC BEVERAGES/DRINKS (RESELLER)': 'Food and Beverage Production',
 'CHEMICAL TREATMENT OF PLANTS AND TREES': 'Agriculture and Farming',
 'POSTAGE STAMPS / POSTAGE STAMP SHOP': 'Retail and E-commerce',
 'IMPORT OF SHOE COVERS / SHOE COVERS (IMPORTER)': 'Retail and E-commerce',
 'CONSULTING: USE OF SERVER SYSTEMS': 'Consulting and Business Services',
 'DIGITAL EQUIPMENT REPAIR AND SERVICE': 'Technology and IT Services',
 'IMPORT OF TABLEWARE, DINNER SETS / DISHWARE, DINNERWARE (IMPORTER)': 'Retail and E-commerce',
 'LAUNDRY': 'Cleaning and Maintenance Services',
 'DANCING CLOTHES, SHOES AND ACCESSORIES SHOP': 'Retail and E-commerce',
 'IMPORT OF AIR CONDITIONERS / AIR CONDITIONERS (IMPORTER)': 'Retail and E-commerce',
 'LIGHTING EQUIPMENT MAINTENANCE': 'Cleaning and Maintenance 

In [14]:
# Save the activity -> cluster mapping as pretty-printed JSON.
with open('labeled_act.json', 'w') as json_file:
    json_file.write(json.dumps(labeled_activities, indent=4))

In [23]:
# Reload the saved activity -> cluster mapping from disk.
with open('labeled_act.json', 'r') as file:
    labeled_activities = json.loads(file.read())

In [24]:
labeled_activities

{'ROAD CONSTRUCTION': 'Construction and Real Estate',
 'IMPORT OF SELF-ADHESIVE TAPES / STICKY TAPES (IMPORTER)': 'Manufacturing (General)',
 "ONLINE SHOP: KIDS/CHILDREN'S GOODS": 'Retail and E-commerce',
 'ALCOHOLIC BEVERAGES/DRINKS (RESELLER)': 'Food and Beverage Production',
 'CHEMICAL TREATMENT OF PLANTS AND TREES': 'Agriculture and Farming',
 'POSTAGE STAMPS / POSTAGE STAMP SHOP': 'Retail and E-commerce',
 'IMPORT OF SHOE COVERS / SHOE COVERS (IMPORTER)': 'Retail and E-commerce',
 'CONSULTING: USE OF SERVER SYSTEMS': 'Consulting and Business Services',
 'DIGITAL EQUIPMENT REPAIR AND SERVICE': 'Technology and IT Services',
 'IMPORT OF TABLEWARE, DINNER SETS / DISHWARE, DINNERWARE (IMPORTER)': 'Retail and E-commerce',
 'LAUNDRY': 'Cleaning and Maintenance Services',
 'DANCING CLOTHES, SHOES AND ACCESSORIES SHOP': 'Retail and E-commerce',
 'IMPORT OF AIR CONDITIONERS / AIR CONDITIONERS (IMPORTER)': 'Retail and E-commerce',
 'LIGHTING EQUIPMENT MAINTENANCE': 'Cleaning and Maintenance 

In [27]:
# Attach each row's cluster label by looking its activity_name up in labeled_activities.
# Names missing from the mapping get NaN (pandas Series.map behaviour for dict mappers).
activity_table['cluster'] = activity_table['activity_name'].map(labeled_activities) 

In [28]:
activity_table

Unnamed: 0,url_id,activity_name_id,activity_name,act_first_level_cluster,act_second_level_cluster,url,cluster
0,0006c310b6a71cc9e85da7aee19e2fe7,258ab09ec70c9bd3a63739f001e49cdf,CLOTHES / CLOTHING SHOP,COMPANIES AND NON-PROFIT ORGANIZATIONS,TRADE: NON-FOOD SHOPS,https://www.spyur.am/en/companies/rich-classic...,Retail and E-commerce
1,00098c3343fe620b76c5267ec353119a,b495cdadb3c1955ae04a1c7681cc72d9,ONLINE SHOP: MILITARY CLOTHING,COMPANIES AND NON-PROFIT ORGANIZATIONS,TRADE: ONLINE NON-FOOD SHOPS,https://www.spyur.am/en/companies/zinujam-mili...,Retail and E-commerce
2,000c418c4d41709259a5d052890adf8d,0edc1d76624f25c236da232104a88442,MANUFACTURE OF BUILDING MATERIALS / BUILDING M...,COMPANIES AND NON-PROFIT ORGANIZATIONS,PRODUCTION/MANUFACTURING: NON-FOOD MANUFACTURE...,https://www.spyur.am/en/companies/chanaparh-ro...,Manufacturing (General)
3,000c418c4d41709259a5d052890adf8d,51947763f87b52750731ff640e982a0e,BUILDING CONSTRUCTION,COMPANIES AND NON-PROFIT ORGANIZATIONS,CONSTRUCTION,https://www.spyur.am/en/companies/chanaparh-ro...,Construction and Real Estate
4,000d3c03f88ea054e58c29425b38e7bd,5b41b946c8084bf3719b9d35fba20906,IMPORT OF MEDICAL SUPPLIES/ACCESSORIES / MEDIC...,COMPANIES AND NON-PROFIT ORGANIZATIONS,TRADE: NON-FOOD IMPORTERS,https://www.spyur.am/en/companies/apex-medical...,Healthcare and Medical Services
...,...,...,...,...,...,...,...
31701,fff97927fc680f290cd8810a92fecb07,16cd1a31ed10f7c10ea45f98fb2eaccc,OUTGOING TOURISM,COMPANIES AND NON-PROFIT ORGANIZATIONS,"TOURISM, RECREATION AND RELATED SERVICES",https://www.spyur.am/en/companies/sirov-travel...,Hospitality and Tourism
31702,fff97927fc680f290cd8810a92fecb07,62f917196aa516aaf04046be405b61d6,NCOMING TOURISM,COMPANIES AND NON-PROFIT ORGANIZATIONS,"TOURISM, RECREATION AND RELATED SERVICES",https://www.spyur.am/en/companies/sirov-travel...,Hospitality and Tourism
31703,fffed1a93fbe2686bc32ff754aa7ec26,9a8eb6b69f2873c73a0299ba9aaada48,MANUFACTURE OF FURNITURE / FURNITURE (MANUFACT...,COMPANIES AND NON-PROFIT ORGANIZATIONS,PRODUCTION/MANUFACTURING: NON-FOOD MANUFACTURE...,https://www.spyur.am/en/companies/zaqaryan-fur...,Manufacturing (General)
31704,ffff17b4e13fbfe1fe106aafffb246f7,6806f21bce58ba2b29464f5fa56630f8,BUILDING MATERIALS SHOP,COMPANIES AND NON-PROFIT ORGANIZATIONS,TRADE: NON-FOOD SHOPS,https://www.spyur.am/en/companies/suren-petros...,Construction and Real Estate


In [32]:
# Write the table, now including the 'cluster' column, back to 'activity_table.pkl'.
# This is the same path the notebook loads from at the top, so the input file is overwritten.
activity_table.to_pickle('activity_table.pkl')