# Matching ISIC classifications

From here :
https://unstats.un.org/unsd/classifications/Econ 

In [1]:
import re
import itertools as iter
import numpy as np

# data
import pandas as pd

# import shapely
from scalenav.plotting import cmap

import ibis as ib
from ibis import _
ib.options.interactive = True

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# plots
from wordcloud import WordCloud

# from datashader import transfer_functions as tf, reductions as rd

  from tqdm.autonotebook import tqdm, trange


In [2]:
from parameters import *

/Users/cenv1069/Documents/data/notebooks


## Support function 

In [3]:
# def fuzzy_match_category(poi_category, isic_df, threshold=60):
#     match = process.extractOne(
#         poi_category, 
#         isic_df['Description'], 
#         scorer=fuzz.ratio
#     )
#     # print(match)
#     if match and match[1] >= threshold:
#         return isic_df.iloc[match[2]][["Code"]], # ,"Description"
#     return None#, None

## Data

In [4]:
isic_simple = pd.read_csv("https://unstats.un.org/unsd/classifications/Econ/Download/In%20Text/ISIC_Rev_4_english_structure.Txt")

In [5]:
print(isic_simple.shape)
isic_simple.head()

(766, 2)


Unnamed: 0,Code,Description
0,A,"Agriculture, forestry and fishing"
1,01,"Crop and animal production, hunting and relate..."
2,011,Growing of non-perennial crops
3,0111,"Growing of cereals (except rice), leguminous c..."
4,0112,Growing of rice


In [6]:
letter = []
# number
main_letter = "A"
for val in isic_simple["Code"]:
    if re.search(r"[A-Z]",val):
        main_letter = val
        letter.append(main_letter)
    else :
        letter.append(main_letter)

In [7]:
letter[0:5]

['A', 'A', 'A', 'A', 'A']

In [8]:
isic_simple["section"]=letter

In [9]:
filt = [True if re.search(r"^\d{2}$",string=x) else False for x in isic_simple.Code]
isic_double = isic_simple.loc[filt].copy()

In [10]:
print(isic_double.shape)
isic_double.head()

(88, 3)


Unnamed: 0,Code,Description,section
1,1,"Crop and animal production, hunting and relate...",A
39,2,Forestry and logging,A
48,3,Fishing and aquaculture,A
56,5,Mining of coal and lignite,B
61,6,Extraction of crude petroleum and natural gas,B


### Adding detail for better learning 
By concatenating the description for all the subclasses into a single one.

In [11]:
detailed_descr=[]
for ind,ind_p in zip(isic_double.index[:-1],isic_double.index[1:]):
    detailed_descr.append(", ".join(isic_simple.iloc[ind:ind_p]["Description"]))
detailed_descr.append(", ".join(isic_simple.iloc[isic_double.index[-1]:]["Description"]))

In [12]:
print(len(detailed_descr))
detailed_descr

88


['Crop and animal production, hunting and related service activities, Growing of non-perennial crops, Growing of cereals (except rice), leguminous crops and oil seeds, Growing of rice, Growing of vegetables and melons, roots and tubers, Growing of sugar cane, Growing of tobacco, Growing of fibre crops, Growing of other non-perennial crops, Growing of perennial crops, Growing of grapes, Growing of tropical and subtropical fruits, Growing of citrus fruits, Growing of pome fruits and stone fruits, Growing of other tree and bush fruits and nuts, Growing of oleaginous fruits, Growing of beverage crops, Growing of spices, aromatic, drug and pharmaceutical crops, Growing of other perennial crops, Plant propagation, Plant propagation, Animal production, Raising of cattle and buffaloes, Raising of horses and other equines, Raising of camels and camelids, Raising of sheep and goats, Raising of swine/pigs, Raising of poultry, Raising of other animals, Mixed farming, Mixed farming, Support activit

In [13]:
isic_double["detailed_descr"] = detailed_descr

### Matching DOSE, high level aggregation

In [14]:
dose_types = {"services" : "G-U"
              ,"manufacturing" : "B-F"
              ,"agriculture" : "A"}

In [15]:
dose_to_isic = [key for x in isic_double.section for (key,val) in dose_types.items() if re.search(pattern=f"[{val}]",string=x)]

In [16]:
isic_double["dose"] = dose_to_isic

In [17]:
isic_double.iloc[9].loc["detailed_descr"]

'Manufacture of beverages, Manufacture of beverages, Distilling, rectifying and blending of spirits, Manufacture of wines, Manufacture of malt liquors and malt, Manufacture of soft drinks; production of mineral waters and other bottled waters'

## Combining with place type from Overture

In [18]:
place_types = pd.read_csv("data/overture_place_types.csv")
landuse_type = pd.read_csv("data/overture_landuse_type.csv")

In [19]:
place_types.head()

Unnamed: 0,main_cat,sec_cat,raw_cat
0,eat_and_drink,eat_and_drink,eat_and_drink
1,eat_and_drink,restaurant,restaurant
2,eat_and_drink,restaurant,afghan_restaurant
3,eat_and_drink,restaurant,african_restaurant
4,eat_and_drink,restaurant,ethiopian_restaurant


In [20]:
place_types.drop_duplicates("sec_cat",inplace=True)

In [21]:
landuses_raw = landuse_type.iloc[:,0].values.tolist()

### Landuse

In [22]:
landuse_type = pd.DataFrame([x.split(" ") for x in landuses_raw],columns=["sec_cat","main_cat"])

In [23]:
landuse_type.head()

Unnamed: 0,sec_cat,main_cat
0,greenfield,construction
1,college,education
2,village_green,park
3,nature_reserve,protected
4,education,education


### Overture categories together

In [24]:
overture_cats = pd.concat([place_types,landuse_type],axis=0,ignore_index=True)

In [25]:
print(overture_cats.shape)
overture_cats.head()#[overture_cats.main_cat=='transport']

(812, 3)


Unnamed: 0,main_cat,sec_cat,raw_cat
0,eat_and_drink,eat_and_drink,eat_and_drink
1,eat_and_drink,restaurant,restaurant
2,eat_and_drink,bar,bar
3,eat_and_drink,cafe,cafe
4,accommodation,accommodation,accommodation


In [26]:
overture_cats.sec_cat.nunique()

800

## Using LLMs : background

We use a lightweight sentence encoder model to match categories in incoming data sets to the reference ISIC classifications. This helps bring the desciptions and categorical variables in the incoming data sets to a standardised and commonly used definition of economic activity. 

More information : 
[SBERT light weight models](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)

## Model setup for classification

In [27]:
model = SentenceTransformer(selected_model)

# Precompute embeddings for ISIC categories
isic_double['embedding'] = isic_double["detailed_descr"].apply(lambda x: model.encode(x))

### Embedding and matching function

In [28]:
def embedding_match_category(poi_category, isic_df):
    poi_embedding = model.encode(poi_category)
    embeddings = list(isic_df['embedding'])
    similarities = cosine_similarity([poi_embedding], embeddings)[0]
    best_match_index = similarities.argmax()
    best_match_score = similarities[best_match_index]
    best_match_row = isic_df.iloc[best_match_index]
    
    return [best_match_row['section'],best_match_row['Code'], best_match_row['Description'], best_match_row['dose'], best_match_score]

## Matching categories

In [29]:
# fuzzy_thres = .2
# place_types["fuzzy_isic"] = place_types["raw_cat"].apply(lambda x: fuzzy_match_category(x.replace("_"," "),isic_simple)) # ,"isic_description"

In [30]:
overture_cats[["section","isic_embed", "isic_descr","dose", "match_score"]] = [
    *overture_cats["sec_cat"].apply(
        lambda x: embedding_match_category(x.replace("_", " "), isic_double)
    )
]

In [31]:
overture_cats.shape

(812, 8)

In [32]:
overture_cats.loc[overture_cats.match_score<0.3,["sec_cat","section","isic_embed","isic_descr","dose","match_score"]]

Unnamed: 0,sec_cat,section,isic_embed,isic_descr,dose,match_score
0,eat_and_drink,Q,86,Human health activities,services,0.264077
2,bar,R,92,Gambling and betting activities,services,0.250466
6,cabin,C,31,Manufacture of furniture,manufacturing,0.285907
12,hostel,I,55,Accommodation,services,0.287494
14,inn,I,55,Accommodation,services,0.250833
...,...,...,...,...,...,...
798,driving_range,G,45,Wholesale and retail trade and repair of motor...,services,0.289407
799,schoolyard,E,37,Sewerage,manufacturing,0.256795
802,brownfield,E,37,Sewerage,manufacturing,0.230865
804,meadow,N,81,Services to buildings and landscape activities,services,0.229788


In [33]:
overture_cats.loc[630,["sec_cat","section","isic_embed","isic_descr","dose","match_score"]].values

array(['hazardous_waste_disposal', 'E', '39',
       'Remediation activities and other waste management services',
       'manufacturing', 0.6262050867080688], dtype=object)

In [34]:
overture_cats.loc[overture_cats.sec_cat=="shopping"]

Unnamed: 0,main_cat,sec_cat,raw_cat,section,isic_embed,isic_descr,dose,match_score
258,retail,shopping,shopping,G,47,"Retail trade, except of motor vehicles and mot...",services,0.477107


In [35]:
overture_cats.tail()

Unnamed: 0,main_cat,sec_cat,raw_cat,section,isic_embed,isic_descr,dose,match_score
807,horticulture,greenhouse_horticulture,,A,1,"Crop and animal production, hunting and relate...",agriculture,0.457248
808,recreation,beach_resort,,N,79,"Travel agency, tour operator, reservation serv...",services,0.287185
809,developed,institutional,,S,94,Activities of membership organizations,services,0.303659
810,protected,state_park,,R,91,"Libraries, archives, museums and other cultura...",services,0.313521
811,military,danger_area,,N,80,Security and investigation activities,services,0.336976


In [36]:
overture_cats[overture_cats.sec_cat=="farmland"]

Unnamed: 0,main_cat,sec_cat,raw_cat,section,isic_embed,isic_descr,dose,match_score
734,agriculture,farmland,,A,1,"Crop and animal production, hunting and relate...",agriculture,0.506969


In [37]:
# match_thres = .4

In [38]:
overture_cats.loc[~overture_cats["sec_cat"].duplicated()].to_csv(place_types_filename,index=False)

In [39]:
place_types_filename

'data/place_types_isic_all-mpnet-base-v2.csv'