## Feed Policy Text data into sentence-transformer to get position in vector space

In [3]:
import configparser
import pandas as pd
import numpy as np
import os
import time
import spacy
from spacy import displacy
import string
from pathlib import Path

import os

# Set the TensorFlow C++ log level *before* importing sentence_transformers.
# TensorFlow reads TF_CPP_MIN_LOG_LEVEL when its native runtime is first loaded,
# so setting it after a library has already pulled TensorFlow in is too late.
# NOTE(review): whether sentence_transformers loads TensorFlow at import time
# depends on the installed environment; setting the variable first is safe either way.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from sentence_transformers import SentenceTransformer, util

In [4]:
# Load project data directories from config.ini (resolved relative to the
# current working directory).
CONFIG_FILE = "config.ini"

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing config fails with a
# clear message instead of a confusing KeyError on the section lookup below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_FILE}' from {Path.cwd()}; "
        "run the notebook from the directory that contains it."
    )

# access values
raw_path = Path(config["default"]["raw_path"])
interim_path = Path(config["default"]["interim_path"])
processed_path = Path(config["default"]["processed_path"])

In [6]:
# Load the tokenised, sub-sectioned policy text produced by an earlier step.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the CSV was written with its index - confirm whether it is needed.
policy_csv = interim_path / "tokenised_policy_incentives_subsectioned.csv"
df = pd.read_csv(policy_csv)
df

Unnamed: 0,country,section,text_clean
0,Austria,Purchase subsidies,federal purchase subsidy scheme e mobilität 20...
1,Austria,Registration tax benefits,bevs are fully exempt from nova registration tax
2,Austria,Ownership / Circulation Tax Benefits,1 april 2025 bevs are no longer exempt from mo...
3,Austria,Company tax benefits,zero emission vehicles benefit from exemption ...
4,Austria,VAT benefits,companies may deduct vat fully for bevs priced...
...,...,...,...
275,United Kingdom,Registration tax benefits,vehicle excise duty ved
276,United Kingdom,Ownership tax benefits,vehicle excise duty ved
277,United Kingdom,Company tax benefits,benefit in kind bik tax first year capital all...
278,United Kingdom,AF infrastructure incentives,electric vehicle homecharge scheme evhs provid...


#### Import sentence-transformer model. 
* Documentation on these options found here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
* Chose `all-MiniLM-L6-v2`: the 'all-*' models are the best general-purpose models (out of those available) and were trained on all available training data.
* 'Mini' version is faster, and small enough to run locally without any issues.

In [13]:
# Name of the pretrained sentence-transformers checkpoint used throughout the notebook
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

Quick check that the model is working and returning values as expected

In [14]:
# TEST EMBEDDINGS
# Smoke test: encode a single sentence and make the cell fail (rather than just
# print) if the model does not return exactly one embedding per input.
test_sentences = ["Hello world"]
embeddings = model.encode(test_sentences)
assert embeddings.shape[0] == len(test_sentences), "expected one embedding per input sentence"
print(embeddings.shape)

(1, 384)


Now embed the cleaned policy text column, storing the vectors in a new `embeddings` column of the same dataframe so that each embedding stays alongside its metadata.
* For now, the section subheadings are not included in the embedded text - **TODO**: explore whether including them helps.

In [16]:
# apply embeddings model to policy text column
# Passing the whole column to model.encode() in a single call lets the model
# batch the texts, instead of running one separate encode per row via
# Series.apply.
policy_texts = df['text_clean']
assert policy_texts.notna().all(), (
    "text_clean contains missing values; clean or drop them before encoding"
)

embedding_matrix = model.encode(policy_texts.tolist())  # one row per input text

# Store one 1-D vector per row so each embedding stays next to its metadata
df['embeddings'] = list(embedding_matrix)

# check results
# .iloc[0] selects the first row by position, so it does not depend on the
# index containing the label 0.
print(df['embeddings'].iloc[0].shape)  # embedding dimension


(384,)


In [16]:
# check results: display the dataframe to confirm the new 'embeddings' column
# sits alongside the existing metadata columns
df

Unnamed: 0,country,section,text_clean,embeddings
0,Austria,Purchase subsidies,federal purchase subsidy scheme e mobilität 20...,"[-0.049947806, 0.004338905, 0.05885577, 0.0036..."
1,Austria,Registration tax benefits,bevs are fully exempt from nova registration tax,"[-0.019519879, -0.0025194725, 0.06559204, -0.0..."
2,Austria,Ownership / Circulation Tax Benefits,1 april 2025 bevs are no longer exempt from mo...,"[-0.035666157, 0.0037750318, 0.10083223, 0.018..."
3,Austria,Company tax benefits,zero emission vehicles benefit from exemption ...,"[0.011076084, 0.08334179, 0.082944065, -0.0180..."
4,Austria,VAT benefits,companies may deduct vat fully for bevs priced...,"[0.018948326, 0.05301374, 0.10462659, -0.05205..."
...,...,...,...,...
275,United Kingdom,Registration tax benefits,vehicle excise duty ved,"[-0.08971297, 0.03890533, 0.012312598, -0.0413..."
276,United Kingdom,Ownership tax benefits,vehicle excise duty ved,"[-0.08971297, 0.03890533, 0.012312598, -0.0413..."
277,United Kingdom,Company tax benefits,benefit in kind bik tax first year capital all...,"[0.004469301, 0.083178654, 0.038855247, -0.049..."
278,United Kingdom,AF infrastructure incentives,electric vehicle homecharge scheme evhs provid...,"[0.020492207, 0.045277625, -0.047004666, -0.02..."


In [17]:
# save out DF with embeddings
# NOTE: to_csv writes each embedding array as its string representation, which
# has to be parsed back into numbers on load. The CSV is kept unchanged for any
# existing consumers; the .npy file written alongside it stores the same vectors
# as a numeric matrix (one row per dataframe row, in the same order).
df.to_csv(interim_path/"policy_embeddings.csv", index=False)
np.save(interim_path/"policy_embeddings.npy", np.vstack(df['embeddings'].to_list()))