# Clean Policy Incentives Data to Prepare For Vectorisation
* Uses `policy_incentives_subsectioned.csv`, created in script 1a
* Cleans and tokenises the text using the spaCy package

In [13]:
import configparser
import pandas as pd
import numpy as np
import os
import time
import spacy
from spacy import displacy
import string
from pathlib import Path

In [14]:
# Load spaCy's small English pipeline once; it is reused for every row below.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Load the project's data directories from config.ini (path is relative to the
# current working directory).
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail loudly here instead of with an opaque
# KeyError on config["default"] below.
if not config.read("config.ini"):
    raise FileNotFoundError(
        "config.ini could not be read from the current working directory"
    )

# access values
default_section = config["default"]
raw_path = Path(default_section["raw_path"])
interim_path = Path(default_section["interim_path"])
processed_path = Path(default_section["processed_path"])

Read in the data and ensure it is stored as strings

In [23]:
# Read the sub-sectioned policy text, drop rows with no text (rather than
# filling them with ""), and make sure the text column really is stored as
# strings. DataFrame.astype returns a new frame, so its result must be assigned.
# Only "text" is cast: a blanket astype(str) would turn any NaN in the other
# columns into the literal string "nan".
df = (
    pd.read_csv(interim_path / "policy_incentives_subsectioned.csv")
    .dropna(subset=["text"])
    .astype({"text": str})
)

df



Unnamed: 0,country,section,text
0,Austria,Purchase subsidies,The federal purchase subsidy scheme “E-Mobilit...
1,Austria,Registration tax benefits,BEVs are fully exempt from the NoVA registrati...
2,Austria,Ownership / Circulation Tax Benefits,"As of 1 April 2025, BEVs are no longer exempt ..."
3,Austria,Company tax benefits,Zero-emission vehicles benefit from exemption ...
4,Austria,VAT benefits,Companies may deduct VAT fully for BEVs priced...
...,...,...,...
312,United Kingdom,Registration tax benefits,Vehicle Excise Duty (VED):
313,United Kingdom,Ownership tax benefits,Vehicle Excise Duty (VED):
314,United Kingdom,Company tax benefits,Benefit-in-Kind (BiK) Tax: First-Year Capital ...
315,United Kingdom,AF infrastructure incentives,'Electric Vehicle Homecharge Scheme (EVHS): Pr...


In [15]:

# Cleaning plan: tokenise, lower-case, and experiment with removing stop words.
#     Numbers are kept and lemmatisation is skipped - tense will be important here.

# Custom stop-word list: strips filler words only, so that policy-relevant words
# (such as from, per, until...) are kept in the cleaned text.
custom_stopwords = {
    # articles
    "the", "a", "an",
    # conjunctions
    "and", "or",
    # connectives
    "of", "as",
    # demonstratives / relatives
    "this", "that", "these", "those", "which", "such",
    # abbreviations
    "etc",
}

# define function to clean & preprocess one spaCy Doc
def preprocess_text(spacy_doc: spacy.tokens.doc.Doc) -> str:
    """Return the Doc's tokens lower-cased and joined by single spaces.

    Whitespace tokens, punctuation tokens, and tokens whose lower-cased text
    appears in ``custom_stopwords`` are dropped. Nothing else is filtered, so
    numbers are kept, and tokens are not lemmatised.

    Args:
        spacy_doc: A tokenised spaCy document.

    Returns:
        The surviving tokens, lower-cased, separated by a single space.
    """
    kept_tokens = []
    for token in spacy_doc:
        # Skip pure whitespace and punctuation before doing any string work.
        if token.is_space or token.is_punct:
            continue
        lowered = token.text.lower()
        if lowered in custom_stopwords:
            continue
        kept_tokens.append(lowered)
    return " ".join(kept_tokens)


In [26]:
# apply cleaning to each row
# preprocess_text only reads token text and the lexical is_space / is_punct
# flags, which the tokeniser alone provides. nlp.make_doc runs just the
# tokeniser, so the remaining pipeline components are not executed on every row.
df["text_clean"] = df["text"].apply(
    lambda text: preprocess_text(nlp.make_doc(text))
)

# quick check
print(df[["text", "text_clean"]].head())

                                                text  \
0  The federal purchase subsidy scheme “E-Mobilit...   
1  BEVs are fully exempt from the NoVA registrati...   
2  As of 1 April 2025, BEVs are no longer exempt ...   
3  Zero-emission vehicles benefit from exemption ...   
4  Companies may deduct VAT fully for BEVs priced...   

                                          text_clean  
0  federal purchase subsidy scheme e mobilität 20...  
1   bevs are fully exempt from nova registration tax  
2  1 april 2025 bevs are no longer exempt from mo...  
3  zero emission vehicles benefit from exemption ...  
4  companies may deduct vat fully for bevs priced...  


In [30]:
# Inspect the frame: the original "text" column next to the new "text_clean" column.
df

Unnamed: 0,country,section,text,text_clean
0,Austria,Purchase subsidies,The federal purchase subsidy scheme “E-Mobilit...,federal purchase subsidy scheme e mobilität 20...
1,Austria,Registration tax benefits,BEVs are fully exempt from the NoVA registrati...,bevs are fully exempt from nova registration tax
2,Austria,Ownership / Circulation Tax Benefits,"As of 1 April 2025, BEVs are no longer exempt ...",1 april 2025 bevs are no longer exempt from mo...
3,Austria,Company tax benefits,Zero-emission vehicles benefit from exemption ...,zero emission vehicles benefit from exemption ...
4,Austria,VAT benefits,Companies may deduct VAT fully for BEVs priced...,companies may deduct vat fully for bevs priced...
...,...,...,...,...
312,United Kingdom,Registration tax benefits,Vehicle Excise Duty (VED):,vehicle excise duty ved
313,United Kingdom,Ownership tax benefits,Vehicle Excise Duty (VED):,vehicle excise duty ved
314,United Kingdom,Company tax benefits,Benefit-in-Kind (BiK) Tax: First-Year Capital ...,benefit in kind bik tax first year capital all...
315,United Kingdom,AF infrastructure incentives,'Electric Vehicle Homecharge Scheme (EVHS): Pr...,electric vehicle homecharge scheme evhs provid...


In [33]:
# Save the cleaned frame: the raw "text" column is dropped so that only
# "text_clean" is written alongside the remaining columns.
# NOTE(review): to_csv writes the index as an additional unnamed column on top of
# the existing "Unnamed: 0" column - confirm downstream scripts expect that.
output_file = interim_path / "tokenised_policy_incentives_subsectioned.csv"
df_cleaned = df.drop(columns=["text"])
df_cleaned.to_csv(output_file)