In [None]:
# import necessary libraries
import json
import pandas as pd


In [3]:
import json
import re
from sentence_transformers import SentenceTransformer, util

# Load the column-name data, keyed by year string (e.g. data['2018']).
# The encoding is given explicitly so the read does not depend on the
# platform's locale default (JSON interchange files are UTF-8).
with open('assets/column_names.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preprocess text
def preprocess(text):
    """Normalise a text string before it is embedded.

    Steps, applied in this order:
      1. delete every run of digits,
      2. replace every run of non-word characters with a single space,
      3. lowercase the result.

    Leading/trailing spaces produced by step 2 are kept as-is.
    """
    without_digits = re.sub(r'\d+', '', text)
    spaced = re.sub(r'\W+', ' ', without_digits)
    return spaced.lower()

# Instantiate the sentence-embedding model once at module level.
# No embeddings are computed here; create_mappings() calls model.encode()
# on this shared instance.
# NOTE(review): presumably the named checkpoint is fetched/cached by
# sentence_transformers on first use — confirm for offline runs.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [13]:
# Peek at the first (index, item) pair yielded by data['2018'] to check its shape.
# Nothing is printed when data['2018'] is empty.
first_entry = next(enumerate(data['2018']), None)
if first_entry is not None:
    print(first_entry)

(0, 'Summary Data')


In [None]:

# Create mappings
def create_mappings(data, year1, year2):
    """Map each entry of ``data[year1]`` to its most similar entry in ``data[year2]``.

    Both sets of entries are normalised with ``preprocess``, embedded with the
    module-level ``model`` and compared by cosine similarity
    (``util.cos_sim``). Each ``year1`` entry is paired with the ``year2``
    entry that scores highest; no minimum score is enforced, so every
    ``year1`` entry gets a match whenever at least one candidate exists.

    Args:
        data: Container indexed by year key; each value is an iterable of
            text entries.
        year1: Key of the source entries (they become the result's keys).
        year2: Key of the candidate entries (they become the result's values).

    Returns:
        dict: ``{year1_entry: best_matching_year2_entry}`` using the original,
        un-preprocessed strings. Empty when either year has no entries.
        Duplicate ``year1`` entries collapse into a single key.

    Raises:
        KeyError: Propagated from the ``data[...]`` lookup when ``data`` is a
            dict and a year key is missing.
    """
    # Materialise as lists so the positional lookup below works for any iterable.
    # NOTE(review): if data[year] is itself a dict, only its keys are compared —
    # confirm the JSON layout and flatten beforehand if that is not intended.
    questions_year1 = list(data[year1])
    questions_year2 = list(data[year2])

    # Nothing to map from, or nothing to map to: an argmax over an empty row
    # would raise, so return an empty mapping instead.
    if not questions_year1 or not questions_year2:
        return {}

    embeddings_year1 = model.encode([preprocess(q) for q in questions_year1])
    embeddings_year2 = model.encode([preprocess(q) for q in questions_year2])

    # Rows correspond to year1 entries, columns to year2 entries.
    similarity_matrix = util.cos_sim(embeddings_year1, embeddings_year2)

    # One argmax across all rows, converted to plain ints so the list lookup
    # does not rely on indexing with a 0-dim tensor.
    best_indices = similarity_matrix.argmax(dim=1).tolist()

    return {
        question: questions_year2[best_idx]
        for question, best_idx in zip(questions_year1, best_indices)
    }


In [None]:

# Write one mapping file per pair of consecutive years
# (2018->2019, 2019->2020, ...), each into the current working directory.
years = ['2018', '2019', '2020', '2021', '2022']
for year1, year2 in zip(years, years[1:]):
    mappings = create_mappings(data, year1, year2)

    output_name = f'{year1}-{year2} mapping.json'
    with open(output_name, 'w') as outfile:
        json.dump(mappings, outfile, indent=4)
