In [27]:
import json
import re
from sentence_transformers import SentenceTransformer, util

# Load the contents of assets/column_names.json into `data`.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('assets/column_names.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preprocess text
def preprocess(text):
    """Normalise a string before it is embedded.

    Steps, applied in this order:
      1. every run of digits is deleted;
      2. every run of non-word characters is replaced by a single space;
      3. the result is lower-cased.

    Underscores count as word characters for the regex engine, so they are
    kept. Leading/trailing spaces produced by step 2 are not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    return re.sub(r'\W+', ' ', re.sub(r'\d+', '', text)).lower()

# Load the sentence-embedding model once; the cells below use it through the
# module-level name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')




In [25]:

# Create mappings
def create_mappings(data, year1, year2):
    """Pair every question of ``year1`` with its most similar ``year2`` question.

    Only sections present under both years are considered, and within a
    section only the entries whose name starts with the section name. The
    names are normalised with ``preprocess``, embedded with the module-level
    ``model``, and compared by cosine similarity.

    Each ``year1`` question is assigned the ``year2`` question with the
    highest similarity. There is no minimum-similarity cut-off, and several
    ``year1`` questions may be mapped to the same ``year2`` question.

    Args:
        data: Indexed first by year, then by section; each section is iterated
            to obtain question names.
            (Presumably a nested dict of lists of strings -- TODO confirm
            against assets/column_names.json.)
        year1: Key of the year whose questions are mapped from.
        year2: Key of the year whose questions are mapped to.

    Returns:
        dict: ``{section: {year1_question: year2_question}}``. Sections that
        are missing from ``year2``, or that have no matching question on
        either side, are left out.
    """
    result = {}

    for section in data[year1]:
        # A section missing from the second year has nothing to map onto.
        if section not in data[year2]:
            continue

        source_questions = [name for name in data[year1][section] if name.startswith(section)]
        target_questions = [name for name in data[year2][section] if name.startswith(section)]

        source_texts = [preprocess(name) for name in source_questions]
        target_texts = [preprocess(name) for name in target_questions]

        # At least one question is required on each side to compare anything.
        if not (source_texts and target_texts):
            continue

        # Pairwise cosine similarities; indexed below by year1 question
        # position, with argmax selecting the year2 question position.
        scores = util.cos_sim(model.encode(source_texts), model.encode(target_texts))

        result[section] = {
            question: target_questions[scores[row].argmax()]
            for row, question in enumerate(source_questions)
        }

    return result


In [26]:

# Build one mapping file per pair of consecutive years (2018->2019, 2019->2020, ...).
years = ['2018', '2019', '2020', '2021', '2022']
for year1, year2 in zip(years, years[1:]):
    mappings = create_mappings(data, year1, year2)

    # Written to the current working directory, one JSON file per year pair.
    with open(f'{year1}-{year2} mapping.json', 'w') as outfile:
        json.dump(mappings, outfile, indent=4)

# Emit the confirmation recorded in this cell's saved output, so that a fresh
# Restart & Run All reproduces it.
print('Mappings created successfully.')


Mappings created successfully.
