In [1]:
import json
import pandas as pd

# Load JSON file
with open("policies_from_seed.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert to DataFrame for easier viewing
df = pd.DataFrame(data)

df.head()


Unnamed: 0,categoryId,title,summary,eligibility,benefits,docUrl,state,tags,imageUrl,popularity
0,1,Pradhan Mantri Jan Dhan Yojana (PMJDY),Universal bank accounts and financial inclusion.,All unbanked citizens.,"Zero-balance accounts, RuPay card, overdraft.",https://pmjdy.gov.in/,National,"[finance, banking, inclusion]",https://images.unsplash.com/photo-1553729459-e...,497
1,1,Pradhan Mantri Ujjwala Yojana (PMUY),LPG connections for BPL households.,Women from BPL households.,Free LPG connection with subsidy.,https://www.pmuy.gov.in/,National,"[lpg, subsidy, women]",https://images.unsplash.com/photo-151971016423...,468
2,1,National Social Assistance Programme (NSAP),Pension support for vulnerable groups.,"Senior citizens, widows, persons with disabili...",Monthly pension transfers.,https://nsap.nic.in/,National,"[pension, dbt]",https://images.unsplash.com/photo-152630464058...,80
3,1,Stand Up India Scheme,Loans for SC/ST and women entrepreneurs.,SC/ST and women entrepreneurs.,Bank loans from 10-100 lakh.,https://www.standupmitra.in/,National,"[loan, msme, women]",https://images.unsplash.com/photo-1556157382-9...,476
4,1,Deen Dayal Upadhyaya Grameen Kaushalya Yojana ...,Skilling and placement for rural youth.,Rural youth 15-35.,Skill training and jobs.,https://ddugky.gov.in/,National,"[skill, rural, employment]",https://images.unsplash.com/photo-152933695312...,342


In [2]:
def combine_fields(row):
    text_parts = [
        str(row["title"]),
        str(row["summary"]),
        str(row["eligibility"]),
        str(row["benefits"]),
        " ".join(row["tags"]) if isinstance(row["tags"], list) else str(row["tags"])
    ]
    # Join everything into a single string
    return " ".join(text_parts)

df["full_text"] = df.apply(combine_fields, axis=1)

df[["title", "full_text"]].head()


Unnamed: 0,title,full_text
0,Pradhan Mantri Jan Dhan Yojana (PMJDY),Pradhan Mantri Jan Dhan Yojana (PMJDY) Univers...
1,Pradhan Mantri Ujjwala Yojana (PMUY),Pradhan Mantri Ujjwala Yojana (PMUY) LPG conne...
2,National Social Assistance Programme (NSAP),National Social Assistance Programme (NSAP) Pe...
3,Stand Up India Scheme,Stand Up India Scheme Loans for SC/ST and wome...
4,Deen Dayal Upadhyaya Grameen Kaushalya Yojana ...,Deen Dayal Upadhyaya Grameen Kaushalya Yojana ...


In [3]:
import re
import string

def clean_text(text):
    text = text.lower()                                            # lowercase
    text = re.sub(r'\s+', ' ', text)                               # remove extra spaces
    text = re.sub(r'http\S+', '', text)                            # remove URLs
    text = re.sub(r'\d+', '', text)                                # remove numbers (optional)
    text = text.translate(str.maketrans("", "", string.punctuation)) # remove punctuation
    return text.strip()

# Apply cleaning to full_text column
df["clean_text"] = df["full_text"].apply(clean_text)

df[["full_text", "clean_text"]].head()


Unnamed: 0,full_text,clean_text
0,Pradhan Mantri Jan Dhan Yojana (PMJDY) Univers...,pradhan mantri jan dhan yojana pmjdy universal...
1,Pradhan Mantri Ujjwala Yojana (PMUY) LPG conne...,pradhan mantri ujjwala yojana pmuy lpg connect...
2,National Social Assistance Programme (NSAP) Pe...,national social assistance programme nsap pens...
3,Stand Up India Scheme Loans for SC/ST and wome...,stand up india scheme loans for scst and women...
4,Deen Dayal Upadhyaya Grameen Kaushalya Yojana ...,deen dayal upadhyaya grameen kaushalya yojana ...


In [4]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ----------- ---------------------------- 3.4/12.0 MB 16.8 MB/s eta 0:00:01
   ------------------------------ --------- 9.2/12.0 MB 22.0 MB/s eta 0:00:01
   ---------------------------------------- 12.0/12.0 MB 19.3 MB/s  0:00:00
Downloading huggingfac


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from sentence_transformers import SentenceTransformer

# Load lightweight, fast model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for each policy
df["embedding"] = df["clean_text"].apply(lambda x: model.encode(x))


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
df["embedding"].head()


0    [-0.036563437, 0.006473959, -0.07031108, -0.02...
1    [-0.05222776, 0.01365079, -0.03511135, 0.02972...
2    [-0.03602484, 0.0010115533, -0.06702071, 0.015...
3    [0.00020903249, -0.03379309, -0.020338206, -0....
4    [-0.054050278, 0.0036310414, 0.0069049555, 0.0...
Name: embedding, dtype: object

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar_policies(query, top_n=5):
    # Convert query to embedding
    query_vec = model.encode(query)
    
    # Convert all embeddings to a matrix
    embeddings = np.vstack(df["embedding"].values)
    
    # Compute cosine similarity
    scores = cosine_similarity([query_vec], embeddings)[0]
    
    # Get top N results
    top_indices = scores.argsort()[::-1][:top_n]
    
    # Return dataframe slice
    return df.iloc[top_indices][["title", "summary", "eligibility", "benefits", "categoryId"]]


In [8]:
find_similar_policies("scholarship for students")


Unnamed: 0,title,summary,eligibility,benefits,categoryId
6,AICTE Pragati Scholarship for Girls,Scholarship for girls in technical education.,Girl students in AICTE approved institutions.,Tuition support and contingency.,2
320,State Post-Matric Scholarship - Uttar Pradesh,Scholarship for post-matric students (SC/ST/OB...,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,2
309,State Post-Matric Scholarship - Tripura,Scholarship for post-matric students (SC/ST/OB...,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,2
331,State Post-Matric Scholarship - Uttarakhand,Scholarship for post-matric students (SC/ST/OB...,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,2
56,State Post-Matric Scholarship - Bihar,Scholarship for post-matric students (SC/ST/OB...,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,2


In [9]:
find_similar_policies("financial help for widows")


Unnamed: 0,title,summary,eligibility,benefits,categoryId
341,Widow Pension Scheme - West Bengal,Monthly pension for widows in need.,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,1
319,Widow Pension Scheme - Uttar Pradesh,Monthly pension for widows in need.,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,1
187,Widow Pension Scheme - Maharashtra,Monthly pension for widows in need.,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,1
330,Widow Pension Scheme - Uttarakhand,Monthly pension for widows in need.,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,1
66,Widow Pension Scheme - Chhattisgarh,Monthly pension for widows in need.,As per state government notification. Resident...,Monthly pension / scholarship / insurance / se...,1
