In [1]:
import spacy
import pandas as pd
import json
from spacy.pipeline import EntityRuler

In [2]:
df = pd.read_csv("Indian Cities Geo Data.csv")

In [3]:
df["Location_cleaned"] = df["Location"].str.replace(" Latitude and Longitude", "", regex=False)

In [4]:
df["Location_cleaned"] = df["Location_cleaned"].str.strip()

In [5]:
patterns = []
for x in df["Location_cleaned"]:
    patterns.append({"label": "GPE", "pattern": x})

In [48]:
with open("indian_locations.json", "w", encoding="utf-8") as f:
    json.dump(patterns, f, ensure_ascii=False, indent=4)

In [6]:
nlp = spacy.load("en_core_web_sm")


In [7]:
ruler = nlp.add_pipe("entity_ruler", before="ner")

In [51]:
with open("indian_locations.json", "r", encoding="utf-8") as f:
    locations = json.load(f)

In [8]:
ruler.add_patterns(patterns)

In [9]:
states = [
    # States
    {"label": "GPE", "pattern": "Andhra Pradesh"},
    {"label": "GPE", "pattern": "Arunachal Pradesh"},
    {"label": "GPE", "pattern": "Assam"},
    {"label": "GPE", "pattern": "Bihar"},
    {"label": "GPE", "pattern": "Chhattisgarh"},
    {"label": "GPE", "pattern": "Goa"},
    {"label": "GPE", "pattern": "Gujarat"},
    {"label": "GPE", "pattern": "Haryana"},
    {"label": "GPE", "pattern": "Himachal Pradesh"},
    {"label": "GPE", "pattern": "Jharkhand"},
    {"label": "GPE", "pattern": "Karnataka"},
    {"label": "GPE", "pattern": "Kerala"},
    {"label": "GPE", "pattern": "Madhya Pradesh"},
    {"label": "GPE", "pattern": "Maharashtra"},
    {"label": "GPE", "pattern": "Manipur"},
    {"label": "GPE", "pattern": "Meghalaya"},
    {"label": "GPE", "pattern": "Mizoram"},
    {"label": "GPE", "pattern": "Nagaland"},
    {"label": "GPE", "pattern": "Odisha"},
    {"label": "GPE", "pattern": "Punjab"},
    {"label": "GPE", "pattern": "Rajasthan"},
    {"label": "GPE", "pattern": "Sikkim"},
    {"label": "GPE", "pattern": "Tamil Nadu"},
    {"label": "GPE", "pattern": "Telangana"},
    {"label": "GPE", "pattern": "Tripura"},
    {"label": "GPE", "pattern": "Uttar Pradesh"},
    {"label": "GPE", "pattern": "Uttarakhand"},
    {"label": "GPE", "pattern": "West Bengal"},

    # Union Territories
    {"label": "GPE", "pattern": "Andaman and Nicobar Islands"},
    {"label": "GPE", "pattern": "Chandigarh"},
    {"label": "GPE", "pattern": "Dadra and Nagar Haveli and Daman and Diu"},
    {"label": "GPE", "pattern": "Delhi"},
    {"label": "GPE", "pattern": "Jammu and Kashmir"},
    {"label": "GPE", "pattern": "Ladakh"},
    {"label": "GPE", "pattern": "Lakshadweep"},
    {"label": "GPE", "pattern": "Puducherry"}
]


In [10]:
ruler.add_patterns(states)

In [33]:
doc = nlp("i am from Khordha,Bhubaneshwar , Odisha")

In [36]:
for x in doc.ents:
    print(x.text , x.label_)

Khordha GPE
Bhubaneshwar GPE
Odisha GPE


In [13]:
nlp.to_disk("custom_nlp")

### Fuzzywuzzy Implementation


In [15]:
new_patt = list(df["Location_cleaned"])

In [17]:
for x in states:
    new_patt.append(x["pattern"])

In [18]:
print(new_patt)

['Bamboo Flat', 'Nicobar', 'Port Blair', 'South Andaman', 'Addanki', 'Adoni', 'Akasahebpet', 'Akividu', 'Akkarampalle', 'Amalapuram', 'Amudalavalasa', 'Anakapalle', 'Anantapur', 'Atmakur', 'Attili', 'Avanigadda', 'Badvel', 'Banganapalle', 'Bapatla', 'Betamcherla', 'Bhattiprolu', 'Bhimavaram', 'Bhimunipatnam', 'Bobbili', 'Challapalle', 'Chemmumiahpet', 'Chilakalurupet', 'Chinnachowk', 'Chipurupalle', 'Chirala', 'Chittoor', 'Chodavaram', 'Cuddapah', 'Cumbum', 'Darsi', 'Dharmavaram', 'Dhone', 'Diguvametta', 'East Godavari', 'Elamanchili', 'Ellore', 'Emmiganur', 'Erraguntla', 'Etikoppaka', 'Gajuwaka', 'Ganguvada', 'Gannavaram', 'Giddalur', 'Gokavaram', 'Gorantla', 'Govindapuram,Chilakaluripet,Guntur', 'Gudivada', 'Gudlavalleru', 'Gudur', 'Guntakal Junction', 'Guntur', 'Hindupur', 'Ichchapuram', 'Jaggayyapeta', 'Jammalamadugu', 'Kadiri', 'Kaikalur', 'Kakinada', 'Kalyandurg', 'Kamalapuram', 'Kandukur', 'Kanigiri', 'Kankipadu', 'Kanuru', 'Kavali', 'Kolanukonda', 'Kondapalle', 'Korukollu', 'Ko

In [40]:
pip install rapidfuzz

Defaulting to user installation because normal site-packages is not writeable
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------------------- -------------- 1.0/1.6 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 4.3 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
from rapidfuzz import process,fuzz

In [54]:
user_input = "whitefield"
matches = process.extract(user_input ,new_patt,limit = 3,scorer=fuzz.WRatio)


In [55]:
for match, score, _ in matches:
    print(match, score)

Vite 67.5
Belda 67.5
Petlad 60.00000000000001
