In [4]:
import spacy
import pandas as pd
import json
from spacy.pipeline import EntityRuler

In [5]:
# Load the raw city table from a CSV file in the working directory.
# NOTE(review): the frame displayed further down has an "Unnamed: 0" column, which
# suggests the CSV carries a saved index -- consider index_col=0; confirm against the file.
df = pd.read_csv("Indian Cities Geo Data.csv")

In [7]:
# Remove the literal " Latitude and Longitude" text from every Location value
# (regex=False means plain substring replacement) and store the result in a new column.
df["Location_cleaned"] = df["Location"].str.replace(" Latitude and Longitude", "", regex=False)

In [8]:
# Trim leading and trailing whitespace from the cleaned location names.
df["Location_cleaned"] = df["Location_cleaned"].str.strip()

In [9]:
# One entity-ruler pattern per cleaned location name, every one labelled GPE.
patterns = [
    {"label": "GPE", "pattern": location_name}
    for location_name in df["Location_cleaned"]
]

In [48]:
# Save the patterns as pretty-printed JSON; ensure_ascii=False keeps non-ASCII
# characters as written instead of \u-escaping them.
with open("indian_locations.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(patterns, ensure_ascii=False, indent=4))

In [6]:
# Load the "en_core_web_sm" spaCy pipeline by name.
nlp = spacy.load("en_core_web_sm")


In [7]:
# Register a rule-based entity matcher ahead of the pipeline's "ner" component.
# add_pipe raises when a component with this name is already in the pipeline,
# so reuse the existing ruler if this cell is re-run against the same `nlp` object.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

In [51]:
# Read the saved patterns back from disk.
# NOTE(review): `locations` is not referenced by any other cell visible here; the
# ruler is given the in-memory `patterns` list instead -- confirm which is intended.
with open("indian_locations.json", "r", encoding="utf-8") as f:
    locations = json.load(f)

In [8]:
# Register the city-level GPE patterns with the entity ruler.
ruler.add_patterns(patterns)

In [10]:
# GPE patterns for the states and union territories, generated from a plain
# tuple of names so that every entry shares the same {"label", "pattern"} shape.
states = [
    {"label": "GPE", "pattern": region_name}
    for region_name in (
        # States
        "Andhra Pradesh",
        "Arunachal Pradesh",
        "Assam",
        "Bihar",
        "Chhattisgarh",
        "Goa",
        "Gujarat",
        "Haryana",
        "Himachal Pradesh",
        "Jharkhand",
        "Karnataka",
        "Kerala",
        "Madhya Pradesh",
        "Maharashtra",
        "Manipur",
        "Meghalaya",
        "Mizoram",
        "Nagaland",
        "Odisha",
        "Punjab",
        "Rajasthan",
        "Sikkim",
        "Tamil Nadu",
        "Telangana",
        "Tripura",
        "Uttar Pradesh",
        "Uttarakhand",
        "West Bengal",
        # Union Territories
        "Andaman and Nicobar Islands",
        "Chandigarh",
        "Dadra and Nagar Haveli and Daman and Diu",
        "Delhi",
        "Jammu and Kashmir",
        "Ladakh",
        "Lakshadweep",
        "Puducherry",
    )
]


In [10]:
# Register the state and union-territory GPE patterns with the entity ruler.
ruler.add_patterns(states)

In [33]:
# Run the pipeline on a sample sentence that mentions three locations.
doc = nlp("i am from Khordha,Bhubaneshwar , Odisha")

In [36]:
# Show the text and label of every entity found in the sample sentence.
for ent in doc.ents:
    print(ent.text, ent.label_)

Khordha GPE
Bhubaneshwar GPE
Odisha GPE


In [13]:
# Save the customised pipeline to the "custom_nlp" directory.
nlp.to_disk("custom_nlp")

### Fuzzy matching implementation (RapidFuzz)


In [11]:
# Plain list of the cleaned location names, used as the candidate pool for fuzzy matching.
new_patt = list(df["Location_cleaned"])

In [12]:
# Add the state / union-territory names to the fuzzy-matching candidates.
# Names already in the list are skipped, so re-running this cell does not keep
# appending the same names, and a name that is both a city and a state or
# territory is not added a second time.
already_listed = set(new_patt)
for entry in states:
    name = entry["pattern"]
    if name not in already_listed:
        new_patt.append(name)
        already_listed.add(name)

In [13]:
# Dump the complete candidate list.
# NOTE(review): this prints every name; showing len(new_patt) and new_patt[:10]
# would keep the notebook output readable.
print(new_patt)

['Bamboo Flat', 'Nicobar', 'Port Blair', 'South Andaman', 'Addanki', 'Adoni', 'Akasahebpet', 'Akividu', 'Akkarampalle', 'Amalapuram', 'Amudalavalasa', 'Anakapalle', 'Anantapur', 'Atmakur', 'Attili', 'Avanigadda', 'Badvel', 'Banganapalle', 'Bapatla', 'Betamcherla', 'Bhattiprolu', 'Bhimavaram', 'Bhimunipatnam', 'Bobbili', 'Challapalle', 'Chemmumiahpet', 'Chilakalurupet', 'Chinnachowk', 'Chipurupalle', 'Chirala', 'Chittoor', 'Chodavaram', 'Cuddapah', 'Cumbum', 'Darsi', 'Dharmavaram', 'Dhone', 'Diguvametta', 'East Godavari', 'Elamanchili', 'Ellore', 'Emmiganur', 'Erraguntla', 'Etikoppaka', 'Gajuwaka', 'Ganguvada', 'Gannavaram', 'Giddalur', 'Gokavaram', 'Gorantla', 'Govindapuram,Chilakaluripet,Guntur', 'Gudivada', 'Gudlavalleru', 'Gudur', 'Guntakal Junction', 'Guntur', 'Hindupur', 'Ichchapuram', 'Jaggayyapeta', 'Jammalamadugu', 'Kadiri', 'Kaikalur', 'Kakinada', 'Kalyandurg', 'Kamalapuram', 'Kandukur', 'Kanigiri', 'Kankipadu', 'Kanuru', 'Kavali', 'Kolanukonda', 'Kondapalle', 'Korukollu', 'Ko

In [14]:
pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from rapidfuzz import process,fuzz

In [40]:
# Query string to look up among the known location names.
user_input = "Koraput"
# Keep the 3 best-scoring candidates from new_patt, scored with fuzz.WRatio.
matches = process.extract(user_input, new_patt , limit = 3, scorer=fuzz.WRatio)

In [41]:
# Each result unpacks into three values; only the matched name and its score are shown.
# The throwaway is called `_unused` rather than `_`: binding `_` at the top level
# of a notebook overrides IPython's "last output" variable for the rest of the session.
for candidate, score, _unused in matches:
    print(candidate, score)

Koraput 100.0
Koratla 71.42857142857143
Korampallam 69.23076923076923


In [42]:
# Preview the first rows before dropping the columns that are no longer needed.
df.head()

Unnamed: 0,State,Location,Latitude,Longitude,Location_cleaned
0,Andaman and Nicobar Islands,Bamboo Flat Latitude and Longitude,11.7,92.71667,Bamboo Flat
1,Andaman and Nicobar Islands,Nicobar Latitude and Longitude,7.03002,93.79028,Nicobar
2,Andaman and Nicobar Islands,Port Blair Latitude and Longitude,11.66613,92.74635,Port Blair
3,Andaman and Nicobar Islands,South Andaman Latitude and Longitude,10.75776,92.52136,South Andaman
4,Andhra Pradesh,Addanki Latitude and Longitude,15.81061,79.97338,Addanki


In [44]:
# Drop the raw location text and the coordinates; the state -> cities mapping
# built below does not use them.
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed from `df`.
df = df.drop(columns=["Location", "Latitude", "Longitude"], errors="ignore")

In [45]:
# Preview the first rows after dropping the unused columns.
df.head()

Unnamed: 0,State,Location_cleaned
0,Andaman and Nicobar Islands,Bamboo Flat
1,Andaman and Nicobar Islands,Nicobar
2,Andaman and Nicobar Islands,Port Blair
3,Andaman and Nicobar Islands,South Andaman
4,Andhra Pradesh,Addanki


In [48]:
# Build a {state name: [city names]} mapping, keeping the row order of `df`.
# The column values are iterated directly instead of looking rows up with
# df["State"][x] for x in range(len(df)): that form is a label lookup and only
# works while the frame still has its default 0..n-1 index.
tempdict = {}
for raw_state, raw_city in zip(df["State"], df["Location_cleaned"]):
    state = raw_state.strip()
    city = raw_city.strip()
    # setdefault creates the state's list the first time the state is seen.
    tempdict.setdefault(state, []).append(city)
        

In [49]:
# Dump the whole state -> cities mapping.
# NOTE(review): this prints every state and city; printing a single state's
# list or the number of keys would keep the notebook output readable.
print(tempdict)

{'Andaman and Nicobar Islands': ['Bamboo Flat', 'Nicobar', 'Port Blair', 'South Andaman'], 'Andhra Pradesh': ['Addanki', 'Adoni', 'Akasahebpet', 'Akividu', 'Akkarampalle', 'Amalapuram', 'Amudalavalasa', 'Anakapalle', 'Anantapur', 'Atmakur', 'Attili', 'Avanigadda', 'Badvel', 'Banganapalle', 'Bapatla', 'Betamcherla', 'Bhattiprolu', 'Bhimavaram', 'Bhimunipatnam', 'Bobbili', 'Challapalle', 'Chemmumiahpet', 'Chilakalurupet', 'Chinnachowk', 'Chipurupalle', 'Chirala', 'Chittoor', 'Chodavaram', 'Cuddapah', 'Cumbum', 'Darsi', 'Dharmavaram', 'Dhone', 'Diguvametta', 'East Godavari', 'Elamanchili', 'Ellore', 'Emmiganur', 'Erraguntla', 'Etikoppaka', 'Gajuwaka', 'Ganguvada', 'Gannavaram', 'Giddalur', 'Gokavaram', 'Gorantla', 'Govindapuram,Chilakaluripet,Guntur', 'Gudivada', 'Gudlavalleru', 'Gudur', 'Guntakal Junction', 'Guntur', 'Hindupur', 'Ichchapuram', 'Jaggayyapeta', 'Jammalamadugu', 'Kadiri', 'Kaikalur', 'Kakinada', 'Kalyandurg', 'Kamalapuram', 'Kandukur', 'Kanigiri', 'Kankipadu', 'Kanuru', 'Ka

In [50]:
# Serialise the state -> cities mapping as pretty-printed JSON text.
# NOTE(review): ensure_ascii is left at its default here, whereas the earlier
# patterns dump passes ensure_ascii=False -- confirm the difference is intended.
json_data = json.dumps(tempdict, indent=4)

In [51]:
# Display the JSON text (shown as a Python string repr, so newlines appear as \n).
json_data

'{\n    "Andaman and Nicobar Islands": [\n        "Bamboo Flat",\n        "Nicobar",\n        "Port Blair",\n        "South Andaman"\n    ],\n    "Andhra Pradesh": [\n        "Addanki",\n        "Adoni",\n        "Akasahebpet",\n        "Akividu",\n        "Akkarampalle",\n        "Amalapuram",\n        "Amudalavalasa",\n        "Anakapalle",\n        "Anantapur",\n        "Atmakur",\n        "Attili",\n        "Avanigadda",\n        "Badvel",\n        "Banganapalle",\n        "Bapatla",\n        "Betamcherla",\n        "Bhattiprolu",\n        "Bhimavaram",\n        "Bhimunipatnam",\n        "Bobbili",\n        "Challapalle",\n        "Chemmumiahpet",\n        "Chilakalurupet",\n        "Chinnachowk",\n        "Chipurupalle",\n        "Chirala",\n        "Chittoor",\n        "Chodavaram",\n        "Cuddapah",\n        "Cumbum",\n        "Darsi",\n        "Dharmavaram",\n        "Dhone",\n        "Diguvametta",\n        "East Godavari",\n        "Elamanchili",\n        "Ellore",\n      

In [52]:
# Write the state -> cities JSON text to disk.
# The encoding is given explicitly, matching the other file reads/writes in this
# notebook, so the output does not depend on the platform's default encoding.
with open("state_city.json", "w", encoding="utf-8") as f:
    f.write(json_data)