In [1]:
import math

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


# Exploratory Analysis & PreProcessing

In [2]:
# Load the raw table of addresses with their latitude/longitude values.
DATA_PATH = 'data.csv'
df_train = pd.read_csv(DATA_PATH)

In [3]:
# Sanity check on the size of the loaded table before any preprocessing.
df_train.shape # (number of rows, number of columns) of our training set

(10000, 3)

In [4]:
# Preview the first rows to see what the raw columns look like.
df_train.head()

Unnamed: 0,address,latitude,longitude
0,"JAIPUR,H.NO.- 408, LAVENDER MANGALAM, ANAND NA...",71.052021,11.396546
1,"C-341-A,Malviya nagar,302017",70.516921,9.85515
2,"J-3A, Khandal Hostel, Jhalana Doongri,,Jaipur,...",70.430068,9.222819
3,846 Rani Sati Nagar Janpath Lane No. 10 Aj...,71.333194,8.762032
4,"12/132 , sector 12,girdhar marg, malviya nagar...",70.49008,10.209948


In [5]:
# Break every address into its comma-separated fragments and keep the
# resulting list in its own column for later cells to pick pieces from.
address_fragments = df_train['address'].str.split(',')
df_train['Comma_delimitted_address'] = address_fragments

In [6]:
# Create the 'zip' column. At this point it just holds the same per-row
# fragment lists as 'Comma_delimitted_address'; the next cell overwrites
# each entry with the actual zip code.
df_train['zip']=df_train['Comma_delimitted_address']

In [7]:
# The zip code is the last comma-separated fragment of each address.
# Whole-column assignment replaces the per-row chained indexing
# (df['zip'][n] = ...), which does not reliably write back to the frame,
# and removes the hard-coded row count of 10000.
df_train['zip'] = df_train['Comma_delimitted_address'].str[-1]

# Lower-case the addresses so later token comparisons ignore letter case.
# 'Comma_delimitted_address' was derived earlier and keeps the original case.
df_train['address'] = df_train['address'].str.lower()

In [8]:
# Simple exploratory analysis: summary statistics of the numeric columns,
# computed separately for every zip code found in the addresses.
zip_groups = df_train.groupby('zip')
df = zip_groups.describe()
df

Unnamed: 0_level_0,latitude,latitude,latitude,latitude,latitude,latitude,latitude,latitude,longitude,longitude,longitude,longitude,longitude,longitude,longitude,longitude
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
zip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
302001,227.0,70.563646,0.169794,70.14506,70.457821,70.50882,70.678291,71.278342,227.0,7.499307,0.453884,5.733055,7.335641,7.500807,7.627125,10.90016
302002,200.0,70.244976,0.116888,70.04176,70.156119,70.262491,70.321249,71.077559,200.0,6.794372,0.326928,5.963971,6.638705,6.793148,6.94277,8.713631
302003,136.0,70.244915,0.111343,70.014251,70.138482,70.247168,70.323494,70.868688,136.0,7.535019,0.204884,7.259255,7.409824,7.488543,7.550351,8.497583
302004,576.0,70.243602,0.08402,70.067955,70.184189,70.238455,70.292628,70.724412,576.0,8.368404,0.268464,7.666205,8.187441,8.356049,8.5251,9.321617
302005,51.0,70.705687,0.037339,70.615241,70.680351,70.705253,70.733307,70.785151,51.0,8.491171,0.204339,7.344134,8.413764,8.506746,8.582362,8.903108
302006,326.0,70.889371,0.116174,70.635735,70.806431,70.87013,70.966213,71.530073,326.0,8.053026,0.376979,7.147274,7.851548,8.066003,8.333253,8.907539
302007,2.0,70.591596,0.451391,70.272414,70.432005,70.591596,70.751187,70.910777,2.0,8.097628,0.132911,8.003646,8.050637,8.097628,8.144619,8.19161
302011,7.0,70.660223,0.321424,70.038364,70.595341,70.742748,70.791978,71.065814,7.0,11.02241,0.364164,10.500514,10.759103,11.160572,11.283139,11.411296
302012,959.0,71.369186,0.217888,70.756466,71.2332,71.337373,71.43541,72.515009,959.0,6.497777,0.542624,3.800507,6.155461,6.591599,6.820019,8.119502
302013,189.0,70.980195,0.142525,70.649209,70.908318,70.985594,71.075931,71.339939,189.0,5.09291,0.448873,3.952547,4.819912,5.136315,5.413151,6.488565


# Tokenizing the addresses

In [9]:
# Customised stop-words that occur in a wide range of addresses. If they are
# not removed they can skew the Jaccard similarity between two unrelated
# addresses significantly. The list is by no means exhaustive; it is only an
# attempt to showcase what needs to be done.
exceptions = [',', 'jaipur', 'rajasthan', 'near', 'road', 'apartment', 'adjacent', 'sector']

# Tokenize every (already lower-cased) address and keep the set of distinct
# tokens minus the stop-words. The stop-word list is built once instead of on
# every iteration, and the column is assigned in one step rather than through
# per-row chained indexing over a hard-coded range(10000).
df_train['tokens'] = df_train['address'].apply(
    lambda address_text: set(nltk.word_tokenize(address_text)).difference(exceptions)
)

In [10]:
# Preview the first rows again to inspect the columns added by the cells above.
df_train.head()

Unnamed: 0,address,latitude,longitude,Comma_delimitted_address,zip,tokens
0,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.052021,11.396546,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ..."
1,"c-341-a,malviya nagar,302017",70.516921,9.85515,"[C-341-A, Malviya nagar, 302017]",302017,"{nagar,302017, malviya, c-341-a}"
2,"j-3a, khandal hostel, jhalana doongri,,jaipur,...",70.430068,9.222819,"[J-3A, Khandal Hostel, Jhalana Doongri, , Ja...",302004,"{doongri, khandal, j-3a, ,jaipur,302004, hoste..."
3,846 rani sati nagar janpath lane no. 10 aj...,71.333194,8.762032,[846 Rani Sati Nagar Janpath Lane No. 10 A...,302019,"{lane, janpath, india,302019,302019, ., nagar,..."
4,"12/132 , sector 12,girdhar marg, malviya nagar...",70.49008,10.209948,"[12/132 , sector 12, girdhar marg, malviya n...",302017,"{girdhar, marg, nagar,302017, 12, 12/132, malv..."


# Calculating Jaccard Score within the same zipcode subgroup (Test Case used here is Zip Code 302029)

**The Jaccard score is a way to measure the similarity of two strings. It can be summarized as the intersection over union of their token sets: $J(A, B) = |A \cap B| / |A \cup B|$.**


**A higher score corresponds to a higher similarity between the two strings.**

In [11]:
# Test case: the sub-group of addresses whose zip code is 302029.
# .copy() makes df_temp an independent frame, so adding a column below does
# not operate on a slice of df_train.
df_temp = df_train[df_train['zip'] == '302029'].copy()

# Placeholder column; the next cell fills it with Jaccard scores.
df_temp['similarity'] = df_temp['zip']

In [12]:
# Jaccard score of element '0' w.r.t. every element with the same zip code:
# size of the token-set intersection divided by the size of the union.
reference_tokens = df_temp['tokens'].iloc[0]

similarity_scores = []
for row_tokens in df_temp['tokens']:
    union_size = len(reference_tokens | row_tokens)
    shared_count = len(reference_tokens & row_tokens)
    # Two empty token sets have an empty union; score them 0.0 instead of
    # dividing by zero.
    similarity_scores.append(shared_count / union_size if union_size else 0.0)

# Assign the whole column at once: per-row chained assignment
# (df_temp['similarity'].iloc[i] = ...) does not reliably write to the frame.
df_temp['similarity'] = similarity_scores

In [13]:
# Show only the ten most similar addresses in the sub-group rather than
# rendering the whole sorted frame.
df_temp.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,address,latitude,longitude,Comma_delimitted_address,zip,tokens,similarity
0,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.052021,11.396546,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1
5201,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.051969,11.398205,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1
7369,"rose g-8 , mangalam aananda, opposite sanganer...",71.058382,11.359776,"[Rose G-8 , mangalam Aananda, opposite sanga...",302029,"{jaipur,302029, mangalam, rose, railway, sanga...",0.210526
7974,"rose g-8 , mangalam aananda, opposite sanganer...",71.062842,11.351647,"[Rose G-8 , mangalam Aananda, opposite sanga...",302029,"{jaipur,302029, mangalam, rose, railway, sanga...",0.210526
8244,"41 sunder nagar sanganer railway station, ke s...",70.999621,11.356692,"[41 sunder nagar sanganer railway station, ke...",302029,"{ke, nagar, temple, jaipur,302029, railway, sa...",0.2
5702,"flat 306, block iris, mangalam ananda,near san...",71.045985,11.359348,"[Flat 306, block iris, mangalam ananda, Near...",302029,"{mangalam, ,near, ananda, block, iris, railway...",0.190476
2250,"bright cotton b4, khatri nagar, near sanganer ...",70.892179,10.845294,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818
8660,"bright cotton b4, khatri nagar, near sanganer ...",70.891893,10.846765,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818
6491,"bright cotton b4, khatri nagar, near sanganer ...",70.892897,10.848949,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818
4041,"bright cotton b4, khatri nagar, near sanganer ...",70.892786,10.848944,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818


# Calculating the Jaccard score for test-case element '0' w.r.t. all other addresses, and the relative distance from each of these points

**The distance metric is calculated here assuming the lat/long coordinates are analogous to coordinates in a Euclidean space. This is done only because these latitudes/longitudes are encoded and don't reflect true positions on the globe, so a Euclidean system should sufficiently capture the crux of the relationship (nearness/farness).**

In [14]:
# Work on a real copy: plain assignment (df_copy = df_train) only creates a
# second name for the same frame, so the columns added below would also
# appear on df_train.
df_copy = df_train.copy()

# Row 0 is the reference address that every other row is compared against.
reference_tokens = df_copy['tokens'].iloc[0]
reference_latitude = df_copy['latitude'].iloc[0]
reference_longitude = df_copy['longitude'].iloc[0]

# Jaccard score: size of the token-set intersection over size of the union.
similarity_scores = []
for row_tokens in df_copy['tokens']:
    union_size = len(reference_tokens | row_tokens)
    shared_count = len(reference_tokens & row_tokens)
    # Two empty token sets have an empty union; score them 0.0 instead of
    # dividing by zero.
    similarity_scores.append(shared_count / union_size if union_size else 0.0)
df_copy['similarity'] = similarity_scores

# Euclidean distance from the reference row in the (latitude, longitude)
# plane, computed for all rows at once instead of row by row.
df_copy['distance'] = np.sqrt(
    (df_copy['latitude'] - reference_latitude) ** 2
    + (df_copy['longitude'] - reference_longitude) ** 2
)
    

## Below, the findings are sorted by similarity score and by the distance metric

In [15]:
# Ten addresses most similar to element '0'; limiting the output avoids
# rendering the whole sorted frame.
df_copy.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,address,latitude,longitude,Comma_delimitted_address,zip,tokens,similarity,distance
0,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.052021,11.396546,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1,0
5201,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.051969,11.398205,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1,0.00165996
7974,"rose g-8 , mangalam aananda, opposite sanganer...",71.062842,11.351647,"[Rose G-8 , mangalam Aananda, opposite sanga...",302029,"{jaipur,302029, mangalam, rose, railway, sanga...",0.210526,0.0461843
7369,"rose g-8 , mangalam aananda, opposite sanganer...",71.058382,11.359776,"[Rose G-8 , mangalam Aananda, opposite sanga...",302029,"{jaipur,302029, mangalam, rose, railway, sanga...",0.210526,0.0373161
8244,"41 sunder nagar sanganer railway station, ke s...",70.999621,11.356692,"[41 sunder nagar sanganer railway station, ke...",302029,"{ke, nagar, temple, jaipur,302029, railway, sa...",0.2,0.065834
5702,"flat 306, block iris, mangalam ananda,near san...",71.045985,11.359348,"[Flat 306, block iris, mangalam ananda, Near...",302029,"{mangalam, ,near, ananda, block, iris, railway...",0.190476,0.0376848
6491,"bright cotton b4, khatri nagar, near sanganer ...",70.892897,10.848949,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818,0.570248
8660,"bright cotton b4, khatri nagar, near sanganer ...",70.891893,10.846765,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818,0.572625
4248,"bright cotton b4, khatri nagar, near sanganer ...",70.892586,10.847497,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818,0.57173
4041,"bright cotton b4, khatri nagar, near sanganer ...",70.892786,10.848944,"[Bright Cotton B4, Khatri Nagar, Near Sangan...",302029,"{302029,302029, bridge, mansarovar, cotton, na...",0.181818,0.570284


In [16]:
# Ten addresses closest to element '0'; limiting the output avoids
# rendering the whole sorted frame.
df_copy.sort_values(by='distance', ascending=True).head(10)

Unnamed: 0,address,latitude,longitude,Comma_delimitted_address,zip,tokens,similarity,distance
0,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.052021,11.396546,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1,0
5201,"jaipur,h.no.- 408, lavender mangalam, anand na...",71.051969,11.398205,"[JAIPUR, H.NO.- 408, LAVENDER MANGALAM, ANAN...",302029,"{mangalam, 408, 302029, railway, -, o-302029, ...",1,0.00165996
7971,"lavender 207 mangalam aananda sanganer,mansaro...",71.051058,11.394790,"[lavender 207 Mangalam aananda Sanganer, mansa...",302029,"{mansarovar, jaipur,302029, mangalam, 207, lav...",0.166667,0.00200284
6903,"104, lavendar,, mangalam aananda,sanganer,302029",71.053568,11.394492,"[104, Lavendar, , Mangalam Aananda, Sanganer...",302029,"{lavendar, 104, mangalam, aananda, sanganer,30...",0.0555556,0.0025713
2076,"104, lavendar,mangalam's aananda, sanganer,in ...",71.052389,11.403921,"[104, Lavendar, Mangalam's Aananda, Sanganer...",302029,"{front, 104, mangalam, 's, railway, sanagner, ...",0.130435,0.00738427
7965,"sunflower 410, manglam aananda,near rampura ro...",71.064959,11.394044,"[Sunflower 410, Manglam Aananda, Near Rampura...",302029,"{jaipur,302029, sunflower, rampura, saganer, m...",0,0.0131774
3111,"211 sunflower, manglam aananda city,opposite s...",71.064646,11.392212,"[211 Sunflower, Manglam Aananda City, Opposit...",302029,"{211, sunflower, city, sanganer, opposite, sta...",0.0454545,0.0133479
8866,"flat no. 304,sun flower,manglam aananda, near ...",71.065814,11.411296,"[Flat no. 304, Sun flower, Manglam aananda, n...",302011,"{railway, sun, flower, station, sanganer,30201...",0.125,0.0201938
6156,"indrajeet singh , apollo pharmacy ,mangalam an...",71.071097,11.406526,"[indrajeet singh , Apollo Pharmacy , mangalam...",302029,"{pharmacy, mangalam, ananda, no, house, indraj...",0.04,0.0215289
6907,"g-03, orchid ,mangalam ananda ,road sanganer,n...",71.071004,11.412793,"[G-03, orchid , Mangalam Ananda , road Sangan...",302029,"{mangalam, orchid, ananda, sanganer, g-03, n/a...",0.111111,0.0249859


## Inferences to be drawn from the two tables :

**Jaccard similarity and the distance between two addresses have a seemingly inverse relationship, which is in line with intuition: the more similar an address is, the closer it should be.**

**Anomalies in the data:
  There are various repetitions of the same address in our database with slight latitude/longitude differences, which can be attributed to GPS noise. We could potentially aggregate our location metrics for these repeated entries.**
 
**Paying close attention to the table above, the element indexed '7965' has a similarity score of '0' but the 5th-lowest distance separation from the reference address when rank-ordered by distance. This is counterintuitive, as a low distance separation should correlate with high similarity.
This can be explained by the fact that element '7965' has tokens that are misspelled versions of tokens from our comparison set (e.g. Manglam/Mangalam, Aananda/Anand...). Transforming the tokens to mitigate these spelling errors can definitely increase our similarity scores.**

 


# Prediction

In [17]:
# Prompt for the address to look up; it is typed on the line after the prompt.
# input() already returns a str, so no extra conversion is needed.
print('input address')
address = input()

input address
JAIPUR,H.NO.- 408, LAVENDER MANGALAM, ANAND NAGAR SANGANER, RAILWAY STATION, JAIPUR - 302029 ,JAIPUR H O-302029 ,Rajasthan INDIA,302029


In [18]:
# Put the typed-in address through the same steps as the stored addresses:
# lower-case it, tokenize it, and drop the custom stop-words.
address = address.lower()
tokens = nltk.word_tokenize(address)
exceptions = [
    ',', 'jaipur', 'rajasthan', 'near',
    'road', 'apartment', 'adjacent', 'sector',
]
tokens_final = set(tokens) - set(exceptions)
tokens_final

{'-',
 '302029',
 '408',
 'anand',
 'h',
 'h.no.-',
 'india,302029',
 'lavender',
 'mangalam',
 'nagar',
 'o-302029',
 'railway',
 'sanganer',
 'station'}

In [19]:
# Jaccard score of the typed-in address against every stored address.
# The union is built from the query's own tokens (tokens_final). Using the
# token count of row 0 here would only be correct when the query happens to
# be the row-0 address.
similarity_scores = []
for row_tokens in df_copy['tokens']:
    union_size = len(tokens_final | row_tokens)
    shared_count = len(tokens_final & row_tokens)
    # Two empty token sets have an empty union; score them 0.0 instead of
    # dividing by zero.
    similarity_scores.append(shared_count / union_size if union_size else 0.0)

# Assign the whole column at once: per-row chained assignment
# (df_copy['similarity'].iloc[i] = ...) does not reliably write to the frame.
df_copy['similarity'] = similarity_scores

In [20]:
# Nearest address based only on the address input: rank the stored addresses
# by similarity and show the address text of the top-ranked row.
ranked_matches = df_copy.sort_values(by='similarity', ascending=False)
ranked_matches.iloc[0].address

'jaipur,h.no.- 408, lavender mangalam, anand nagar sanganer, railway station, jaipur - 302029 ,jaipur h o-302029 ,rajasthan india,302029'