In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
raw_df = pd.read_csv('data/train.csv')

In [3]:
df = raw_df.copy()

In [4]:
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [5]:
df['POI'] = df['POI/street'].str.split('/', expand = True)[0]
df['street'] = df['POI/street'].str.split('/', expand = True)[1]

In [6]:
df

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru
...,...,...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/,,
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,


In [7]:
# create new column for boolean value which tells which row has incomplete address
flags = []
for row in tqdm(df.itertuples(index = False), total = len(df)):
    flags.append(row[1].find(row[3]) == -1 or row[1].find(row[4]) == -1)

100%|██████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 526800.40it/s]


In [8]:
df['POI/street not exact in raw_address'] = flags

In [9]:
incomplete_address = df[df['POI/street not exact in raw_address']==True].drop(['id','POI','street','POI/street not exact in raw_address'], axis = 1)

In [10]:
# all rows with incomplete address
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 400)
incomplete_address.head(100)

Unnamed: 0,raw_address,POI/street
10,"cikahuripan sd neg boj 02 klap boj, no 5 16877",sd negeri bojong 02/klap boj
11,"yaya atohar,",yayasan atohariyah/
20,"toko bang ajs,",toko bangunan ajs/
40,mar tabl metro iringmulyo metro timur,markaz tabligh metro/
44,sd neg 12 anggrek,sd negeri 12 anggrek/
48,"rumah makan pela, raya jomb,",rumah makan pelangi/raya jomb
69,cak 11 nagasari karawang barat,/cakrad
74,"rnd prin, gang pinak, sukarame",rnd printing/gang pinak
76,"pp minhajutt, kh abdul manan, sumberberas muncar",pp minhajutthollab/kh abdul manan
77,"tk islam daruss,",tk islam darussalam/


In [11]:
all_poi = np.unique(df['POI'])

In [12]:
'sd negeri bojong 02' in all_poi

True

In [13]:
len(np.unique(df['street']))

95123

## Experiments

In [15]:
from fast_autocomplete import AutoComplete

In [16]:
all_poi = sorted(all_poi)

In [18]:
all_poi_dict = dict()
for poi in all_poi:
    all_poi_dict[poi] = {}

In [23]:
all_poi_dict

{'': {},
 '"taman pemuda"': {},
 '# jagakarsa': {},
 '# jakarta': {},
 '# jatirahayu': {},
 '# kali malang': {},
 '# pasar rebo': {},
 "'arina cell & laundry": {},
 '(depan': {},
 '(masuk kampus stkip)': {},
 '(pintu belakang)': {},
 '01 "kantor': {},
 '02 mart': {},
 '07 cell': {},
 '0ncez shop': {},
 '1 stations': {},
 '1.4.6 kids': {},
 "10 ten's": {},
 '100) lotte mart': {},
 '1001 cctv': {},
 '117 rental mobil': {},
 '126 cellular': {},
 '16 car wash': {},
 '17 mie ayam bengkel': {},
 '18 elektronik': {},
 '182 cell': {},
 '19 distro box': {},
 '1999 cell': {},
 '1a crispy puff': {},
 '1ciganjur': {},
 '1mt tutup jam 20:00 wib, warung bu, kp tonggoh rt02 01 desa gunung sari kecamatan citeureup': {},
 '1perumahan bukit waringin': {},
 '1st floor, b 110 - trans studio mall bandung': {},
 '1st home citraland': {},
 '2 diva': {},
 '2 serangkai cell': {},
 '2 tang finest tea': {},
 '2 x 11 kayu tanam': {},
 '20 fit': {},
 '2016 florist semarang': {},
 '21 cell': {},
 '21 cinema': {},
 

In [19]:
autocomplete = AutoComplete(words=all_poi_dict)

In [30]:
autocomplete.search(word='electr laun system')

[['electra'], ['electra jaya']]

In [40]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [60]:
'LnwZa/JomLnwKing'.find('Lnw', 'LnwZa/JomLnwKing'.find('/'))

9

In [52]:
'yayasan atohariyah' in all_poi

True

In [61]:
len(all_poi)

93412

In [51]:
process.extract('yaya atohar', choices = all_poi, limit = 5)

[('toha', 90), ('ay ay', 86), ('hari', 77), ('yahya', 76), ('yayak', 76)]

In [64]:
fuzz.ratio('yaya atohar,','yayasan atohariyah')

73

In [63]:
fuzz.ratio('yaya atohar,','toha')

50

In [43]:
process.extractOne('cikahuripan sd neg boj 02 klap boj, no 5 16877', all_poi)

('02 mart', 86)

In [44]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.1.0-py3-none-any.whl (112 kB)
Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.50-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.1.0


In [45]:
from geopy.geocoders import Nominatim

In [50]:
geolocator = Nominatim(user_agent="my-app")
location = geolocator.geocode("cikahuripan sd neg boj 02 klap boj")
print(location.address)

AttributeError: 'NoneType' object has no attribute 'address'