# Culinary Recommendation in Tangerang

In [13]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup

## 1. Web scraping to gather all borough and district names in Tangerang

In [68]:
# Fetch the Wikipedia page listing the kecamatan and kelurahan/desa of
# Kabupaten Tangerang and read its first HTML table into a DataFrame.
wiki = 'https://id.wikipedia.org/wiki/Daftar_kecamatan_dan_kelurahan_di_Kabupaten_Tangerang'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wiki_page = requests.get(wiki, timeout=30)
# Stop here on an HTTP error rather than parsing an error page further down.
wiki_page.raise_for_status()

# First table on the page; its first row becomes the column header.
wiki_raw = pd.read_html(wiki_page.content, header=0)[0]

# Clean up the table: drop its last row
# (presumably a totals/footer row -- TODO confirm against the page).
df = wiki_raw[:-1]
df

Unnamed: 0,Kode Kemendagri,Kecamatan,Jumlah Kelurahan,Jumlah Desa,Status,Daftar Desa/Kelurahan
0,36.03.01,Balaraja,1.0,8,Desa,Cangkudu Gembong Saga Sentul Sentul Jaya Sukam...
1,36.03.01,Balaraja,1.0,8,Kelurahan,Balaraja
2,36.03.18,Cikupa,2.0,12,Desa,Bitung Jaya Bojong Budi Mulya Cibadak Cikupa D...
3,36.03.18,Cikupa,2.0,12,Kelurahan,Bunder Sukamulya
4,36.03.23,Cisauk,1.0,5,Desa,Cibogo Dangdang Mekar Wangi Sampora Suradita
5,36.03.23,Cisauk,1.0,5,Kelurahan,Cisauk
6,36.03.05,Cisoka,,10,Desa,Bojong Loa Carenang Caringin Cempaka Cibugel C...
7,36.03.17,Curug,3.0,4,Desa,Cukanggalih Curug Wetan Kadu Kadu Jaya
8,36.03.17,Curug,3.0,4,Kelurahan,Binong Curug Kulon Sukabakti
9,36.03.32,Gunung Kaler,,9,Desa,Cibetok Cipaeh Gunung Kaler Kandawati Kedung O...


In [192]:
# Walk every <tr> of the page and collect, per borough, the names listed in
# its rows. The result is one record per borough with the neighbourhood names
# joined into a single ', '-separated string.
soup = BeautifulSoup(wiki_page.content, "html.parser")
content = soup.find_all('tr')

dis_num = 0  # not used in this cell; kept in case a later cell reads it
tangerang_list, district_list, district_text = [], [], ''
# Start with an empty borough so a 2-cell row seen before any 6-cell row
# yields an empty name instead of a NameError.
borough = ''

# Row 0 and the final row are skipped; stopping one short of the end also
# guarantees that content[row_idx + 1] exists for the look-ahead below.
for row_idx in range(1, len(content) - 1):
    cells = content[row_idx].find_all('td')

    if len(cells) == 6:
        # 6-cell row: the second cell holds the borough name, the sixth a
        # newline-separated list of neighbourhood names.
        borough = cells[1].text.strip().split('\n')[0]
        district_list.extend(cells[5].text.strip().split('\n'))
        # If the next row has exactly 2 cells it carries more names for this
        # borough, so the record is only emitted once that row is read.
        borough_complete = len(content[row_idx + 1].find_all('td')) != 2
    elif len(cells) == 2:
        # 2-cell row: its second cell holds additional names for the borough
        # read from the preceding 6-cell row.
        district_list.extend(cells[1].text.strip().split('\n'))
        borough_complete = True
    else:
        # Any other row shape is ignored.
        continue

    if borough_complete:
        district_text = ', '.join(district_list)
        tangerang_list.append({'borough': borough, 'number_of_neighbourhoods': len(district_list), 'neighbourhoods': district_text})
        district_list, district_text, borough = [], '', ''


# Export the BeautifulSoup result to a DataFrame, dropping the last record.
# NOTE(review): why the last record is discarded is not visible here --
# presumably it comes from a non-borough row; confirm against the page.
tangerang_df = pd.DataFrame(tangerang_list)[:-1]
tangerang_df

Unnamed: 0,borough,number_of_neighbourhoods,neighbourhoods
0,Balaraja,9,"Cangkudu, Gembong, Saga, Sentul, Sentul Jaya, ..."
1,Cikupa,14,"Bitung Jaya, Bojong, Budi Mulya, Cibadak, Ciku..."
2,Cisauk,6,"Cibogo, Dangdang, Mekar Wangi, Sampora, Suradi..."
3,Cisoka,10,"Bojong Loa, Carenang, Caringin, Cempaka, Cibug..."
4,Curug,7,"Cukanggalih, Curug Wetan, Kadu, Kadu Jaya, Bin..."
5,Gunung Kaler,9,"Cibetok, Cipaeh, Gunung Kaler, Kandawati, Kedu..."
6,Jambe,10,"Ancol Pasir, Daru, Jambe, Kutruk, Mekarsari, P..."
7,Jayanti,8,"Cikande, Dangdeur, Jayanti, Pabuaran, Pangkat,..."
8,Kelapa Dua,6,"Curug Sangereng, Bencongan, Bencongan Indah, B..."
9,Kemiri,7,"Karang Anyar, Kemiri, Klebet, Legok Suka Maju,..."


## 2. Find each neighbourhood's geolocation (latitude & longitude) and postal code from JSON and CSV files

In [193]:
# Load the area file for code 62/36 and keep the sub-district rows whose
# 'Parent' value is one of the codes listed under its 'children' entry.
banten_province_geojson = 'data/json/area/62/36/36.json'

# A missing or malformed file is allowed to raise here. Printing a message and
# carrying on would only fail later with a NameError on json36, or silently
# reuse a stale json36 left in the kernel by an earlier run.
with open(banten_province_geojson) as f:
    json36 = json.load(f)

# Every entry under 'children' except the first.
# NOTE(review): the reason the first entry is dropped is not visible here -- confirm.
borough_code = list(json36['children'])[1:]

districts_raw = pd.read_csv('data/csv/62/subDistricts.csv')

# Upper-case the names so they can be matched against upper-cased names in
# other tables, and rename the column to the shared key 'neighbourhoods'.
df_districts = (
    districts_raw
    .loc[districts_raw['Parent'].isin(borough_code), ['Name', 'Latitude', 'Longitude', 'Postal']]
    .reset_index(drop=True)
    .assign(Name=lambda frame: frame['Name'].str.upper())
    .rename(columns={'Name': 'neighbourhoods'})
)
display(df_districts.head())

Unnamed: 0,neighbourhoods,Latitude,Longitude,Postal
0,MALINGPING,-6.75553,106.00769,42391
1,WANASALAM,-6.761585,105.919716,42396
2,PANGGARANGAN,-6.831495,106.23245,42394423954239242390
3,CIHARA,-6.83793,106.113282,423944239842392
4,BAYAH,-6.897934,106.29737,42393


In [200]:
# Reshape tangerang_df from one row per borough (names joined with ', ') into
# one row per (borough, neighbourhood) pair, then attach the coordinates and
# postal codes from df_districts.
col = list(tangerang_df.columns)
col.remove('number_of_neighbourhoods')

# Collect every pair first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it on
# every iteration.
borough_neighbourhood_pairs = [
    (borough_name, neighbourhood)
    for borough_name, joined_names in zip(tangerang_df['borough'], tangerang_df['neighbourhoods'])
    # Names are upper-cased to match the upper-cased names in df_districts.
    for neighbourhood in joined_names.upper().split(', ')
]
tangerang_ds = pd.DataFrame(borough_neighbourhood_pairs, columns=col)

# Inner join: neighbourhoods without a match in df_districts are dropped.
# NOTE(review): the key is the name alone, so a name that occurs in more than
# one place is matched to every occurrence -- consider a stricter key.
tangerang = pd.merge(tangerang_ds, df_districts, how='inner', on='neighbourhoods')
display(tangerang.head())

Unnamed: 0,borough,neighbourhoods,Latitude,Longitude,Postal
0,Balaraja,BALARAJA,-6.20018,106.436878,"15611,15612,15613,15614,15615,15616,15617,1561..."
1,Cikupa,CIBADAK,-6.340584,106.224435,42357
2,Cikupa,CIKUPA,-6.217129,106.513105,15710
3,Cikupa,SUKAMULYA,-6.158935,106.439735,"15631,15632,15633,15634,15635,15636,15637,1563..."
4,Sukamulya,SUKAMULYA,-6.158935,106.439735,"15631,15632,15633,15634,15635,15636,15637,1563..."


## 3. Population of each neighbourhood from an Excel file

In [204]:
# The 2016 population workbook must be downloaded manually before this cell runs:
# https://tangerangkab.bps.go.id/statictable/download.html?nrbvfeve=NjE%3D&sdfs=ldjfdifsdjkfahi&zxcv=L2JhY2tlbmQ%3D&xzmn=aHR0cHM6Ly90YW5nZXJhbmdrYWIuYnBzLmdvLmlkL3N0YXRpY3RhYmxlLzIwMTgvMDIvMDIvNjEvanVtbGFoLXBlbmR1ZHVrLWJlcmRhc2Fya2FuLWplbmlzLWtlbGFtaW4tbWVudXJ1dC1rZWNhbWF0YW4tZGkta2FidXBhdGVuLXRhbmdlcmFuZy0tMjAxNi5odG1s&twoadfnoarfeauf=MjAyMS0wMS0wNiAxNjo0NTo0MA%3D%3D
# Read the sheet as-is; title and header rows are not stripped here.
population_path = 'data/tangerang_population.xls'
df_population = pd.read_excel(population_path)
df_population


Unnamed: 0,"Jumlah Penduduk Berdasarkan Jenis Kelamin menurut Kecamatan di Kabupaten Tangerang, 2016",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Number of Population by Sex and Subdistrict in...,,,,,
1,,,,,,
2,Kecamatan,,Laki-laki,Perempuan,Jumlah,Rasio Jenis Kelamin
3,Subdistrict,,Male,Female,Total,Sex Ratio
4,(1),,(2),(3),(4),(5)
5,1,Cisoka,48 693,45 423,94 116,10720
6,2,Solear,46 385,44 561,90 946,10409
7,3,Tigaraksa,79 151,75 746,154 897,10450
8,4,Jambe,23 082,21 891,44 973,10544
9,5,Cikupa,144 465,135 320,279 785,10676
