# Web Scraping

In [31]:
import os
from getpass import getpass
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4

In [2]:
# Download the Wikipedia page that holds the provincial GDP tables.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error into an immediate failure here instead of a confusing
# parsing error in a later cell.
countries_page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_Indonesian_provinces_by_GDP#2022_data",
    timeout=30,
)
countries_page.raise_for_status()
countries_soup = bs4(countries_page.content, 'lxml')

In [3]:
# Collect every <table> element carrying the "wikitable" CSS class.
countries_table = countries_soup.find_all('table', class_='wikitable')
countries_table

[<table class="wikitable sortable plainrowheaders">
 <tbody><tr>
 <th rowspan="2">Rank
 </th>
 <th rowspan="2">Province
 </th>
 <th rowspan="2">Region
 </th>
 <th rowspan="2">GDP<sup class="reference" id="cite_ref-bpsdata2022_8-0"><a href="#cite_note-bpsdata2022-8">[8]</a></sup><br/>(in billion <a href="/wiki/Indonesian_rupiah" title="Indonesian rupiah">Rp</a>)
 </th>
 <th scope="col">GDP Nominal
 </th>
 <th scope="col">GDP PPP
 </th></tr>
 <tr>
 <th scope="col"><small>(in billion <a class="mw-redirect" href="/wiki/US_Dollar" title="US Dollar">$</a>)</small>
 </th>
 <th scope="col"><small>(in billion <a class="mw-redirect" href="/wiki/US_Dollar" title="US Dollar">$</a>)</small>
 </th></tr>
 <tr bgcolor="#F0E891">
 <td align="center">-
 </td>
 <td><b><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/

In [4]:
# Parse the matched tables into DataFrames; read_html returns a list of frames.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases (file-like input works in
# both old and new versions).
countries_df = pd.read_html(StringIO(str(countries_table)))
countries_df

[   Rank                      Province                Region  \
    Rank                      Province                Region   
 0     -                     Indonesia       South East Asia   
 1     -                   Java Island             Indonesia   
 2     -                Sumatra Island             Indonesia   
 3     1                       Jakarta                  Java   
 4     2                     East Java                  Java   
 5     3                     West Java                  Java   
 6     -                    Kalimantan             Indonesia   
 7     4                  Central Java                  Java   
 8     -               Sulawesi Island             Indonesia   
 9     5                          Riau               Sumatra   
 10    6                 North Sumatra               Sumatra   
 11    7               East Kalimantan            Kalimantan   
 12    8                        Banten                  Java   
 13    9                South Sulawesi  

In [5]:
# read_html produced a list of tables; keep only the first one.
# The isinstance guard keeps this cell re-runnable: once countries_df is already
# a DataFrame, indexing it with [0] a second time would raise a KeyError.
if isinstance(countries_df, list):
    countries_df = countries_df[0]
countries_df

Unnamed: 0_level_0,Rank,Province,Region,GDP[8](in billion Rp),GDP Nominal,GDP PPP
Unnamed: 0_level_1,Rank,Province,Region,GDP[8](in billion Rp),(in billion $),(in billion $)
0,-,Indonesia,South East Asia,19588455,1319.19,4023.5
1,-,Java Island,Indonesia,10813999,728.27,2272.54
2,-,Sumatra Island,Indonesia,4220203,284.21,886.87
3,1,Jakarta,Java,3186470,214.59,669.63
4,2,East Java,Java,2730907,183.91,573.89
5,3,West Java,Java,2422782,163.16,509.14
6,-,Kalimantan,Indonesia,1767053,119.0,371.34
7,4,Central Java,Java,1560899,105.12,328.02
8,-,Sulawesi Island,Indonesia,1168122,81.64,245.72
9,5,Riau,Sumatra,991589,66.78,208.38


In [6]:
# Write to the working directory instead of a machine-specific absolute path, so
# the file lands where the data-cleansing step looks for it ("gdp_indo.csv") and
# the notebook runs on any machine.
countries_df.to_csv('gdp_indo.csv', index=False)

# Data Cleansing 

In [7]:
# Reload the scraped table from disk. Only the first header line is used for the
# column names, so the second header level shows up as data row 0.
df = pd.read_csv(filepath_or_buffer="gdp_indo.csv")
df

Unnamed: 0,Rank,Province,Region,GDP[8](in billion Rp),GDP Nominal,GDP PPP
0,Rank,Province,Region,GDP[8](in billion Rp),(in billion $),(in billion $)
1,-,Indonesia,South East Asia,19588455,1319.19,4023.5
2,-,Java Island,Indonesia,10813999,728.27,2272.54
3,-,Sumatra Island,Indonesia,4220203,284.21,886.87
4,1,Jakarta,Java,3186470,214.59,669.63
5,2,East Java,Java,2730907,183.91,573.89
6,3,West Java,Java,2422782,163.16,509.14
7,-,Kalimantan,Indonesia,1767053,119.0,371.34
8,4,Central Java,Java,1560899,105.12,328.02
9,-,Sulawesi Island,Indonesia,1168122,81.64,245.72


In [10]:
# Map the scraped headers onto snake_case column names.
snake_case_names = {
    'Rank': 'rank',
    'Province': 'province',
    'Region': 'region',
    'GDP[8](in billion Rp)': 'gdp_in_billion_rp',
    'GDP Nominal': 'gdp_in_billion_usd',
    'GDP PPP': 'gdp_ppp_in_billion_usd',
}
df = df.rename(columns=snake_case_names)
df

Unnamed: 0,rank,province,region,gdp_in_billion_rp,gdp_in_billion_usd,gdp_ppp_in_billion_usd
0,Rank,Province,Region,GDP[8](in billion Rp),(in billion $),(in billion $)
1,-,Indonesia,South East Asia,19588455,1319.19,4023.5
2,-,Java Island,Indonesia,10813999,728.27,2272.54
3,-,Sumatra Island,Indonesia,4220203,284.21,886.87
4,1,Jakarta,Java,3186470,214.59,669.63
5,2,East Java,Java,2730907,183.91,573.89
6,3,West Java,Java,2422782,163.16,509.14
7,-,Kalimantan,Indonesia,1767053,119.0,371.34
8,4,Central Java,Java,1560899,105.12,328.02
9,-,Sulawesi Island,Indonesia,1168122,81.64,245.72


In [11]:
# Keep only real provinces:
#   * rows whose rank is "-" are the country / island aggregates;
#   * index label 0 is the leftover second header row, not a column.
# Building a new frame instead of mutating in place, and ignoring an already
# missing label 0, keeps this cell safe to re-run.
df = df.query("rank != '-'").drop(index=0, errors='ignore')
df

Unnamed: 0,rank,province,region,gdp_in_billion_rp,gdp_in_billion_usd,gdp_ppp_in_billion_usd
4,1,Jakarta,Java,3186470,214.59,669.63
5,2,East Java,Java,2730907,183.91,573.89
6,3,West Java,Java,2422782,163.16,509.14
8,4,Central Java,Java,1560899,105.12,328.02
10,5,Riau,Sumatra,991589,66.78,208.38
11,6,North Sumatra,Sumatra,955193,64.33,200.73
12,7,East Kalimantan,Kalimantan,921332,62.05,193.62
13,8,Banten,Java,747250,50.32,157.03
14,9,South Sulawesi,Sulawesi,605145,40.75,124.32
15,10,South Sumatra,Sumatra,591603,39.84,124.32


In [12]:
# Summarise row count, non-null counts and dtypes of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 4 to 42
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   rank                    34 non-null     object
 1   province                34 non-null     object
 2   region                  34 non-null     object
 3   gdp_in_billion_rp       34 non-null     object
 4   gdp_in_billion_usd      34 non-null     object
 5   gdp_ppp_in_billion_usd  34 non-null     object
dtypes: object(6)
memory usage: 1.9+ KB


# Geocoding with the OpenCage API

In [14]:
#URL Open Cage API
url = 'https://api.opencagedata.com/geocode/v1/json'

# API key for OpenCage.
# Credentials must not be stored in the notebook: read the key from the
# OPENCAGE_API_KEY environment variable, and fall back to an interactive prompt
# (input is not echoed) when the variable is unset or empty.
api_key = os.environ.get('OPENCAGE_API_KEY') or getpass('OpenCage API key: ')

In [15]:
# Distinct province names, in order of first appearance, to be geocoded.
provinces = df.loc[:, 'province'].unique()
provinces

array(['Jakarta', 'East Java', 'West Java', 'Central Java', 'Riau',
       'North Sumatra', 'East Kalimantan', 'Banten', 'South Sulawesi',
       'South Sumatra', 'Lampung', 'Central Sulawesi', 'Riau Islands',
       'West Sumatra', 'Jambi', 'Papua', 'West Kalimantan',
       'South Kalimantan', 'Bali', 'Aceh', 'Central Kalimantan',
       'Special Region of Yogyakarta', 'Southeast Sulawesi',
       'North Sulawesi', 'West Nusa Tenggara', 'North Kalimantan',
       'East Nusa Tenggara', 'Bangka Belitung Islands', 'West Papua',
       'Bengkulu', 'North Maluku', 'West Sulawesi', 'Maluku', 'Gorontalo'],
      dtype=object)

In [16]:
# Accumulator for the geocoding results (one dict per province).
components_list = list()

In [17]:
# Function to get latitude and longitude for a province using OpenCage API
def get_location_info(api_key, province, timeout=10):
    """Geocode a place name with the OpenCage API and return its coordinates.

    Parameters
    ----------
    api_key : str
        OpenCage API key, sent as the ``key`` query parameter. The value passed
        by the caller is used as-is (it is no longer overwritten inside the
        function).
    province : str
        Free-text place name, sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    dict or None
        ``{'province': ..., 'latitude': ..., 'longitude': ...}`` taken from the
        first result of the response, or ``None`` when the status code is not
        200 (the code is printed) or the response contains no results.
    """
    url = 'https://api.opencagedata.com/geocode/v1/json'

    params = {
        'key': api_key,
        'q': province,
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    results = response.json().get('results') or []
    if not results:
        return None

    # Only the first result is used.
    geometry = results[0]['geometry']
    return {
        'province': province,
        'latitude': geometry['lat'],
        'longitude': geometry['lng'],
    }

# Loop through each province and make a request to OpenCage API.
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate entries on top of the results of a previous run.
components_list = []
for province in provinces:
    province_components = get_location_info(api_key, province)

    # Provinces for which no location was returned are skipped.
    if province_components:
        components_list.append(province_components)

# Display the list of components for each province
for province_components in components_list:
    print(province_components)

{'province': 'Jakarta', 'latitude': -6.175247, 'longitude': 106.8270488}
{'province': 'East Java', 'latitude': -7.6977397, 'longitude': 112.4914199}
{'province': 'West Java', 'latitude': -6.8891904, 'longitude': 107.6404716}
{'province': 'Central Java', 'latitude': -7.3032412, 'longitude': 110.0044145}
{'province': 'Riau', 'latitude': 0.5004112, 'longitude': 101.5475811}
{'province': 'North Sumatra', 'latitude': 2.1923519, 'longitude': 99.3812201}
{'province': 'East Kalimantan', 'latitude': 0.7884397, 'longitude': 116.2419977}
{'province': 'Banten', 'latitude': -6.4453801, 'longitude': 106.1375586}
{'province': 'South Sulawesi', 'latitude': -3.6446718, 'longitude': 119.9471906}
{'province': 'South Sumatra', 'latitude': -3.1266842, 'longitude': 104.0930554}
{'province': 'Lampung', 'latitude': -4.8555039, 'longitude': 105.0272986}
{'province': 'Central Sulawesi', 'latitude': -1.6937786, 'longitude': 120.8088555}
{'province': 'Riau Islands', 'latitude': -0.1547846, 'longitude': 104.580374

In [18]:
# Turn the list of geocoding dicts into a table (one row per province).
df2 = pd.DataFrame(data=components_list)
df2

Unnamed: 0,province,latitude,longitude
0,Jakarta,-6.175247,106.827049
1,East Java,-7.69774,112.49142
2,West Java,-6.88919,107.640472
3,Central Java,-7.303241,110.004414
4,Riau,0.500411,101.547581
5,North Sumatra,2.192352,99.38122
6,East Kalimantan,0.78844,116.241998
7,Banten,-6.44538,106.137559
8,South Sulawesi,-3.644672,119.947191
9,South Sumatra,-3.126684,104.093055


# Data Transformation

In [19]:
# Attach the coordinates to the GDP table by province name.
# No `how` is given, so pandas performs an inner join: a province without a
# geocoding row would be absent from the result.
merged_df = df.merge(df2, on='province')
merged_df.head()

Unnamed: 0,rank,province,region,gdp_in_billion_rp,gdp_in_billion_usd,gdp_ppp_in_billion_usd,latitude,longitude
0,1,Jakarta,Java,3186470,214.59,669.63,-6.175247,106.827049
1,2,East Java,Java,2730907,183.91,573.89,-7.69774,112.49142
2,3,West Java,Java,2422782,163.16,509.14,-6.88919,107.640472
3,4,Central Java,Java,1560899,105.12,328.02,-7.303241,110.004414
4,5,Riau,Sumatra,991589,66.78,208.38,0.500411,101.547581


In [22]:
# Count missing values per column in the merged frame.
merged_df.isna().sum()

rank                      0
province                  0
region                    0
gdp_in_billion_rp         0
gdp_in_billion_usd        0
gdp_ppp_in_billion_usd    0
latitude                  0
longitude                 0
dtype: int64

In [23]:
# Inspect the column dtypes before casting the numeric columns.
merged_df.dtypes

rank                       object
province                   object
region                     object
gdp_in_billion_rp          object
gdp_in_billion_usd         object
gdp_ppp_in_billion_usd     object
latitude                  float64
longitude                 float64
dtype: object

# Data Cleansing 2

In [24]:
# Adjust the data types: the rank and GDP columns were read as text and are
# cast to numeric dtypes in a single pass.
numeric_dtypes = {
    'rank': 'int64',
    'gdp_in_billion_rp': 'int64',
    'gdp_in_billion_usd': 'float64',
    'gdp_ppp_in_billion_usd': 'float64',
}
merged_df = merged_df.astype(numeric_dtypes)

In [26]:
# Confirm the dtypes after the cast.
merged_df.dtypes

rank                        int64
province                   object
region                     object
gdp_in_billion_rp           int64
gdp_in_billion_usd        float64
gdp_ppp_in_billion_usd    float64
latitude                  float64
longitude                 float64
dtype: object

# Data Enrichment

In [29]:
# Add a combined "latitude,longitude" text column.
latitude_text = merged_df['latitude'].astype(str)
longitude_text = merged_df['longitude'].astype(str)
merged_df['lat_long'] = latitude_text + ',' + longitude_text

merged_df.head()

Unnamed: 0,rank,province,region,gdp_in_billion_rp,gdp_in_billion_usd,gdp_ppp_in_billion_usd,latitude,longitude,lat_long
0,1,Jakarta,Java,3186470,214.59,669.63,-6.175247,106.827049,"-6.175247,106.8270488"
1,2,East Java,Java,2730907,183.91,573.89,-7.69774,112.49142,"-7.6977397,112.4914199"
2,3,West Java,Java,2422782,163.16,509.14,-6.88919,107.640472,"-6.8891904,107.6404716"
3,4,Central Java,Java,1560899,105.12,328.02,-7.303241,110.004414,"-7.3032412,110.0044145"
4,5,Riau,Sumatra,991589,66.78,208.38,0.500411,101.547581,"0.5004112,101.5475811"


In [30]:
# Export the final dataset without the index column.
# NOTE(review): the target is an absolute, machine-specific path — confirm
# whether a path relative to the notebook would be more appropriate.
merged_df.to_csv(
    path_or_buf='C:/Users/Lenovo/Documents/Dataset/dibimbing/DE_Project.csv',
    index=False,
)