### Let's scrape the following wiki page:
https://en.wikipedia.org/wiki/List_of_countries_by_GDP_sector_composition

In [31]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_sector_composition"

In [4]:
response = requests.get(url)

In [5]:
wiki = BeautifulSoup(response.text, 'lxml')

In [9]:
print(wiki.prettify)

<bound method Tag.prettify of <!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of countries by GDP sector composition - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_GDP_sector_composition","wgTitle":"List of countries by GDP sector composition","wgCurRevisionId":783122238,"wgRevisionId":783122238,"wgArticleId":9419906,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using web citations with no URL","Pages using citations with accessdate and no URL","Wikipedia articles in need of updating from May 2017","All Wikipedia articles in need of updating","All articles wit

Now let's grab the "GDP from natural resources" table

In [16]:
all_tables = wiki.find_all('table')

In [18]:
print(all_tables)

[<table class="plainlinks metadata ambox ambox-content ambox-Update" role="presentation">
<tr>
<td class="mbox-image">
<div style="width:52px"><img alt="Ambox current red.svg" data-file-height="290" data-file-width="360" height="34" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Ambox_current_red.svg/42px-Ambox_current_red.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Ambox_current_red.svg/63px-Ambox_current_red.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/98/Ambox_current_red.svg/84px-Ambox_current_red.svg.png 2x" width="42"/></div>
</td>
<td class="mbox-text"><span class="mbox-text-span">This article needs to be <b>updated</b>. <span class="hide-when-compact">Please update this article to reflect recent events or newly available information.</span> <small><i>(May 2017)</i></small></span></td>
</tr>
</table>, <table class="wikitable sortable">
<tr>
<th width="2%">№</th>
<th width="25%">Country/Economy</th>
<th width="15%"><a href="/wi

In [23]:
correctTable = all_tables[3]
print(correctTable.prettify)

<bound method Tag.prettify of <table class="wikitable sortable">
<tr>
<th>Country/Economy</th>
<th>Total natural resources<br/>
(% of GDP)</th>
<th>Oil<br/>
(% of GDP)</th>
<th>Natural gas<br/>
(% of GDP)</th>
<th>Coal<br/>
(% of GDP)</th>
<th>Mineral<br/>
(% of GDP)</th>
<th>Forest<br/>
(% of GDP)</th>
</tr>
<tr>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="375" data-file-width="562" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a></td>
<td>2.1</td>
<td>..</td>
<td>..</td>
<td>0</td>
<td>0</td>
<td>2.1</td>
</tr>
<tr>
<td><span class="flagicon"><img alt="" class=

Now, let's create a dataframe from the extracted table. Use the same columns as the table on the wiki page but make them proper. 

In [24]:
country = []
totalNaturalResources = []
oil = []
naturalGas = []
coal = []
mineral = []
forest = []

for row in correctTable.findAll('tr')[1:]:
    cells = row.findAll('td')
    if (len(cells) > 0):
        country.append(cells[0].find('a')['title'])
        totalNaturalResources.append(cells[1].find(text = True))
        oil.append(cells[2].find(text = True))
        naturalGas.append(cells[3].find(text = True))
        coal.append(cells[4].find(text = True))
        mineral.append(cells[5].find(text = True))
        forest.append(cells[6].find(text = True))
        
resources_df = pd.DataFrame(country, columns = ["Country"])
resources_df["TotalNaturalResources"] = totalNaturalResources
resources_df["Oil"] = oil
resources_df["NaturalGas"] = naturalGas
resources_df["Coal"] = coal
resources_df["Mineral"] = mineral
resources_df["Forest"] = forest

resources_df.head()

Unnamed: 0,Country,TotalNaturalResources,Oil,NaturalGas,Coal,Mineral,Forest
0,Afghanistan,2.1,..,..,0,0.0,2.1
1,Albania,5.1,4.6,0,0,0.5,0.1
2,Algeria,26.3,19,7,0,0.3,0.1
3,Angola,46.6,46.3,0.1,..,0.0,0.2
4,Antigua and Barbuda,0.0,..,..,..,0.0,..


Extract the "list by percentage" table. Create a dataframe out of it. 

In [25]:
country = []
agriculture = []
industry = []
services = []
year = [] 

pct = wiki.findAll('table')[4]

for row in pct.findAll('tr')[1:]:
    cells = row.findAll('td')
    if (len(cells) > 1):
        country.append(cells[0].find('a').find(text = True))
        agriculture.append(cells[1].find(text = True))
        industry.append(cells[2].find(text = True))
        services.append(cells[3].find(text = True))
        year.append(cells[4].find(text = True))
        
gdp = pd.DataFrame(country, columns = ['Country'])
gdp["Agriculture"] = agriculture
gdp["Industry"] = industry
gdp["Services"] = services
gdp["Year"] = year

gdp.head()

Unnamed: 0,Country,Agriculture,Industry,Services,Year
0,Afghanistan,24.0,21.0,55.0,2014
1,Albania,21.6,14.9,63.5,2016
2,Algeria,13.1,38.7,48.2,2016
3,American Samoa,27.4,12.4,60.2,2012
4,Andorra,14.0,79.0,6.0,2011


Let's combine the 2 dataframes together

In [26]:
combined = pd.merge(resources_df, gdp, how = "outer", on = "Country")

In [27]:
print(combined.shape)

(234, 11)


In [28]:
print(resources_df.shape)

(182, 7)


In [29]:
print(gdp.shape)

(222, 5)


In [30]:
combined.head()

Unnamed: 0,Country,TotalNaturalResources,Oil,NaturalGas,Coal,Mineral,Forest,Agriculture,Industry,Services,Year
0,Afghanistan,2.1,..,..,0,0.0,2.1,24.0,21.0,55.0,2014
1,Albania,5.1,4.6,0,0,0.5,0.1,21.6,14.9,63.5,2016
2,Algeria,26.3,19,7,0,0.3,0.1,13.1,38.7,48.2,2016
3,Angola,46.6,46.3,0.1,..,0.0,0.2,10.2,61.4,28.4,2011
4,Antigua and Barbuda,0.0,..,..,..,0.0,..,2.2,17.8,80.0,2016


Let's replace invalid ".." values with NaNs

In [32]:
combined = combined.replace("..", np.NaN)
combined.head()

Unnamed: 0,Country,TotalNaturalResources,Oil,NaturalGas,Coal,Mineral,Forest,Agriculture,Industry,Services,Year
0,Afghanistan,2.1,,,0.0,0.0,2.1,24.0,21.0,55.0,2014
1,Albania,5.1,4.6,0.0,0.0,0.5,0.1,21.6,14.9,63.5,2016
2,Algeria,26.3,19.0,7.0,0.0,0.3,0.1,13.1,38.7,48.2,2016
3,Angola,46.6,46.3,0.1,,0.0,0.2,10.2,61.4,28.4,2011
4,Antigua and Barbuda,0.0,,,,0.0,,2.2,17.8,80.0,2016
