In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get the data
The function below extracts one table from a Wikipedia page: by default the first table with the `wikitable` class, or the `table_no`-th such table when that argument is given.

In [106]:
def getDataFrameFromWikipedia(wikipedia_url, table_no=1):
    """Download a page and return one of its ``wikitable`` tables as a DataFrame.

    The HTTP status code of the response is printed as a quick sanity check.

    Parameters
    ----------
    wikipedia_url : str
        URL of the page to download.
    table_no : int, default 1
        1-based position of the wanted table among the ``<table>`` elements
        selected with the ``wikitable`` class filter.

    Returns
    -------
    pandas.DataFrame or None
        The first frame parsed from the selected table, or ``None`` when
        ``table_no`` is not within ``1..number of matching tables``.
    """
    # Local import keeps this cell self-contained.
    from io import StringIO

    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(wikipedia_url, timeout=30)
    print(response.status_code)

    soup = BeautifulSoup(response.text, 'html.parser')
    # find_all is the supported spelling; findAll is a deprecated alias.
    tables = soup.find_all('table', {'class': 'wikitable'})

    if not 1 <= table_no <= len(tables):
        return None

    # Passing literal HTML to read_html is deprecated, so wrap the markup in a
    # file-like object.
    return pd.read_html(StringIO(str(tables[table_no - 1])))[0]



In [67]:
## GET GDP PER CAPITA

# The helper is defined as getDataFrameFromWikipedia; the earlier spelling
# "Wikepedia" raises NameError on a fresh kernel.
gdp_per_capita = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita")

df = gdp_per_capita

# Keep only one source: drop the IMF and United Nations column groups.
df = df.drop(["IMF[4][5]", "United Nations[7]"], axis=1)
# Drop the row labelled 0 (presumably an aggregate row, not a country - TODO confirm).
df = df.drop(0)
# Flatten the two-level header by discarding its innermost level.
df.columns = df.columns.droplevel(-1)
# Give the last column a distinct name ('years') so it can be dropped by label below.
df.columns = [*df.columns[:-1], 'years']
df = df.drop(["UN Region", "years"], axis=1)
print(df.head(10000))

200
    Country/Territory World Bank[6]
1              Monaco        234317
2       Liechtenstein        184083
3          Luxembourg        126426
4             Bermuda        118846
5             Ireland        104039
..                ...           ...
219        Madagascar           505
220       South Sudan          1072
221      Sierra Leone           461
222       Afghanistan           364
223           Burundi           238

[223 rows x 2 columns]


  df = df.drop(["IMF[4][5]", "United Nations[7]"], axis = 1)


In [70]:
# List of countries by Internet connection speeds, Fixed broadband, Average download speed (Mbit/s) (Ookla).

# The helper is defined as getDataFrameFromWikipedia; the earlier spelling
# "Wikepedia" raises NameError on a fresh kernel.
internet_speed = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_Internet_connection_speeds")

df = internet_speed
# Drop the rank and the other speed-test sources listed here.
df = df.drop(
    [
        "Rank",
        "Averagedownloadspeed(Mbit/s)(M-Lab)[2]",
        "Averagedownloadspeed(Mbit/s)(SpeedTestNet.io)",
        "Averagedownloadspeed(Mbit/s)(Speed-Test-Pros.com)[4]",
    ],
    axis=1,
)
# Give the remaining last column a readable label.
df.columns = [*df.columns[:-1], "Average download speed (Mbit/s)"]
print(df.head(10000))

200
    Country/Territory  Average download speed (Mbit/s)
0             Romania                           178.90
1         South Korea                           241.58
2           Hong Kong                           265.17
3              Monaco                           220.35
4           Singapore                           259.11
..                ...                              ...
96           Dominica                            37.62
97    North Macedonia                            37.20
98             Greece                            36.73
99         Bangladesh                            36.02
100           Senegal                            35.32

[101 rows x 2 columns]


In [110]:
# List of countries by alcohol consumption per capita, Recorded per capita consumption of pure alcohol (litres) per adult 15 years of age and over per year, 2016.
# The helper is defined as getDataFrameFromWikipedia; the earlier spelling
# "Wikepedia" raises NameError on a fresh kernel.
alcohol_consumption = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_alcohol_consumption_per_capita")

df = alcohol_consumption

# Drop the 1996 column so that only one value column is left after the country.
df = df.drop(["1996[7]"], axis=1)
# NOTE(review): "Alcohiol" is a typo for "Alcohol". The label is left unchanged
# because later cells may select the column by this exact name - fix both together.
df.columns = [*df.columns[:-1], "Alcohiol consumption per capita (litres) 2016"]
print(df.head(10000))

200
         Country Alcohiol consumption per capita (litres) 2016
0    Afghanistan                                           0.2
1        Albania                                           7.5
2        Algeria                                           0.9
3        Andorra                                          11.3
4         Angola                                           6.4
..           ...                                           ...
186    Venezuela                                           5.6
187      Vietnam                                           8.3
188        Yemen                                           0.1
189       Zambia                                           4.8
190     Zimbabwe                                           4.8

[191 rows x 2 columns]


In [117]:
# Intentional homicide rate by country: victims per 100,000 inhabitants (UNODC rate).
# The second wikitable on the page is the one requested.

homicide_rate = getDataFrameFromWikipedia(
    "https://en.wikipedia.org/wiki/List_of_countries_by_intentional_homicide_rate",
    2,
)

# Remove the columns that are not needed, then relabel the last remaining one.
df = homicide_rate.drop(columns=["Region", "Subregion", "Year", "Count"])
df.columns = df.columns[:-1].tolist() + ["Intentional homicide victims per 100,000 inhabitants"]

# Strip literal '*' characters and surrounding whitespace from the location
# names, then expose the column as 'Country'.
df = df.assign(
    Location=df['Location'].str.replace('*', '', regex=False).str.strip()
).rename(columns={'Location': 'Country'})

print(df.head(10000))

200
            Country  Intentional homicide victims per 100,000 inhabitants
0       Afghanistan                                                4.0   
1           Albania                                                2.3   
2           Algeria                                                1.6   
3    American Samoa                                                0.0   
4           Andorra                                                2.6   
..              ...                                                ...   
201       Venezuela                                               19.3   
202         Vietnam                                                1.5   
203           Yemen                                                6.3   
204          Zambia                                                5.2   
205        Zimbabwe                                                6.1   

[206 rows x 2 columns]
