In [7]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import country_converter as coco

## Get the data
The function bellow is used to extract the first table present in a wikipedia Page

In [8]:

def getDataFrameFromWikipedia(wikipedia_url, table_no=1):
    """
    returns a dataframe of the data from the wikipedia page. \n
    optionaly, if there are multiple tables on the page, you can specify which table to use in `table_no`.
    """
    response = requests.get(wikipedia_url)
    print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', {'class': 'wikitable'})
    
    if table_no >= 1 and table_no <= len(tables):
        df = pd.read_html(str(tables[table_no - 1]))[0]
        return df
    else:
        return None

## Normalise the country
- As there are no standerdised way to get countries, we use a library called `country_converter` to convert countries for us.
- Some country might not exist (Channel Islands, European Union, World...), in that case, we just drop the whole record for this country

In [9]:
def normaliseCountryNames(df):
    df.columns.values[0] = 'Country'
    df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))
    df = df[df['Country'] != 'not found']
    return df

In [10]:
def keepOnlyNumberInLastColumn(df):
    last_col = df.columns[-1]
    df.loc[:, last_col] = df[last_col].str.extract('(\d+)')
    df.loc[:, last_col] = pd.to_numeric(df[last_col], errors='coerce')

    return df

## All column

In [11]:
the_final_table = []

In [12]:
## GET GDP PER CAPITA

gdp_per_capita = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita")

df= gdp_per_capita

df = df.drop(["IMF[4][5]", "United Nations[7]"], axis = 1)
df = df.drop(0)
df.columns = df.columns.droplevel(-1)
df.columns = [*df.columns[:-1], 'years']
df.columns.values[0] = 'Country'

df = df.drop(["UN Region", "years"], axis = 1)


df = normaliseCountryNames(df)

the_final_table = df

print(df.head(10000)) 

200


  df = df.drop(["IMF[4][5]", "United Nations[7]"], axis = 1)
Channel Islands not found in regex
European Union[n 1] not found in regex
World not found in regex
Zanzibar not found in regex


           Country World Bank[6]
1           Monaco        234317
2    Liechtenstein        184083
3       Luxembourg        126426
4          Bermuda        118846
5          Ireland        104039
..             ...           ...
219     Madagascar           505
220    South Sudan          1072
221   Sierra Leone           461
222    Afghanistan           364
223        Burundi           238

[219 rows x 2 columns]


In [13]:
# List of countries by Internet connection speeds, Fixed broadband, Average download speed (Mbit/s) (Ookla).

internet_speed = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_Internet_connection_speeds")

df = internet_speed
df = df.drop(["Rank", "Averagedownloadspeed(Mbit/s)(M-Lab)[2]", "Averagedownloadspeed(Mbit/s)(SpeedTestNet.io)", "Averagedownloadspeed(Mbit/s)(Speed-Test-Pros.com)[4]"], axis = 1)
df.columns = [*df.columns[:-1], "Average download speed (Mbit/s)"]
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200
             Country  Average download speed (Mbit/s)
0            Romania                           178.90
1        South Korea                           241.58
2          Hong Kong                           265.17
3             Monaco                           220.35
4          Singapore                           259.11
..               ...                              ...
96          Dominica                            37.62
97   North Macedonia                            37.20
98            Greece                            36.73
99        Bangladesh                            36.02
100          Senegal                            35.32

[101 rows x 2 columns]


In [14]:
# List of countries by alcohol consumption per capita, Recorded per capita consumption of pure alcohol (litres) per adult 15 years of age and over per year, 2016.
alcohol_consumption = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_alcohol_consumption_per_capita")

df = alcohol_consumption

df = df.drop(["1996[7]"], axis = 1)
df.columns = [*df.columns[:-1], "Alcohiol consumption per capita (litres) 2016"]
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))

200


Netherlands Antilles not found in regex


         Country Alcohiol consumption per capita (litres) 2016
0    Afghanistan                                           0.2
1        Albania                                           7.5
2        Algeria                                           0.9
3        Andorra                                          11.3
4         Angola                                           6.4
..           ...                                           ...
186    Venezuela                                           5.6
187      Vietnam                                           8.3
188        Yemen                                           0.1
189       Zambia                                           4.8
190     Zimbabwe                                           4.8

[190 rows x 2 columns]


In [15]:
# List of countries by intentional homicide rate, Intentional homicide victims per 100,000 inhabitants. From UNODC, rate.

homicide_rate = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_intentional_homicide_rate",2)

df = homicide_rate

df = df.drop(["Region", "Subregion", "Year", "Count"], axis = 1)
df.columns = [*df.columns[:-1], "Intentional homicide victims per 100,000 inhabitants"]
df['Location'] = df['Location'].str.replace('*', '', regex=False).str.strip()
df = df.rename(columns={'Location': 'Country'})
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))

200


Northern Ireland not found in regex
Scotland not found in regex


            Country  Intentional homicide victims per 100,000 inhabitants
0       Afghanistan                                                4.0   
1           Albania                                                2.3   
2           Algeria                                                1.6   
3    American Samoa                                                0.0   
4           Andorra                                                2.6   
..              ...                                                ...   
201       Venezuela                                               19.3   
202         Vietnam                                                1.5   
203           Yemen                                                6.3   
204          Zambia                                                5.2   
205        Zimbabwe                                                6.1   

[204 rows x 2 columns]


In [16]:
# List of countries by military expenditures, List by the Stockholm International Peace Research Institute, % of GDP.

list_of_military_expendature = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_with_highest_military_expenditures", 1)

df = list_of_military_expendature

df = df[['Country', '% of GDP']]
df.columns = [*df.columns[:-1], "Military Expenditure % GDP"]

normaliseCountryNames(df)
the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df)

World total not found in regex


200
           Country  Military Expenditure % GDP
0        not found                         2.2
1    United States                         3.5
2            China                         1.6
3           Russia                         4.1
4            India                         2.4
5     Saudi Arabia                         7.4
6   United Kingdom                         2.2
7          Germany                         1.4
8           France                         1.9
9      South Korea                         2.7
10           Japan                         1.1
11         Ukraine                        34.0
12           Italy                         1.7
13       Australia                         1.9
14          Canada                         1.2
15          Israel                         4.5
16           Spain                         1.5
17          Brazil                         1.1
18          Poland                         2.4
19     Netherlands                         1.6
20       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))


In [17]:
## GET Human Development Index,

human_dev_index = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index", 2)

df= human_dev_index
# print(df)


df = df.drop(["Rank"], axis=1)

df = df[[('Nation', 'Nation'), ('HDI', '2021 data (2022\xa0report)\u200b[2]')]]
df.columns = df.columns.droplevel(0)

df.columns = [*df.columns[:-1], "Human Development Index"]
df = normaliseCountryNames(df)


the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 





200


  df = df.drop(["Rank"], axis=1)


                      Country  Human Development Index
0                 Switzerland                    0.962
1                      Norway                    0.961
2                     Iceland                    0.959
3                   Hong Kong                    0.952
4                   Australia                    0.951
..                        ...                      ...
186                   Burundi                    0.426
187  Central African Republic                    0.404
188                     Niger                    0.400
189                      Chad                    0.394
190               South Sudan                    0.385

[191 rows x 2 columns]


In [18]:
# Democracy Index, Democracy Index, 2020

democracy_index = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/The_Economist_Democracy_Index", 4)

df= democracy_index
df = df[['Country','2020']]
df.columns = [*df.columns[:-1], "Democracy Index (2020)"]
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 



200
           Country  Democracy Index (2020)
0           Canada                    9.24
1    United States                    7.92
2          Austria                    8.16
3          Belgium                    7.51
4           Cyprus                    7.56
..             ...                     ...
162       Tanzania                    5.10
163           Togo                    2.80
164         Uganda                    4.94
165         Zambia                    4.86
166       Zimbabwe                    3.16

[167 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))


In [19]:
# List of countries by tertiary education attainment,Countries by level of tertiary education, at least a 2-year tertiary degree or its equivalent %.

tertiary_education_attainment = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_tertiary_education_attainment", 2)

df= tertiary_education_attainment
df = df[[('Country', 'Country'), ('Ages 25–34:\xa0% equivalent to a degree course lasting at least:', '2 years')]]
df.columns = [*df.columns[:-1], "tertiary education attainment, 2 y"]
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200
           Country  tertiary education attainment, 2 y
0        Australia                                48.0
1          Austria                                39.0
2          Belgium                                43.0
3           Brazil                                16.0
4           Canada                                59.0
5            Chile                                36.0
6            China                                35.8
7         Colombia                                27.0
8       Costa Rica                                28.0
9          Czechia                                31.0
10         Denmark                                44.0
11         Estonia                                41.0
12         Finland                                41.0
13          France                                45.0
14         Germany                                30.0
15          Greece                                40.0
16         Hungary                                32.0
17    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))


In [20]:
# Importance of religion by country, Countries/Districts, Yes important.

importance_religion = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Importance_of_religion_by_country")

df= importance_religion
df = df.drop(["No, unimportant[1]"], axis=1)
df.columns = [*df.columns[:-1], "Importance of religion"]
df = normaliseCountryNames(df)
df = keepOnlyNumberInLastColumn(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200
        Country  Importance of religion
0        Sweden                      17
1       Denmark                      19
2       Estonia                      16
3         Japan                      24
4     Hong Kong                      24
..          ...                     ...
107   Sri Lanka                      99
108       Yemen                      99
109  Bangladesh                      99
110   Indonesia                      99
111       Niger                      99

[112 rows x 2 columns]


  df.loc[:, last_col] = pd.to_numeric(df[last_col], errors='coerce')


In [21]:
# Christianity by country, UN members and dependent territories, % Chriastian.

christianity_by_country = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Christianity_by_country", 3)

df= christianity_by_country
df = df[['Country or entity', '% Christian']]
df.columns = [*df.columns[:-1], "Importance of religion"]
df = normaliseCountryNames(df)
df = keepOnlyNumberInLastColumn(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200


Europe not found in regex
Latin America and the Caribbean not found in regex
Africa not found in regex
Asia not found in regex
North America not found in regex
Oceania not found in regex
Middle East-North Africa not found in regex
Total not found in regex


            Country  Importance of religion
0       Afghanistan                     0.0
1           Albania                    17.0
2           Algeria                     0.0
3    American Samoa                    98.0
4           Andorra                    89.0
..              ...                     ...
190       Venezuela                    88.0
191         Vietnam                     7.0
192           Yemen                     0.0
193          Zambia                    95.0
194        Zimbabwe                    87.0

[195 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))
  df.loc[:, last_col] = pd.to_numeric(df[last_col], errors='coerce')


In [22]:
#11. Islam by country, Muslim percentage of total population
islam_by_country = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Islam_by_country")
df = islam_by_country
df = df.drop(["Total Population", "Muslim Population", "Percentage of world (%)", "Sources"], axis = 1)
df = df.drop(["Unnamed: 6"], axis = 1)
df = df.rename(columns={'Country/Region': 'Country'})
df = normaliseCountryNames(df)
df = df.rename(columns={'Muslim percentage of total population': 'Muslim percentage of total population_islam'})

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')
print(df.head(10000))

200


Netherlands Antilles not found in regex


                       Country Muslim percentage of total population_islam
0                  Afghanistan                                        99.7
1                      Albania                                        58.8
2                      Algeria                                          99
3               American Samoa                                       < 0.1
4                      Andorra                                         2.6
..                         ...                                         ...
227  Wallis and Futuna Islands                                       < 0.1
228             Western Sahara                                        99.4
229                      Yemen                                        97.2
230                     Zambia                                           1
231                   Zimbabwe                                         0.7

[231 rows x 2 columns]


In [23]:
#12 Buddhism by country, Buddhist population by country, % population.
Buddhist_by_country = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Buddhism_by_country")

df = Buddhist_by_country
df = df.drop(["Pew estimates (2010)[1]", "Other estimates"], axis = 1)
df.columns = df.columns.droplevel(0)
df = df.drop(["Population", "No. of Buddhists", "Census Year"], axis = 1)
#print(df.columns)

df = df.rename(columns={'Country/Territory': 'Country'})
df = df.rename(columns={'% Buddhist': 'Buddhist population by country, % population'})
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))

200


  df = df.drop(["Pew estimates (2010)[1]", "Other estimates"], axis = 1)
World not found in regex


            Country Buddhist population by country, % population
0       Afghanistan                                          NaN
1           Albania                                          NaN
2           Algeria                                          NaN
3    American Samoa                                          NaN
4           Andorra                                          NaN
..              ...                                          ...
174       Venezuela                                          NaN
175         Vietnam                                          NaN
176           Yemen                                          NaN
177          Zambia                                          NaN
178        Zimbabwe                                          NaN

[179 rows x 2 columns]


In [24]:
#13 Jewish population by country, Table, Core population, pct
Jewish_by_country = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Jewish_population_by_country")
df = Jewish_by_country

df = df.drop(["Connected population", "Enlarged population","Eligible population","National official"], axis = 1)

df.columns = df.columns.droplevel(0)
df = df.drop(["Total", "pmp"], axis = 1)
df = df.rename(columns={'Countries': 'Country'})
df = df.rename(columns={'% pct': 'Core population, pct'})
#print(df.columns)

df = normaliseCountryNames(df)


the_final_table = pd.merge(the_final_table, df, on='Country', how='outer', suffixes=('_final', '_jewish'))


print(df.head(10000))

200


  df = df.drop(["Connected population", "Enlarged population","Eligible population","National official"], axis = 1)
Netherlands Antilles not found in regex
World not found in regex


                    Country  pct
0                    Israel  NaN
1             United States  NaN
2                    France  NaN
3                 Palestine  NaN
4                    Canada  NaN
..                      ...  ...
107                 Iceland    —
108              Montenegro    —
109  British Virgin Islands    —
110           Liechtenstein    —
111        Falkland Islands    —

[111 rows x 2 columns]


In [25]:
#14 List of countries by infant and under-five mortality rates,  Under-five mortality (deaths/1,000 live births) – 2019 estimates
mortality_rate = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_infant_and_under-five_mortality_rates",4)

df = mortality_rate
df = df.drop(["1950-1955", "1955-1960",'1960-1965', '1965-1970',
       '1970-1975', '1975-1980', '1980-1985', '1985-1990', '1990-1995',
       '1995-2000', '2000-2005', '2005-2010', '2010-2015', 'To CIA estimates'], axis = 1)
df = df.drop(0)

df = df.rename(columns={'Location [Note 1]': 'Country'})
df = df.rename(columns={'2015-2020': '2019 estimates'})

df['Country'] = df['Country'].str.rstrip(' *')

for index, row in df.iterrows():
    if '(' in row['Country'] and ')' in row['Country']:
        df.at[index, 'Country'] = row['Country'].split('(', 1)[1].split(')', 1)[0]

df.columns.values[0] = 'Country'


df = normaliseCountryNames(df)


the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))


200


disputed not found in regex
Congo  not found in regex
World not found in regex


                      Country  2019 estimates
1                     Iceland            1.25
2                       China            1.32
3                   Singapore            1.63
4                     Finland            1.71
5                       Japan            1.76
..                        ...             ...
197                      Mali           65.80
198                   Somalia           69.31
199                      Chad           74.50
200  Central African Republic           81.90
201              Sierra Leone           80.77

[199 rows x 2 columns]


In [26]:
#15 Age of criminal responsibility, By country, age reduced

Age= getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Age_of_criminal_responsibility")

df = Age


df = df.drop(["Age (full)[b]", "Ref","Notes"], axis = 1)

df = df.rename(columns={'Country': 'Country'})
df = df.rename(columns={'Age (reduced)[a]': 'By country, age reduced'})

for index, row in df.iterrows():
    if '(' in row['Country'] and ')' in row['Country']:
        df.at[index, 'Country'] = row['Country'].split('(', 1)[1].split(')', 1)[0].strip()
    else:
        df.at[index, 'Country'] = row['Country'].strip()
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer', suffixes=('_final', '_age_reduced'))

print(df.head(10000))

200
         Country By country, age reduced
0    Afghanistan                      12
1        Albania                      14
2        Algeria                      13
3        Andorra                      12
4         Angola                      14
..           ...                     ...
165      Vanuatu                      10
166      Vietnam                      14
167        Yemen                       7
168       Zambia                       8
169     Zimbabwe                       7

[170 rows x 2 columns]


In [27]:
#16 List of countries by minimum wage, List, Annual, US$.
list = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_minimum_wage")
df = list

df = df.drop(["Minimum wage", "Workweek (hours)[4]","Hourly","Effective per"], axis = 1)


df.columns = df.columns.droplevel(0)

df = df.drop(["PPP (Int$)[7]"], axis = 1)
df = df.drop(0)
df = df.iloc[:, :-1]

df = df.rename(columns={'Country': 'Country'})
df = df.rename(columns={'Nominal (US$)[6]': 'Annual US$'})
#print(df.columns)

df = normaliseCountryNames(df)


the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))


200


  df = df.drop(["Minimum wage", "Workweek (hours)[4]","Hourly","Effective per"], axis = 1)


         Country  Annual US$
1    Afghanistan      858.00
2        Albania     4637.00
3        Algeria     1777.00
4        Andorra    18253.00
5         Angola      663.00
..           ...         ...
197    Venezuela       10.32
198      Vietnam     1591.00
199        Yemen         NaN
200       Zambia      596.00
201     Zimbabwe         NaN

[201 rows x 2 columns]


In [28]:
#17 List of countries by external debt, List of countries with respect to external debt, % GDP
list_countries = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_external_debt")
df = list_countries


df = df.drop(["Per capita US dollars", "External debt US dollars","% of total wealth[1]","% of total wealth[1]","Date"], axis = 1)

df = df.rename(columns={'Country/Region': 'Country'})
df = df.rename(columns={'of GDP': 'external debt % GDP'})

for index, row in df.iterrows():
    if '(' in row['Country'] and ')' in row['Country']:
        df.at[index, 'Country'] = row['Country'].split('(', 1)[1].split(')', 1)[0]

df.columns.values[0] = 'Country'

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))

200
            Country  % of GDP
0     United States    122.56
1    United Kingdom    287.08
2             Japan     98.44
3       Netherlands    381.62
4            France    112.06
..              ...       ...
205  United Kingdom      5.00
206          France      6.00
207  United Kingdom      2.00
208   Liechtenstein      0.00
209     New Zealand      0.00

[210 rows x 2 columns]


In [29]:
#18 List of countries by income equality, UN, World Bank and CIA list – income ratios and Gini indices, WB Gini %.
list = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_income_equality")
df = list

df = df.drop(["Subregion", "Region","UN R/P","CIA R/P[4]"], axis = 1)
df.columns = df.columns.droplevel(0)
df = df.drop(["Year"], axis = 1)
df = df.drop(0)


df = df.rename(columns={'Country': 'Country'})
df = df.rename(columns={'%': 'WB Gini %'})
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))

200


  df = df.drop(["Subregion", "Region","UN R/P","CIA R/P[4]"], axis = 1)
European Union not found in regex
World not found in regex


       Country  WB Gini %
1      Albania       29.4
2      Algeria       27.6
3       Angola       51.3
4    Argentina       42.0
5      Armenia       27.9
..         ...        ...
165  Venezuela       44.8
166    Vietnam       36.8
167      Yemen       36.7
168     Zambia       55.9
169   Zimbabwe       50.3

[168 rows x 2 columns]


In [30]:
#19. List of countries by total health expenditure per capita, Total health expenditure per capita in 2018 PPP international U.S. dollars, 2018.
list = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita",2)
df = list

df = df.drop(["2019", "2020","2021"], axis = 1)
df = df.rename(columns={'Location': 'Country'})
df = df.rename(columns={'2018': 'Total health 2018'})

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))

200


Congo * not found in regex


                 Country  Total health 2018
0            Afghanistan                 71
1                Algeria                258
2                Andorra               3188
3                 Angola                 84
4    Antigua and Barbuda                850
..                   ...                ...
183              Vanuatu                102
184            Venezuela                165
185              Vietnam                164
186               Zambia                 75
187             Zimbabwe                115

[187 rows x 2 columns]


In [31]:
#20 List of countries by suicide rate, Suicide rates by sex and country, All.
list_suicide  = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_suicide_rate")
df = list_suicide

df = df.drop(["Male", "Female"], axis = 1)

df = df.rename(columns={'Country': 'Country'})
df = df.rename(columns={'All': 'All Suicide rates by sex and country'})

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')


print(df.head(10000))

200


Africa not found in regex
Americas not found in regex
South-East Asia not found in regex
Europe not found in regex
Eastern Mediterranean not found in regex
Western Pacific not found in regex
Global not found in regex


                 Country  All Suicide rates by sex and country
0            Afghanistan                                   6.0
1                Albania                                   3.7
2                Algeria                                   2.6
3                 Angola                                  12.6
4    Antigua and Barbuda                                   0.3
..                   ...                                   ...
178            Venezuela                                   2.1
179              Vietnam                                   7.2
180                Yemen                                   7.1
181               Zambia                                  14.4
182             Zimbabwe                                  23.6

[183 rows x 2 columns]


In [32]:
print(the_final_table.head(10000))

the_final_table.to_csv('the_final_table.csv', index=False)


                       Country World Bank[6]  Average download speed (Mbit/s)  \
0                       Monaco        234317                           220.35   
1                Liechtenstein        184083                           193.79   
2                   Luxembourg        126426                           154.26   
3                      Bermuda        118846                              NaN   
4                      Ireland        104039                           112.81   
..                         ...           ...                              ...   
296                    Tokelau           NaN                              NaN   
297  Wallis and Futuna Islands           NaN                              NaN   
298             Western Sahara           NaN                              NaN   
299                   Guernsey           NaN                              NaN   
300                     Jersey           NaN                              NaN   

    Alcohiol consumption pe