In [130]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import country_converter as coco


## Get the data
The function bellow is used to extract the first table present in a wikipedia Page

In [131]:

def getDataFrameFromWikipedia(wikipedia_url, table_no=1):
    """
    returns a dataframe of the data from the wikipedia page. \n
    optionaly, if there are multiple tables on the page, you can specify which table to use in `table_no`.
    """
    response = requests.get(wikipedia_url)
    print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', {'class': 'wikitable'})

    if table_no >= 1 and table_no <= len(tables):
        df = pd.read_html(str(tables[table_no - 1]))[0]
        return df
    else:
        return None


## Normalise the country
- As there are no standerdised way to get countries, we use a library called `country_converter` to convert countries for us.
- Some country might not exist (Channel Islands, European Union, World...), in that case, we just drop the whole record for this country

In [132]:
def normaliseCountryNames(df):
    df.columns.values[0] = 'Country'
    df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))
    df['Country'] = df['Country'].str.rstrip('*')
    df = df[df['Country'] != 'not found']
    return df


## All column

In [133]:
the_final_table = []


In [147]:
# 21. List of sovereign states and dependencies by total fertility rate, 2019 List by the World Bank, 2019.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_total_fertility_rate", 3)

df = df.drop(["Rank"], axis = 1)
df = normaliseCountryNames(df)
df.fillna('')

the_final_table = df

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte (France)
More than one regular expression match for Mayotte

      Country Fertility rate in 2019 (births/woman)
0       Niger                                   6.8
1     Somalia                                   6.0
2    DR Congo                                   5.8
3        Mali                                   5.8
4        Chad                                   5.6
..        ...                                   ...
251  Pitcairn                                     -
252       NaN                                     -
253       NaN                                     -
254    Tuvalu                                     -
255   Vatican                                     -

[223 rows x 2 columns]


In [148]:
# 22. Tobacco consumption by country, Prevalence of current tobacco use among persons aged 15 years and older, 2000.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Tobacco_consumption_by_country")
df = normaliseCountryNames(df)
df = df.rename(columns={'Cigarettes': 'Annual Cigarette Consumption per person aged 15 or older, 2016'})

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
Congo * not found in regex


                 Country  \
0                Andorra   
1             Luxembourg   
2                Belarus   
3        North Macedonia   
4                Albania   
..                   ...   
176  Antigua and Barbuda   
177                Ghana   
178           Mauritania   
179        Guinea-Bissau   
180    Brunei Darussalam   

     Annual Cigarette Consumption per person aged 15 or older, 2016  
0                                               6398.3               
1                                               6330.9               
2                                               2911.3               
3                                               2784.9               
4                                               2491.6               
..                                                 ...               
176                                               89.2               
177                                               40.5               
178                              

In [149]:
# 23. List of countries by obesity rate, list, 2016 %.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_obesity_rate")
df = normaliseCountryNames(df)
df = df.rename(columns={'Obesity rate (%)': 'Obesity rate(%), 2016'})

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]


              Country  Obesity rate(%), 2016
0               Nauru                   61.0
1        Cook Islands                   55.9
2               Palau                   55.3
3    Marshall Islands                   52.9
4              Tuvalu                   51.6
..                ...                    ...
187          Cambodia                    3.9
188             India                    3.9
189       Timor-Leste                    3.8
190        Bangladesh                    3.6
191           Vietnam                    2.1

[192 rows x 2 columns]


In [150]:
# 24. List of countries by number of Internet users, Table, %.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users", 3)
df.columns.values[0] = 'Country'
df = df[['Country', 'Pct']]
df = df.rename(columns={'Pct': 'Internet users (%)'})

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, China
More than one regular expression match for Macau, Chin

                       Country Internet users (%)
0                        China              76.4%
1                        India              62.6%
2                United States              92.4%
3                    Indonesia              78.8%
4                       Brazil              77.1%
..                         ...                ...
209                 St. Helena              53.8%
210           Falkland Islands              76.5%
211                 Montserrat              64.1%
212  Wallis and Futuna Islands              11.9%
213                       Niue              53.4%

[214 rows x 2 columns]


In [151]:
# 25. List of countries by median age, Median age per CIA World Factbook 2018 and 2020 (ranked) estimates, 2020 median combined.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_median_age")

df.columns = [' '.join(col).strip() for col in df.columns.values]
df.columns.values[0] = 'Country'
df = df[['Country', 'Median ages in years 2020 medians Combined']]

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
European Union not found in regex
Virgin Islands not found in regex
World not found in regex


            Country  Median ages in years 2020 medians Combined
0       Afghanistan                                        19.5
1           Albania                                        34.3
2           Algeria                                        28.9
3    American Samoa                                        27.2
4           Andorra                                        46.2
..              ...                                         ...
224       Palestine                                        21.9
225  Western Sahara                                        21.5
227           Yemen                                        19.8
228          Zambia                                        16.9
229        Zimbabwe                                        20.5

[227 rows x 2 columns]


In [152]:
# 26. List of countries by economic freedom, 2019 Economic Freedom of the World Index, score.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_economic_freedom")
df = df[['Country', 'Score']]
df = df.rename(columns={'Score': 'Economic Freedom Score, 2019'})

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]


            Country  Economic Freedom Score, 2019
0         Hong Kong                          90.2
1         Singapore                          89.4
2       New Zealand                          84.4
3       Switzerland                          81.9
4         Australia                          80.9
..              ...                           ...
175  Congo Republic                          39.7
176         Eritrea                          38.9
177            Cuba                          27.8
178       Venezuela                          25.9
179     North Korea                           5.9

[180 rows x 2 columns]


In [153]:
# 27. List of countries by oil production, per capita 2017. Valeur absente = 0, valeur manquante [–] = comme les autres.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_oil_production", 2)
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
World not found in regex


          Country  Oil production April 2022 (bbl/day)[1]
0   United States                                11884154
1    Saudi Arabia                                10644394
2          Russia                                10278370
3          Canada                                 4543800
4            Iraq                                 4470506
..            ...                                     ...
92         Taiwan                                     196
93       Slovakia                                      43
94        Morocco                                      25
95         Jordan                                      20
96          Spain                                      19

[97 rows x 2 columns]


In [154]:
# 28. List of countries by population growth rate, Table, UN 2015-20.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_population_growth_rate")
df.columns = [' '.join(col).strip() for col in df.columns.values]

df.columns.values[0] = 'Country'
df.columns.values[-1] = 'Population growth rate (%) (2015-2020)'
df = df.iloc[:, [0, -1]]
df = df.fillna('')

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


  df = pd.read_html(str(tables[table_no - 1]))[0]
 not found in regex
World not found in regex
Asia/Other non-specified areas not found in regex


200


Congo * not found in regex
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands)
More than one regular expression match for Aruba * (Netherlands

                          Country Population growth rate (%) (2015-2020)
3                     Afghanistan                                   2.41
4                         Albania                                   0.13
5                         Algeria                                   1.67
6                         Andorra                                  -0.21
7                          Angola                                   3.28
..                            ...                                    ...
237        British Virgin Islands                                       
238  United States Virgin Islands                                       
239                           NaN                                       
240                     Palestine                                       
241                     Palestine                                       

[237 rows x 2 columns]


In [155]:
# 29. List of countries by life expectancy, List by the World Health Organization (2019), Life expectancy at birth, All.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_life_expectancy", 3)
df.columns = [' '.join(col).strip() for col in df.columns.values]
df.columns.values[0] = 'Country'
df.columns.values[1] = 'Life expectancy at birth, 2019'

df = df.iloc[:, [0, 1]]
df = df.fillna('')
df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))



200


  df = pd.read_html(str(tables[table_no - 1]))[0]
 not found in regex
World not found in regex


                      Country Life expectancy at birth, 2019
1                       Japan                           84.3
2                 Switzerland                           83.4
3                 South Korea                           83.3
4                   Singapore                           83.2
5                       Spain                           83.2
..                        ...                            ...
180                Mozambique                           58.1
181                  Eswatini                           57.7
182                   Somalia                           56.5
183  Central African Republic                           53.1
184                   Lesotho                           50.7

[183 rows x 2 columns]


In [156]:
# 30. List of countries by meat consumption, Meat consumption by country, KG per capita 2002.

df = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption")
df.columns.values[0] = 'Country'
df.columns.values[1] = 'Meat consumption (kg per capita), 2002'
df = df.iloc[:, [0, 1]]
df = df.fillna('')

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))


200


  df = pd.read_html(str(tables[table_no - 1]))[0]
Netherlands Antilles not found in regex


                 Country Meat consumption (kg per capita), 2002
0                Albania                                   38.2
1                Algeria                                   18.4
2         American Samoa                                   24.9
3                 Angola                                   25.0
4    Antigua and Barbuda                                   56.0
..                   ...                                    ...
182            Venezuela                                   56.6
183              Vietnam                                   28.6
184                Yemen                                   14.7
185               Zambia                                   11.9
186             Zimbabwe                                   15.2

[186 rows x 2 columns]


In [157]:
print(the_final_table.head(10000))

the_final_table.to_csv('the_final_table.csv', index=False)


                               Country Fertility rate in 2019 (births/woman)  \
0                                Niger                                   6.8   
1                              Somalia                                   6.0   
2                             DR Congo                                   5.8   
3                                 Mali                                   5.8   
4                                 Chad                                   5.6   
..                                 ...                                   ...   
779  Bonaire, Saint Eustatius and Saba                                   NaN   
780                      French Guiana                                   NaN   
781                            Tokelau                                   NaN   
782                         Guadeloupe                                   NaN   
783                            Reunion                                   NaN   

     Annual Cigarette Consumption per p