In [149]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import country_converter as coco

## Get the data
The function bellow is used to extract the first table present in a wikipedia Page

In [150]:

def getDataFrameFromWikipedia(wikipedia_url, table_no):
    """
    returns a dataframe of the data from the wikipedia page. \n
    optionaly, if there are multiple tables on the page, you can specify which table to use in `table_no`.
    """
    response = requests.get(wikipedia_url)
    print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', {'class': 'wikitable'})
    
    if table_no is not None:
        if table_no >= 1 and table_no <= len(tables):
            df = pd.read_html(str(tables[table_no - 1]))[0]
            return df
        else:
            return None
    else:
        # Merge all tables into one DataFrame
        dfs = [pd.read_html(str(table))[0] for table in tables]
        df_merged = pd.concat(dfs, ignore_index=True)
        return df_merged
        

## Normalise the country
- As there are no standerdised way to get countries, we use a library called `country_converter` to convert countries for us.
- Some country might not exist (Channel Islands, European Union, World...), in that case, we just drop the whole record for this country

In [151]:
def normaliseCountryNames(df):
    df.columns.values[0] = 'Country'
    df['Country'] = df['Country'].apply(lambda x: coco.convert(names=x, to='name_short', not_found='not found'))
    df = df[df['Country'] != 'not found']
    return df

## All column

In [152]:
the_final_table = []

In [164]:
#31. List of countries by incarceration rate

incarceration_rate = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_incarceration_rate", 1)

df = incarceration_rate

df = df.drop("Counts[3]", axis=1)
df = df.drop(0)
df.columns = [*df.columns[:-1], "Incarceration rates (per 100 000)"]

df = normaliseCountryNames(df)

the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200


Republika Srpska not found in regex
Northern Ireland * [Note] not found in regex
Scotland * [Note] not found in regex


TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed

In [154]:
#32. List of countries by literacy rate

literacy_rate = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate", 2)

df = literacy_rate

df = df.drop([('Literacy rate[10][11]', 'Male'), ('Literacy rate[10][11]', 'Female'), ('Literacy rate[10][11]', 'Gender Gap'), ('Literacy rate[10][11]', 'Year')], axis=1) 
df.columns = [*df.columns[:-1], "Literacy rate"]

df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 

200


Congo * not found in regex
World not found in regex


         Country Literacy rate
0    Afghanistan         37.3%
1        Albania         98.1%
2        Algeria         81.4%
3        Andorra        100.0%
4         Angola         71.1%
..           ...           ...
190    Venezuela         97.1%
191      Vietnam         95.8%
192        Yemen         70.1%
193       Zambia         86.7%
194     Zimbabwe         86.5%

[194 rows x 2 columns]


In [155]:
#33. List of countries by age at first marriage

age_first_marriage = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_age_at_first_marriage", None)
    
df = age_first_marriage
    
df.drop(["Men", "Women", "Age Gap", "Age Ratio", "Year", "Source"], axis=1, inplace = True)
df.columns = [*df.columns[:-1], "Age at first marriage"]
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000)) 
print("\n")

200


Congo[a] not found in regex
South Korea (more info) not found in regex


             Country  Average  Age at first marriage
0            Algeria     30.3                    NaN
1             Angola     23.1                    NaN
2              Benin     21.3                    NaN
3           Botswana     28.7                    NaN
4       Burkina Faso     21.1                    NaN
..               ...      ...                    ...
188            Samoa      NaN                   26.4
189  Solomon Islands      NaN                   23.0
190            Tonga      NaN                   24.3
191           Tuvalu      NaN                   24.7
192          Vanuatu      NaN                   22.6

[191 rows x 3 columns]


In [156]:
#34. List of countries by spending on education as percentage of GDP
 
spending_education = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_spending_on_education_as_percentage_of_GDP", 1)
    
df = spending_education
    
df.drop(["Year", "Source"], axis=1, inplace= True)
df.columns = [*df.columns[:-1], "Spending on education (% of GDP)"]
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))    
    

200
                    Country Spending on education (% of GDP)
0          Marshall Islands                             15.8
1                      Cuba                             12.9
2     Micronesia, Fed. Sts.                             12.5
3                  Kiribati                             12.0
4                   Somalia                              9.6
..                      ...                              ...
193                 Somalia                             n.a.
194  Bosnia and Herzegovina                             n.a.
195              Montenegro                             n.a.
196             North Korea                             n.a.
197        Papua New Guinea                             n.a.

[198 rows x 2 columns]


In [157]:
#35. List of sovereign states by homeless population

homeless_pop = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_sovereign_states_by_homeless_population", 1)
    
df = homeless_pop

df.drop(["Homeless (average day)", "Data year", "Unsheltered per 10k", "Main article, other notes", "Unnamed: 6"], axis=1, inplace=True)
df.columns = [*df.columns[:-1], "Homeless population per 10k"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))    

200
          Country  Homeless population per 10k
0     Afghanistan                       1180.0
1         Albania                        121.0
2       Australia                         48.0
3         Austria                         25.4
4      Azerbaijan                        725.0
..            ...                          ...
86  United States                         17.5
87        Vatican                          0.0
88        Vietnam                         16.6
89          Yemen                       1294.0
90       Zimbabwe                        848.0

[91 rows x 2 columns]


In [158]:
#36. List of countries by milk consumption per capita

milk_consumption = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_milk_consumption_per_capita", 1)
    
df = milk_consumption

df.drop(["Rank", "Change in rank 2013/2007", "Milk consumption 2007 (kg/capita/yr) [2]"], axis=1, inplace=True)
df.columns = [*df.columns[:-1], "Milk consumption per capita"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))  

200


Netherlands Antilles not found in regex


         Country  Milk consumption per capita
0        Finland                       430.76
1     Montenegro                       349.21
2    Netherlands                       341.47
3         Sweden                       341.23
4    Switzerland                       318.69
..           ...                          ...
178  North Korea                         3.79
179     Cambodia                         3.47
180         Laos                         2.92
181      Liberia                         3.04
182     DR Congo                          NaN

[182 rows x 2 columns]


In [159]:
#37. List of countries by number of scientific and technical journal articles

sci_publi = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_number_of_scientific_and_technical_journal_articles", 1)
    
df = sci_publi

df.drop(["Rank", "Scientific publications per capita (per million)"], axis=1, inplace=True)
df.columns = [*df.columns[:-1], "Number of scientific publications"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))  

200


World not found in regex


                            Country  Number of scientific publications
0                             China                             744042
1                     United States                             624554
2                    United Kingdom                             198500
3                             India                             191590
4                           Germany                             174524
..                              ...                                ...
224  Svalbard and Jan Mayen Islands                                  3
225                          Tuvalu                                  2
226         St. Pierre and Miquelon                                  1
227                         Tokelau                                  1
228     French Southern Territories                                  1

[229 rows x 2 columns]


In [160]:
#38. Books published per country per year

books_publi = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/Books_published_per_country_per_year", 1)
    
df = books_publi

df.drop(["Rank", "Year", "Notes", "References"], axis=1, inplace=True)
df.columns = [*df.columns[:-1], "Books published per year"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))  


200
            Country Books published per year
0     United States                   275232
1             China                   208418
2    United Kingdom                   186000
3             Japan                   139078
4         Indonesia                   135081
..              ...                      ...
128          Angola                       22
129          Gambia                       14
130            Mali                       14
131    Burkina Faso                       12
132            Oman                        7

[133 rows x 2 columns]


In [162]:
#39. List of countries by food energy intake

kilocal = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_food_energy_intake", 1)
    
df = kilocal

df.drop(["Rank", ('Average daily dietary energy consumption per capita[8]', 'Year')], axis=1, inplace=True)
df.columns = [*df.columns[:-1], "Average daily dietary energy consumption per capita (kcal)"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))  

200


  df.drop(["Rank", ('Average daily dietary energy consumption per capita[8]', 'Year')], axis=1, inplace=True)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression match for New Caledonia (France)
More than one regular expression

                      Country  Average kilocalories consumption per capita
0                     Ireland                                         3885
1               United States                                         3782
2                     Belgium                                         3769
3                     Türkiye                                         3711
4                     Austria                                         3695
..                        ...                                          ...
166                    Zambia                                         2002
167                    Uganda                                         1981
168                Madagascar                                         1938
169                  Zimbabwe                                         1908
170  Central African Republic                                         1786

[171 rows x 2 columns]


In [163]:
#40. List of countries by average yearly temperature

temp_celsius_avg = getDataFrameFromWikipedia("https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature", 1)
    
df = temp_celsius_avg

df.columns = [*df.columns[:-1], "Average yearly temperature from 1961 to 1990 (°C)"] 
    
df = normaliseCountryNames(df)

#the_final_table = pd.merge(the_final_table, df, on='Country', how='outer')

print(df.head(10000))  


200
          Country Average yearly temperature from 1961 to 1990 (°C)
0    Burkina Faso                                             28.29
1            Mali                                             28.25
2        Kiribati                                             28.20
3        Djibouti                                             28.00
4        Maldives                                             28.00
..            ...                                               ...
187       Finland                                             01.55
188        Norway                                             01.50
189       Iceland                                             −0.70
190        Canada                                             −5.10
191        Russia                                             −5.35

[192 rows x 2 columns]
