## Merging DataFrames

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the five input CSV files from the notebook's working directory.
# The paths are listed in the same order as the names being assigned.
employment_df, hdi_df, rd_extended_df, patents_df, education_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './employment_df.csv',
        './hdi_clean.csv',
        './RD_extended_df.csv',
        './patents_data_clean.csv',
        './educ_filtered.csv',
    )
)

### Employment DataFrame

In [2]:
# Show the employment table, followed by the dtype of each of its columns.
print('\nemployment_df:\n')
display(employment_df)
print('\ndata types per column:\n', employment_df.dtypes, sep='\n')


employment_df:



Unnamed: 0,country_code,area,year,statistic_type,statistic_value
0,1,World,2005,Labour force participation,62.9
1,1,World,2005,Unemployment rate,6.3
2,1,World,2010,Labour force participation,62.0
3,1,World,2010,Unemployment rate,6.3
4,1,World,2015,Labour force participation,60.7
...,...,...,...,...,...
1833,97,European Union,2010,Unemployment rate,9.8
1834,97,European Union,2015,Labour force participation,56.8
1835,97,European Union,2015,Unemployment rate,10.0
1836,97,European Union,2023,Labour force participation,57.6



data types per column:

country_code         int64
area                object
year                 int64
statistic_type      object
statistic_value    float64
dtype: object


In [3]:
# Count the null entries in each column of employment_df.
print('\nmissing values per column:\n', employment_df.isna().sum(), sep='\n')


missing values per column:

country_code       0
area               0
year               0
statistic_type     0
statistic_value    0
dtype: int64


### HDI (Human Development Index) DataFrame

In [4]:
# Rename 'iso3' -> 'country_iso' and 'country' -> 'area' so the column names
# line up with the 'area' / 'country_iso' names shown for the other tables.
# The result is reassigned instead of using inplace=True: the step stays
# explicit, and re-running the cell is harmless because rename() ignores
# labels that are no longer present (its default errors='ignore').
hdi_df = hdi_df.rename(columns={'iso3': 'country_iso', 'country': 'area'})

# Show the renamed table, then the dtype of each column.
print('\n hdi_df: \n')
display(hdi_df)
print('\ndata types per column:\n')
print(hdi_df.dtypes)


 hdi_df: 



Unnamed: 0,country_iso,area,human_development_groups,undp_developing_regions,year,human_development_index_(hdi)
0,AFG,Afghanistan,Low,SA,2000,0.335
1,AGO,Angola,Medium,SSA,2000,0.375
2,ALB,Albania,High,ECA,2000,0.677
3,AND,Andorra,Very High,,2000,0.818
4,ARE,United Arab Emirates,Very High,AS,2000,0.796
...,...,...,...,...,...,...
4285,WSM,Samoa,High,EAP,2021,0.707
4286,YEM,Yemen,Low,AS,2021,0.455
4287,ZAF,South Africa,High,SSA,2021,0.713
4288,ZMB,Zambia,Medium,SSA,2021,0.565



data types per column:

country_iso                       object
area                              object
human_development_groups          object
undp_developing_regions           object
year                               int64
human_development_index_(hdi)    float64
dtype: object


In [5]:
# Count the null entries in each column of hdi_df.
print('\nmissing values per column:\n', hdi_df.isna().sum(), sep='\n')


missing values per column:

country_iso                        0
area                               0
human_development_groups          88
undp_developing_regions          968
year                               0
human_development_index_(hdi)    189
dtype: int64


### Extended R&D DataFrame
We will use this DataFrame as the reference for the merge.

In [6]:
# Rename 'location' -> 'country_iso', the same name given to hdi_df's ISO
# code column. The result is reassigned instead of using inplace=True: the
# step stays explicit, and re-running the cell is harmless because rename()
# ignores labels that are no longer present (its default errors='ignore').
rd_extended_df = rd_extended_df.rename(columns={'location': 'country_iso'})

# Show the renamed table, then the dtype of each column.
print('\nrd_extended_df:\n')
display(rd_extended_df)
print('\ndata types per column:\n')
print(rd_extended_df.dtypes)


rd_extended_df:



Unnamed: 0,area,year,series,value,country_iso,country_code
0,World,2005,Gross domestic expenditure on R&D: as a percen...,1.50000,,1.0
1,World,2005,Researchers per million inhabitants (FTE),907.20000,,1.0
2,World,2010,Gross domestic expenditure on R&D: as a percen...,1.60000,,1.0
3,World,2010,Researchers per million inhabitants (FTE),1022.10000,,1.0
4,World,2015,Gross domestic expenditure on R&D: as a percen...,1.70000,,1.0
...,...,...,...,...,...,...
1751,Venezuela,2016,,0.69000,VEN,
1752,Papua New Guinea,2016,,0.03158,PNG,
1753,Ivory Coast,2016,,0.06975,CIV,
1754,Angola,2016,,0.03229,AGO,



data types per column:

area             object
year              int64
series           object
value           float64
country_iso      object
country_code    float64
dtype: object


In [7]:
# Count the null entries in each column of rd_extended_df.
print('\nmissing values per column:\n', rd_extended_df.isna().sum(), sep='\n')


missing values per column:

area              0
year              0
series          773
value             0
country_iso     983
country_code    773
dtype: int64


### Patents DataFrame

In [8]:
# Rename 'region' -> 'country_code' and 'country' -> 'area' so the column
# names line up with those shown for employment_df. The result is reassigned
# instead of using inplace=True: the step stays explicit, and re-running the
# cell is harmless because rename() ignores labels that are no longer present
# (its default errors='ignore').
patents_df = patents_df.rename(columns={'region': 'country_code', 'country': 'area'})

# Show the renamed table, then the dtype of each column.
print('\npatents_df:\n')
display(patents_df)
print('\ndata types per column:\n')
print(patents_df.dtypes)


patents_df:



Unnamed: 0,country_code,area,year,patents,number_patents,patents_mapped
0,1,"Total, all countries or areas",1985,Grants of patents (number),,1
1,1,"Total, all countries or areas",1995,Grants of patents (number),,1
2,1,"Total, all countries or areas",2005,Grants of patents (number),,1
3,1,"Total, all countries or areas",2010,Grants of patents (number),,1
4,1,"Total, all countries or areas",2019,Grants of patents (number),,1
...,...,...,...,...,...,...
1855,716,Zimbabwe,2020,Grants of patents (number),6.0,1
1856,716,Zimbabwe,2021,Grants of patents (number),10.0,1
1857,716,Zimbabwe,2019,Patents in force (number),14.0,3
1858,716,Zimbabwe,2020,Patents in force (number),4.0,3



data types per column:

country_code        int64
area               object
year                int64
patents            object
number_patents    float64
patents_mapped      int64
dtype: object


In [9]:
# Count the null entries in each column of patents_df.
print('\nmissing values per column:\n', patents_df.isna().sum(), sep='\n')


missing values per column:

country_code        0
area                0
year                0
patents             0
number_patents    615
patents_mapped      0
dtype: int64


### Education DataFrame

In [10]:
# Show the education table, followed by the dtype of each of its columns.
# NOTE(review): this frame keeps its numeric code column as 'region', whereas
# the patents cell renames 'region' to 'country_code' — confirm that the
# difference is intended before merging on that key.
print('\neducation_df:\n')
display(education_df)
print('\ndata types per column:\n', education_df.dtypes, sep='\n')


education_df:



Unnamed: 0,region,area,year,public_expenditure_education
0,4,Afghanistan,2010,3.5
1,4,Afghanistan,2015,3.3
2,4,Afghanistan,2019,3.2
3,4,Afghanistan,2020,2.9
4,8,Albania,2005,3.3
...,...,...,...,...
878,894,Zambia,2017,3.7
879,894,Zambia,2020,3.9
880,716,Zimbabwe,2010,1.5
881,716,Zimbabwe,2014,6.1



data types per column:

region                            int64
area                             object
year                              int64
public_expenditure_education    float64
dtype: object


In [11]:
# Count the null entries in each column of education_df.
print('\nmissing values per column:\n', education_df.isna().sum(), sep='\n')


missing values per column:

region                          0
area                            0
year                            0
public_expenditure_education    0
dtype: int64
