In [59]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# 0. Initial Data Loading and Exploration

## HDI

In [60]:
hdi_df = pd.read_csv('data/HDR21-22_Composite_indices_complete_time_series.csv')
hdi_df.sample(3)

Unnamed: 0,iso3,country,hdicode,region,hdi_rank_2021,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,mf_2012,mf_2013,mf_2014,mf_2015,mf_2016,mf_2017,mf_2018,mf_2019,mf_2020,mf_2021
141,POL,Poland,Very High,,34.0,0.716,0.713,0.718,0.731,0.741,...,18.03,16.55,17.12,17.55,17.96,17.93,17.78,17.85,17.85,17.85
104,LTU,Lithuania,Very High,,35.0,0.734,0.729,0.714,0.697,0.693,...,38.46,39.23,41.49,36.91,37.99,39.93,38.38,40.13,40.13,40.13
50,ECU,Ecuador,High,LAC,95.0,0.651,0.652,0.658,0.662,0.667,...,7.84,10.1,10.32,9.0,7.65,8.06,7.86,7.74,7.74,7.74


In [61]:
# Exploring the columns
hdi_df.columns.tolist()

['iso3',
 'country',
 'hdicode',
 'region',
 'hdi_rank_2021',
 'hdi_1990',
 'hdi_1991',
 'hdi_1992',
 'hdi_1993',
 'hdi_1994',
 'hdi_1995',
 'hdi_1996',
 'hdi_1997',
 'hdi_1998',
 'hdi_1999',
 'hdi_2000',
 'hdi_2001',
 'hdi_2002',
 'hdi_2003',
 'hdi_2004',
 'hdi_2005',
 'hdi_2006',
 'hdi_2007',
 'hdi_2008',
 'hdi_2009',
 'hdi_2010',
 'hdi_2011',
 'hdi_2012',
 'hdi_2013',
 'hdi_2014',
 'hdi_2015',
 'hdi_2016',
 'hdi_2017',
 'hdi_2018',
 'hdi_2019',
 'hdi_2020',
 'hdi_2021',
 'le_1990',
 'le_1991',
 'le_1992',
 'le_1993',
 'le_1994',
 'le_1995',
 'le_1996',
 'le_1997',
 'le_1998',
 'le_1999',
 'le_2000',
 'le_2001',
 'le_2002',
 'le_2003',
 'le_2004',
 'le_2005',
 'le_2006',
 'le_2007',
 'le_2008',
 'le_2009',
 'le_2010',
 'le_2011',
 'le_2012',
 'le_2013',
 'le_2014',
 'le_2015',
 'le_2016',
 'le_2017',
 'le_2018',
 'le_2019',
 'le_2020',
 'le_2021',
 'eys_1990',
 'eys_1991',
 'eys_1992',
 'eys_1993',
 'eys_1994',
 'eys_1995',
 'eys_1996',
 'eys_1997',
 'eys_1998',
 'eys_1999',
 'eys_20

In [62]:
# Selecting the columns of interest
hdi_df = hdi_df[['country', 'hdicode', 'hdi_2021', 'region']]
hdi_df.head()

Unnamed: 0,country,hdicode,hdi_2021,region
0,Afghanistan,Low,0.478,SA
1,Angola,Medium,0.586,SSA
2,Albania,High,0.796,ECA
3,Andorra,Very High,0.858,
4,United Arab Emirates,Very High,0.911,AS


In [63]:
# Renaming the columns so that they are easier to understand 
hdi_df.rename(columns={'hdi_2021': 'HDI',
                       'country': 'Country',
                       'hdicode': 'HDI Group',
                       'region': 'Region'}, inplace=True)    

In [64]:
hdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    206 non-null    object 
 1   HDI Group  191 non-null    object 
 2   HDI        202 non-null    float64
 3   Region     151 non-null    object 
dtypes: float64(1), object(3)
memory usage: 6.6+ KB


#### Lets look at the null values

In [65]:
# Looking at missing HDI values
hdi_df[hdi_df['HDI'].isnull()]

Unnamed: 0,Country,HDI Group,HDI,Region
108,Monaco,,,
132,Nauru,,,EAP
142,Korea (Democratic People's Rep. of),,,EAP
158,Somalia,,,AS


In [66]:
# Lookinz at the missing HDI Group values
hdi_df[hdi_df['HDI Group'].isnull()]

Unnamed: 0,Country,HDI Group,HDI,Region
108,Monaco,,,
132,Nauru,,,EAP
142,Korea (Democratic People's Rep. of),,,EAP
158,Somalia,,,AS
195,Very high human development,,0.896,
196,High human development,,0.754,
197,Medium human development,,0.636,
198,Low human development,,0.518,
199,Arab States,,0.708,
200,East Asia and the Pacific,,0.749,


In [67]:
# Getting the regions (abbreviations)
hdi_df['Region'].value_counts()  

Region
SSA    46
LAC    33
EAP    26
AS     20
ECA    17
SA      9
Name: count, dtype: int64

As we can see we have 4 countries with missing HDI scores. These will have to be dropped as there would be no way to fill in the data for these cells (one could try to look for other datasets). We can also observe that there are summaries of the HDI scores by region. We save these in a separate dataframe so that we can have the data on the countries in one dataframe and the data about the regions in a separate one.

In [68]:
hdi_df_regions = hdi_df.tail(11)
hdi_df_regions

Unnamed: 0,Country,HDI Group,HDI,Region
195,Very high human development,,0.896,
196,High human development,,0.754,
197,Medium human development,,0.636,
198,Low human development,,0.518,
199,Arab States,,0.708,
200,East Asia and the Pacific,,0.749,
201,Europe and Central Asia,,0.796,
202,Latin America and the Caribbean,,0.754,
203,South Asia,,0.632,
204,Sub-Saharan Africa,,0.547,


In [69]:
hdi_df = hdi_df.drop(hdi_df.tail(11).index)
hdi_df.tail()

Unnamed: 0,Country,HDI Group,HDI,Region
190,Samoa,High,0.707,EAP
191,Yemen,Low,0.455,AS
192,South Africa,High,0.713,SSA
193,Zambia,Medium,0.565,SSA
194,Zimbabwe,Medium,0.593,SSA


In [72]:
# Lets drop the countries with NaN HDI values
hdi_df['HDI'].dropna(inplace=True)
hdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    195 non-null    object 
 1   HDI Group  191 non-null    object 
 2   HDI        191 non-null    float64
 3   Region     151 non-null    object 
dtypes: float64(1), object(3)
memory usage: 6.2+ KB


In [None]:
# Lets set the countries as the index
hdi_df.set_index('Country', inplace=True)

We won't drop the rows with null values in the 'Region' row as we can still use the HDI-value of these countries for non-regional analysis.

## IQ

In [70]:
iq_df = pd.read_csv('data/National_IQ.csv')
iq_df.sample(3)

Unnamed: 0,Rank,Country,Measured IQ,IQ data quality,SchAch,SA direct,SA scaled,SA data quality,Final IQ,Final IQ.1
129,130.0,Belize,,,342.5,72.1,76.8,1.0,76.8,76.8
103,104.5,Puerto Rico,83.5,8.0,,,,,83.5,83.5
168,,EastTimor,,,,,,,(85),85.0


In [71]:
print(hdi_df.columns)


Index(['Country', 'HDI Group', 'HDI', 'Region'], dtype='object')


# 1. Cleaning the Data


# 1. Merge Data


# 2. INTRODUCTION

In the introduction, provide the description of the problem addressed (the context of your data) and the project objectives.
Very briefly describe the analysis design and how it accomplishes the stated objectives. 
State your research hypotheses in a human-understandable language.
What  can the results be used for?

# 3. DATA CLEANING AND PREPARATION

What did you need to do to clean and prepare your dataset?
Missing values, duplicates, inconsistent data types…


# 4.  DESCRIPTIVE STATISTICS

## 4.1  Univariate analysis
Histogram and metrics introduced in class. Outliers identification. Interpret and discuss your results.

## 4.2  Bivariate analysis
Scatter plots and correlation for pairs of variables of interest. Interpret and discuss your results.



# 5.  DISCUSSION AND PRELIMINARY CONCLUSIONS 

Discuss the initial insights and how they align with the objectives set in the Introduction. Briefly address any limitations or challenges encountered in the data or analysis. Reflect on the implications of these findings and how they might guide future research directions or applications
