In [1]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the HDI classification table from hdi.csv (resolved relative to the
# current working directory) into the `hdi` DataFrame used by every later cell.
hdi = pd.read_csv('./hdi.csv')

### Data understanding

In [2]:
# The HDI table pairs each country with its Human Development Index (HDI)
# category, which lets us identify the countries with a high HDI.
# Preview the first five rows to see what the data looks like.
hdi.head()

Unnamed: 0,country,hdicode
0,Afghanistan,Low
1,Angola,Medium
2,Albania,High
3,Andorra,Very High
4,United Arab Emirates,Very High


In [3]:
# Summarise the DataFrame: index range, column names, dtypes and non-null
# counts. A non-null count below the row count signals missing values.
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  206 non-null    object
 1   hdicode  191 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB


In [4]:
# DataFrame.shape is a (rows, columns) tuple, so unpack both dimensions at once
# and report them on separate lines.
num_row, num_col = hdi.shape
print(
    "number of rows: {}".format(num_row),
    "number of columns: {}".format(num_col),
    sep="\n",
)

number of rows: 206
number of columns: 2


In [5]:
# List the column labels of the HDI table.
hdi.columns

Index(['country', 'hdicode'], dtype='object')

In [6]:
# Descriptive summary of every column. include='all' asks describe() to cover
# all columns regardless of dtype; for these text columns it reports
# count / unique / top (most frequent value) / freq.
hdi.describe(include='all')

Unnamed: 0,country,hdicode
count,206,191
unique,206,4
top,Afghanistan,Very High
freq,1,66


### Data preparation

In [7]:
# Count how many rows have no hdicode value.
missing_hdicode = hdi['hdicode'].isna().sum()
print(missing_hdicode)

15


In [8]:
# Build a new frame without the rows whose hdicode is missing.
# `hdi` itself is not modified, so this cell can be re-run safely.
hdi_drop = hdi.dropna(
    axis="index",
    subset=["hdicode"],
)
hdi_drop

Unnamed: 0,country,hdicode
0,Afghanistan,Low
1,Angola,Medium
2,Albania,High
3,Andorra,Very High
4,United Arab Emirates,Very High
...,...,...
190,Samoa,High
191,Yemen,Low
192,South Africa,High
193,Zambia,Medium


In [9]:
# List the distinct HDI categories present once rows with a missing hdicode
# have been dropped.
hdi_drop['hdicode'].unique()

array(['Low', 'Medium', 'High', 'Very High'], dtype=object)

### Evaluation

In [10]:
# Restrict the table to the countries classified as 'Very High'.
# Step 1: element-wise comparison gives a True/False mask, one entry per row.
boolean_series = hdi_drop['hdicode'].eq('Very High')
# Step 2: boolean indexing keeps only the rows where the mask is True.
hdi_drop_final = hdi_drop[boolean_series]

hdi_drop_final

Unnamed: 0,country,hdicode
3,Andorra,Very High
4,United Arab Emirates,Very High
5,Argentina,Very High
8,Australia,Very High
9,Austria,Very High
...,...,...
171,Thailand,Very High
176,Trinidad and Tobago,Very High
178,Turkey,Very High
183,Uruguay,Very High


In [11]:
# Collect the names of the 'Very High' countries into a plain Python list.
country_list = list(hdi_drop_final['country'])
country_list

['Andorra',
 'United Arab Emirates',
 'Argentina',
 'Australia',
 'Austria',
 'Belgium',
 'Bahrain',
 'Bahamas',
 'Belarus',
 'Brunei Darussalam',
 'Canada',
 'Switzerland',
 'Chile',
 'Costa Rica',
 'Cyprus',
 'Czechia',
 'Germany',
 'Denmark',
 'Spain',
 'Estonia',
 'Finland',
 'France',
 'United Kingdom',
 'Georgia',
 'Greece',
 'Hong Kong, China (SAR)',
 'Croatia',
 'Hungary',
 'Ireland',
 'Iceland',
 'Israel',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Korea (Republic of)',
 'Kuwait',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Latvia',
 'Malta',
 'Montenegro',
 'Mauritius',
 'Malaysia',
 'Netherlands',
 'Norway',
 'New Zealand',
 'Oman',
 'Panama',
 'Poland',
 'Portugal',
 'Qatar',
 'Romania',
 'Russian Federation',
 'Saudi Arabia',
 'Singapore',
 'San Marino',
 'Serbia',
 'Slovakia',
 'Slovenia',
 'Sweden',
 'Thailand',
 'Trinidad and Tobago',
 'Turkey',
 'Uruguay',
 'United States']

### Interpretation

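The list above contains all 66 countries that this dataset classifies as having a very high Human Development Index (HDI), i.e. those whose `hdicode` is 'Very High'.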