In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Location of the source CSV inside the Kaggle input directory
file_path = '/kaggle/input/best-country-to-live-in-2024/best-countries-to-live-in-2024.csv'

# Parse the CSV into a DataFrame that the rest of the notebook works on
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows (head() defaults to five) as a quick check of the load
data.head()


Unnamed: 0,population_2024,population_growthRate,land_area,country,region,unMember,population_density,population_densityMi,share_borders,Hdi2021,Hdi2020,WorldHappiness2022
0,1441719852,0.00916,3287590,India,Asia,True,484.9067,1255.9084,"AFG, BGD, BTN, MMR, CHN, NPL, PAK, LKA",0.633,0.642,3.777
1,1425178782,-0.00035,9706961,China,Asia,True,151.2174,391.653,"AFG, BTN, MMR, HKG, IND, KAZ, PRK, KGZ, LAO, M...",0.768,0.764,5.585
2,341814420,0.00535,9372610,United States,North America,True,37.3673,96.7813,"CAN, MEX",0.921,0.92,6.977
3,279798049,0.00816,1904569,Indonesia,Asia,True,149.0254,385.9758,"TLS, MYS, PNG",0.705,0.709,5.24
4,245209815,0.01964,881912,Pakistan,Asia,True,318.0908,823.8551,"AFG, CHN, IND, IRN",0.544,0.543,4.516


In [3]:
import pandas as pd
# Columns in the dataset
columns = ['population_2024', 'population_growthRate', 'land_area', 'country', 
           'region', 'unMember', 'population_density', 'population_densityMi', 
           'share_borders', 'Hdi2021', 'Hdi2020', 'WorldHappiness2022']

# 1. Identify Missing Values and Data Imputation
# Numerical columns are imputed with the column mean.
# Categorical/boolean columns are imputed with the most frequent value (mode).

# Numerical Columns
numerical_cols = ['population_2024', 'population_growthRate', 'land_area', 
                  'population_density', 'population_densityMi', 'Hdi2021', 
                  'Hdi2020', 'WorldHappiness2022']
for col in numerical_cols:
    # Assign the filled Series back to the frame. Calling
    # data[col].fillna(..., inplace=True) acts on an intermediate object
    # (chained assignment): pandas warns about it and, with Copy-on-Write
    # enabled, `data` itself would be left unchanged.
    data[col] = data[col].fillna(data[col].mean())

# Categorical/Boolean Columns
categorical_cols = ['country', 'region', 'unMember', 'share_borders']
for col in categorical_cols:
    # mode() returns a Series of the most frequent value(s); take the first one
    data[col] = data[col].fillna(data[col].mode().iloc[0])

# 2. Data Type Correction
# Ensure 'unMember' is stored as a boolean column
data['unMember'] = data['unMember'].astype(bool)

# 3. Outlier Detection and Handling
# Use the interquartile range (IQR) rule on 'population_density'
Q1 = data['population_density'].quantile(0.25)
Q3 = data['population_density'].quantile(0.75)
IQR = Q3 - Q1

# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default, matching >= lower and <= upper
within_bounds = data['population_density'].between(lower_bound, upper_bound)

# 4. Duplicate Rows
# Drop the outliers and any duplicate rows in one step, producing a new frame
# instead of mutating a filtered slice in place.
# NOTE: `data` is rebound here, so re-running this cell recomputes the bounds
# on the already-filtered rows; reload the CSV first for a clean re-run.
data = data[within_bounds].drop_duplicates()



In [4]:
import pandas as pd
import plotly.express as px


# Numeric columns that are averaged per region
numeric_cols = [
    'population_2024',
    'population_growthRate',
    'land_area',
    'population_density',
    'population_densityMi',
    'Hdi2021',
    'Hdi2020',
    'WorldHappiness2022',
]

# --- Regional comparison ---
# Mean of every numeric column within each region; reset_index() turns the
# 'region' group key back into an ordinary column so it can be used as x.
regional_means = data.groupby('region')[numeric_cols].mean()
regional_averages = regional_means.reset_index()

fig_hdi_region = px.bar(
    regional_averages,
    x='region',
    y='Hdi2021',
    color='Hdi2021',
    color_continuous_scale='Viridis',
    labels={'Hdi2021': 'Average HDI 2021'},
    title='Average HDI 2021 by Region',
)

# --- Land area vs. population density ---
# One point per row of `data`, coloured by its population density
area_density_labels = {
    'land_area': 'Land Area (sq km)',
    'population_density': 'Population Density (per sq km)',
}
fig_area_density = px.scatter(
    data,
    x='land_area',
    y='population_density',
    color='population_density',
    color_continuous_scale='Viridis',
    labels=area_density_labels,
    title='Land Area vs Population Density',
)

# Render the bar chart first, then the scatter plot
fig_hdi_region.show()
fig_area_density.show()


In [5]:
# Show the dtype pandas currently holds for each column of `data`
data.dtypes

population_2024            int64
population_growthRate    float64
land_area                  int64
country                   object
region                    object
unMember                    bool
population_density       float64
population_densityMi     float64
share_borders             object
Hdi2021                  float64
Hdi2020                  float64
WorldHappiness2022       float64
dtype: object

In [6]:
# Pairwise relationships between HDI, happiness, growth rate and density.
# NOTE(review): the three 2D scatters pass color_continuous_scale without a
# `color` column, so the Viridis scale presumably has no visible effect there
# - confirm whether a colour dimension was intended.

# HDI 2021 against the World Happiness Index 2022
hdi_happiness_labels = {
    'Hdi2021': 'HDI 2021',
    'WorldHappiness2022': 'World Happiness Index 2022',
}
fig_hdi_happiness = px.scatter(
    data,
    x='Hdi2021',
    y='WorldHappiness2022',
    labels=hdi_happiness_labels,
    title='HDI 2021 vs World Happiness Index 2022',
    color_continuous_scale='Viridis',
)

# Population growth rate against HDI 2021
growth_hdi_labels = {
    'population_growthRate': 'Population Growth Rate',
    'Hdi2021': 'HDI 2021',
}
fig_pop_growth_hdi = px.scatter(
    data,
    x='population_growthRate',
    y='Hdi2021',
    labels=growth_hdi_labels,
    title='Population Growth Rate vs HDI 2021',
    color_continuous_scale='Viridis',
)

# Population density against the World Happiness Index 2022
density_happiness_labels = {
    'population_density': 'Population Density',
    'WorldHappiness2022': 'World Happiness Index 2022',
}
fig_density_happiness = px.scatter(
    data,
    x='population_density',
    y='WorldHappiness2022',
    labels=density_happiness_labels,
    title='Population Density vs World Happiness Index 2022',
    color_continuous_scale='Viridis',
)

# Render the three 2D scatter plots in the order they were built
fig_hdi_happiness.show()
fig_pop_growth_hdi.show()
fig_density_happiness.show()


# All three measures together: HDI on x, happiness on y, density on z,
# with points coloured by HDI 2021
scatter_3d_labels = {
    'Hdi2021': 'HDI 2021',
    'WorldHappiness2022': 'World Happiness Index 2022',
    'population_density': 'Population Density',
}
fig_3d_scatter = px.scatter_3d(
    data,
    x='Hdi2021',
    y='WorldHappiness2022',
    z='population_density',
    color='Hdi2021',
    color_continuous_scale='Viridis',
    labels=scatter_3d_labels,
    title='3D Scatter: HDI 2021, World Happiness Index 2022, and Population Density',
)

# Render the 3D scatter plot
fig_3d_scatter.show()


In [7]:
# --- 1. Top countries by HDI and happiness ---
# Ten rows with the largest HDI 2021; WorldHappiness2022 is the secondary
# ordering column used when HDI values tie.
top_hdi_happiness = data.nlargest(n=10, columns=['Hdi2021', 'WorldHappiness2022'])
fig_top_hdi_happiness = px.scatter(
    top_hdi_happiness,
    x='Hdi2021',
    y='WorldHappiness2022',
    size='Hdi2021',      # marker size follows the HDI value
    color='country',
    labels={'Hdi2021': 'HDI 2021', 'WorldHappiness2022': 'World Happiness Index'},
    title='Top Countries by HDI and World Happiness Index',
    color_continuous_scale='Viridis',
)
# Draw an outline around every marker
fig_top_hdi_happiness.update_traces(
    marker={'line': {'width': 2, 'color': 'DarkSlateGrey'}}
)

# --- 2. Highest and lowest population growth ---
# Five fastest-growing rows followed by the five slowest-growing rows
top_pop_growth = data.nlargest(5, 'population_growthRate')
bottom_pop_growth = data.nsmallest(5, 'population_growthRate')
combined_pop_growth = pd.concat([top_pop_growth, bottom_pop_growth])
fig_pop_growth = px.bar(
    combined_pop_growth,
    x='country',
    y='population_growthRate',
    color='population_growthRate',
    color_continuous_scale='Viridis',
    title='Countries with Highest and Lowest Population Growth',
)

# --- 3. Densely populated yet happy countries ---
# First keep rows above the overall median density, then keep those above the
# median happiness computed on that dense subset (not on the full frame).
is_dense = data['population_density'] > data['population_density'].median()
high_density_happy = data[is_dense]
is_happy = (
    high_density_happy['WorldHappiness2022']
    > high_density_happy['WorldHappiness2022'].median()
)
high_density_happy = high_density_happy[is_happy]
fig_density_happiness = px.scatter(
    high_density_happy,
    x='population_density',
    y='WorldHappiness2022',
    size='population_density',   # marker size follows population density
    color='country',
    labels={'population_density': 'Population Density', 'WorldHappiness2022': 'World Happiness Index'},
    title='Densely Populated Yet Happy Countries',
    color_continuous_scale='Viridis',
)
# Draw an outline around every marker
fig_density_happiness.update_traces(
    marker={'line': {'width': 2, 'color': 'DarkSlateGrey'}}
)

# Render: top HDI/happiness, then dense-and-happy, then population growth
fig_top_hdi_happiness.show()
fig_density_happiness.show()
fig_pop_growth.show()



In [8]:
# World map of the happiness index: countries are matched by the names in the
# 'country' column and shaded by their WorldHappiness2022 value.
fig_happiness_map = px.choropleth(
    data,
    locations='country',
    locationmode='country names',
    color='WorldHappiness2022',
    color_continuous_scale='Viridis',
    labels={'WorldHappiness2022': 'World Happiness Index'},
    title='World Happiness Index 2022',
)

# Render the choropleth
fig_happiness_map.show()

In [9]:
# Geospatial Analysis - HDI 2021 drawn on a globe
# Countries are matched by the names in the 'country' column and shaded by
# their Hdi2021 value. The orthographic projection is what produces the globe
# view, so it is requested directly rather than passing 'natural earth' (a
# flat projection) and overriding it afterwards with update_geos().
fig_hdi_map = px.choropleth(data, locations='country', locationmode='country names',
                            color='Hdi2021',
                            title='Human Development Index (HDI) 2021',
                            color_continuous_scale='Viridis',
                            labels={'Hdi2021': 'HDI 2021'},
                            projection='orthographic')

# Draw country borders on the globe
fig_hdi_map.update_layout(showlegend=True, geo=dict(showcountries=True))

# Display the map
fig_hdi_map.show()