In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("C:/Users/yvonn/OneDrive/Desktop/SUNWAY/202409-S4/Business Intelligence/Assignment/HDIdata.csv")

In [3]:
df.head()

Unnamed: 0,crossid,Country.x,year,hdi
0,crossid,country,year,hdi
1,Afghanistan_1980,Afghanistan,1980,
2,Afghanistan_1981,Afghanistan,1981,
3,Afghanistan_1982,Afghanistan,1982,
4,Afghanistan_1983,Afghanistan,1983,


In [4]:
# Rename the 'Country.x' column to 'Country name'
df = df.rename(columns={'Country.x': 'Country name'})
df.head()

Unnamed: 0,crossid,Country name,year,hdi
0,crossid,country,year,hdi
1,Afghanistan_1980,Afghanistan,1980,
2,Afghanistan_1981,Afghanistan,1981,
3,Afghanistan_1982,Afghanistan,1982,
4,Afghanistan_1983,Afghanistan,1983,


In [5]:
# Create a dictionary to map different country name variations to a standard format
country_name_corrections = {
    'Gambia, The': 'Gambia',
    "CÃ´te d'Ivoire": 'Ivory Coast',
    'Türkiye': 'Turkey',
    'Laos': 'Lao PDR',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Viet Nam': 'Vietnam',
    'Syrian Arab Republic': 'Syria',
    'Slovak Republic': 'Slovakia',
    'Bahamas, The': 'Bahamas',
    'Congo, Dem. Rep.': 'Congo (Kinshasa)',
    'Congo, Democratic Rep.': 'Congo (Kinshasa)',
    'Congo, Rep.': 'Congo (Brazzaville)',
    'Czech Republic': 'Czechia',
    'Egypt, Arab Rep.': 'Egypt',
    'Hong Kong SAR': 'Hong Kong S.A.R. of China',
    'Hong Kong SAR, China': 'Hong Kong S.A.R. of China',
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Dem. People\'s Rep.': 'North Korea',
    'Korea, Rep.': 'South Korea',
    'Macao SAR, China': 'Macau',
    'Macedonia, FYR': 'North Macedonia',
    'Russia': 'Russian Federation',
    'Taiwan, China': 'Taiwan Province of China',
    'United States': 'United States of America',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen'
}

# Replace country names in both datasets based on the mapping
df['Country name'] = df['Country name'].replace(country_name_corrections)

# Replace the crossid based on corrected name
df['crossid'] = df['Country name'] + '_' + df['year'].astype(str)

In [6]:
# Convert the 'year' column to integer (if it contains non-numeric values, can use errors='coerce' to handle them)
df['year'] = pd.to_numeric(df['year'], errors='coerce')

In [7]:
# Filter the merged dataset to include only rows where 'year' is between 2011 and 2022
filtered_df = df[(df['year'] >= 2011) & (df['year'] <= 2022)]


In [8]:
# Check for missing values in the merged DataFrame
missing_values = df.isnull().sum()

# Display the count of missing values
print(missing_values)

crossid            0
Country name       0
year               1
hdi             3257
dtype: int64


In [9]:
# Export the DataFrame to a CSV file
csv_file_path = 'HDI_Cleaned.csv'
filtered_df.to_csv(csv_file_path, index=False)

print(f'DataFrame successfully exported to {csv_file_path}')

DataFrame successfully exported to HDI_Cleaned.csv


In [10]:
# Drop rows with missing values in 'hdi'
df_clean = filtered_df.dropna(subset=['hdi'])

In [11]:
# Check for missing values in the merged DataFrame
missing_values = df_clean.isnull().sum()

# Display the count of missing values
print(missing_values)

crossid         0
Country name    0
year            0
hdi             0
dtype: int64


In [12]:
# Convert 'hdi' column to numeric (handling errors if any)
df_clean['hdi'] = pd.to_numeric(df_clean['hdi'], errors='coerce')  # 'coerce' converts invalid values to NaN

# Group by 'Country name' and calculate the median of 'hdi'
df_grouped = df_clean.groupby('Country name')['hdi'].median().reset_index()

# Print the grouped data with median hdi
print(df_grouped)

    Country name     hdi
0    Afghanistan  0.4795
1        Albania  0.7940
2        Algeria  0.7370
3        Andorra  0.8595
4         Angola  0.5910
..           ...     ...
171      Vietnam  0.7040
172        World  0.7300
173        Yemen  0.4310
174       Zambia  0.5645
175     Zimbabwe  0.5480

[176 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['hdi'] = pd.to_numeric(df_clean['hdi'], errors='coerce')  # 'coerce' converts invalid values to NaN


In [13]:
# Save to CSV file with two columns: Country name and median hdi
df_grouped.to_csv('median_hdi_by_country.csv', index=False)

# Print the path to check the saved file (if working locally)
print("CSV file 'median_hdi_by_country.csv' created successfully.")

CSV file 'median_hdi_by_country.csv' created successfully.


In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [15]:
# Standardize the HDI values
scaler = StandardScaler()
df_grouped['hdi_scaled'] = scaler.fit_transform(df_grouped[['hdi']])

In [16]:
# Apply KMeans clustering (3 clusters for low, middle, and high income)
kmeans = KMeans(n_clusters=4, random_state=42)
df_grouped['cluster'] = kmeans.fit_predict(df_grouped[['hdi_scaled']])



In [17]:
# Check the cluster centers (in scaled HDI values)
print("Cluster Centers (Scaled HDI):")
print(kmeans.cluster_centers_)

# Convert cluster centers back to the original HDI values for better interpretation
cluster_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
print("Cluster Centers (Original HDI):")
print(cluster_centers_original)


Cluster Centers (Scaled HDI):
[[-0.68455246]
 [ 1.17567075]
 [-1.57866907]
 [ 0.28002464]]
Cluster Centers (Original HDI):
[[0.60781429]
 [0.89635106]
 [0.46912903]
 [0.75742857]]


In [18]:
cluster_centers_combined = np.hstack((kmeans.cluster_centers_, cluster_centers_original))

# Sort the cluster centers by the original HDI values (second column in the array)
cluster_centers_sorted = cluster_centers_combined[cluster_centers_combined[:, 1].argsort()]

# Print the sorted cluster centers
print("Cluster Centers (Scaled and Original HDI) in Ascending Order:")
print(cluster_centers_sorted)

Cluster Centers (Scaled and Original HDI) in Ascending Order:
[[-1.57866907  0.46912903]
 [-0.68455246  0.60781429]
 [ 0.28002464  0.75742857]
 [ 1.17567075  0.89635106]]


In [19]:
# Cluster centers after sorting
cluster_centers_sorted = cluster_centers_combined[cluster_centers_combined[:, 1].argsort()]

# Map sorted clusters to income groups based on ascending HDI
cluster_labels_sorted = {
    np.where(kmeans.cluster_centers_ == cluster_centers_sorted[0, 0])[0][0]: 'Low income',
    np.where(kmeans.cluster_centers_ == cluster_centers_sorted[1, 0])[0][0]: 'Lower-middle income',
    np.where(kmeans.cluster_centers_ == cluster_centers_sorted[2, 0])[0][0]: 'Upper-middle income',
    np.where(kmeans.cluster_centers_ == cluster_centers_sorted[3, 0])[0][0]: 'High income'
}

# Assign the new income group labels to the DataFrame based on the new cluster labels
df_grouped['income_group'] = df_grouped['cluster'].map(cluster_labels_sorted)

# Print the updated DataFrame
print(df_grouped[['Country name', 'hdi', 'income_group']])


    Country name     hdi         income_group
0    Afghanistan  0.4795           Low income
1        Albania  0.7940  Upper-middle income
2        Algeria  0.7370  Upper-middle income
3        Andorra  0.8595          High income
4         Angola  0.5910  Lower-middle income
..           ...     ...                  ...
171      Vietnam  0.7040  Upper-middle income
172        World  0.7300  Upper-middle income
173        Yemen  0.4310           Low income
174       Zambia  0.5645  Lower-middle income
175     Zimbabwe  0.5480  Lower-middle income

[176 rows x 3 columns]


In [20]:
# Save the clustered data to a CSV file
df_grouped[['Country name', 'hdi', 'income_group']].to_csv('clustered_countries_income_groups.csv', index=False)

In [21]:
# Print the result
print(df_grouped[['Country name', 'hdi', 'income_group']])

    Country name     hdi         income_group
0    Afghanistan  0.4795           Low income
1        Albania  0.7940  Upper-middle income
2        Algeria  0.7370  Upper-middle income
3        Andorra  0.8595          High income
4         Angola  0.5910  Lower-middle income
..           ...     ...                  ...
171      Vietnam  0.7040  Upper-middle income
172        World  0.7300  Upper-middle income
173        Yemen  0.4310           Low income
174       Zambia  0.5645  Lower-middle income
175     Zimbabwe  0.5480  Lower-middle income

[176 rows x 3 columns]


In [22]:
# Group countries by their cluster assignment
for cluster in range(4):  # Assuming 4 clusters
    print(f"\nCountries in Cluster {cluster} ({cluster_labels_sorted[cluster]}):")
    print(df_grouped[df_grouped['cluster'] == cluster][['Country name', 'hdi']])



Countries in Cluster 0 (Lower-middle income):
              Country name     hdi
4                   Angola  0.5910
12              Bangladesh  0.6175
18                  Bhutan  0.6395
26              Cabo Verde  0.6620
27                Cambodia  0.5770
28                Cameroon  0.5700
35                 Comoros  0.5670
47             El Salvador  0.6650
48       Equatorial Guinea  0.6490
58                   Ghana  0.5865
61               Guatemala  0.6315
65                   Haiti  0.5515
66                Honduras  0.6140
69                   India  0.6315
71                    Iraq  0.6610
80                   Kenya  0.5885
81                Kiribati  0.6255
104                Morocco  0.6665
106                Myanmar  0.5735
107                Namibia  0.6230
108                  Nauru  0.6595
109                  Nepal  0.5775
112              Nicaragua  0.6510
121       Papua New Guinea  0.5500
133  Sao Tome and Principe  0.6015
141        Solomon Islands  0.5635
144     