In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

print("Loading and Preparing Data")
df = pd.read_csv("Population-EstimatesCountry.csv")
df_clean = df.dropna(subset=['Income Group']).copy()
TARGET_COL = 'Income Group'

FEATURE_COLS = [
    'Region',
    'Lending category',
    'System of National Accounts',
    'System of trade',
    'IMF data dissemination standard'
]

data_cols = FEATURE_COLS + [TARGET_COL]
df_model = df_clean[data_cols].copy()

df_model.loc[:, FEATURE_COLS] = df_model.loc[:, FEATURE_COLS].fillna('Missing')

df_processed = pd.get_dummies(df_model, columns=FEATURE_COLS, drop_first=True, dtype=int)

X = df_processed.drop(columns=[TARGET_COL])
y = df_processed[TARGET_COL]

df_final_for_review = pd.concat([X, y], axis=1)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows after dropping missing Income Group: {len(df_clean)}")
print(f"Number of features after one-hot encoding: {X.shape[1]}")
print("\nFirst 5 rows of the prepared data (Features and Target):")
print(df_final_for_review.head())

print("\nSplitting Data into Training and Testing Sets")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

print("\nTraining K-Nearest Neighbors Mode")

knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the model to the training data
knn_model.fit(X_train, y_train)

print("KNN Model training complete.")

print("\nEvaluating Model Performance")
# Predict on the test set
y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Model Classification Accuracy: {accuracy:.4f}")
print(f"Model F1-Score (weighted): {f1:.4f}")
print("--- End of Model Output ---")

Loading and Preparing Data
Original number of rows: 265
Number of rows after dropping missing Income Group: 215
Number of features after one-hot encoding: 17

First 5 rows of the prepared data (Features and Target):
   Region_Europe & Central Asia  Region_Latin America & Caribbean  \
0                             0                                 1   
2                             0                                 0   
4                             0                                 0   
5                             1                                 0   
6                             1                                 0   

   Region_Middle East & North Africa  Region_North America  Region_South Asia  \
0                                  0                     0                  0   
2                                  1                     0                  0   
4                                  0                     0                  0   
5                                  0         