# Algorithm Implementation: K-Nearest Neighbors (KNN)

## Overview
K-Nearest Neighbors (KNN) implementation for classifying countries based on their development indicators. KNN assigns each country the category that is most common among its nearest neighbors in indicator space, so it shows which countries resemble one another in wealth, health, and environmental characteristics, providing insights into development patterns and peer groups.

## Objectives
- Classify countries into four development categories (Low, Lower-Middle, Upper-Middle, High), defined in this notebook as quartiles of GDP per capita
- Identify similar countries based on multiple indicators
- Analyze the role of GHG emissions in country classification

## Key Questions
1. Which countries form natural peer groups based on development indicators?
2. How does the inclusion of GHG emissions affect country classifications?
3. What role does labor force participation play in country similarity?

In [None]:
# Import required libraries
import wbdata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import folium

# Set random seed for reproducibility
np.random.seed(42)

# Configure visualizations
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so use whichever name this
# installation actually provides instead of failing on 'seaborn'.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette('husl')
%matplotlib inline

## Data Collection and Preprocessing

### 1. Data Collection
We'll fetch data from the World Bank's World Development Indicators database using the `wbdata` package.

In [None]:
# Define indicators
# Maps each World Bank indicator code (passed to wbdata.get_data in
# fetch_data) to the readable name that becomes a DataFrame column after
# pivoting. "Total GHG Emissions" and "Population" are only used to derive
# "GHG Emissions per Capita" and are dropped afterwards.
indicators = {
    "NY.GDP.PCAP.CD": "GDP per capita",
    "SP.DYN.LE00.IN": "Life Expectancy",
    "EN.GHG.ALL.MT.CE.AR5": "Total GHG Emissions",
    "SP.POP.TOTL": "Population",
    "SL.TLF.CACT.ZS": "Labor Force Participation",
    "MS.MIL.XPND.GD.ZS": "Military Expenditure",
    "DT.TDS.DPPF.XP.ZS": "Debt Service"
}

# Fetch and process data
def fetch_data():
    """Download every indicator in ``indicators`` and build a wide table.

    Each indicator code is requested with ``wbdata.get_data``; observations
    whose value is ``None`` are skipped. The remaining observations are
    pivoted so that every (country, year) pair is one row and every indicator
    name is one column.

    Returns:
        pandas.DataFrame sorted by ``year`` (newest first) with the columns
        ``country``, ``year``, one column per remaining indicator, and the
        derived ``GHG Emissions per Capita``. ``Total GHG Emissions`` and
        ``Population`` are removed after the per-capita value is computed.
    """
    print("Fetching data from World Bank...")
    raw_series = {}
    for code, label in indicators.items():
        print(f"Fetching {label}...")
        raw_series[label] = wbdata.get_data(code)

    # Flatten the per-indicator responses into long-format records
    print("\nProcessing data...")
    records = []
    for label, observations in raw_series.items():
        for obs in observations:
            if obs['value'] is None:
                continue
            records.append({
                'country': obs['country']['value'],
                'year': obs['date'],
                'indicator': label,
                'value': float(obs['value']),
            })
    # NOTE(review): nothing here filters the 'country' field, so any
    # non-country entities the API returns (e.g. regional aggregates) would
    # be kept as rows - confirm whether they need to be excluded.

    long_table = pd.DataFrame(records)

    # Long -> wide: one row per (country, year), one column per indicator.
    # pivot_table's default aggregation (mean) applies if a cell has
    # several observations.
    wide_table = long_table.pivot_table(
        index=['country', 'year'],
        columns='indicator',
        values='value',
    )
    wide_table = wide_table.reset_index()

    wide_table['year'] = pd.to_numeric(wide_table['year'])
    wide_table = wide_table.sort_values('year', ascending=False)

    # The 1_000_000 factor presumably converts megatonnes to tonnes before
    # dividing by head count - TODO confirm the indicator's unit.
    scaled_emissions = wide_table["Total GHG Emissions"] * 1_000_000
    wide_table["GHG Emissions per Capita"] = scaled_emissions / wide_table["Population"]

    return wide_table.drop(columns=["Total GHG Emissions", "Population"])

# Fetch the data
# Runs one wbdata request per indicator when this cell executes; `df` is the
# country-year table that preprocess_data() receives later in the notebook.
df = fetch_data()

### 2. Data Preprocessing
We'll clean the data, handle missing values, and prepare it for the KNN algorithm.

In [None]:
def preprocess_data(df):
    """Collapse the country-year table to one row per country and derive model inputs.

    Steps:
      1. Sort by ``year`` (newest first) and take ``groupby('country').first()``.
         ``first()`` returns the first non-null entry of every column, so each
         indicator carries its most recent *available* observation and the
         values within one row may come from different years.
      2. Drop countries that have no GDP per capita observation at all. The
         development label is derived from GDP per capita, so an imputed
         value would produce a fabricated label.
      3. Add natural-log versions of GDP per capita and GHG emissions per
         capita. Non-positive values are treated as missing, because their
         logarithm is ``-inf``/NaN and would corrupt the imputation below.
      4. Fill remaining gaps in numeric columns with the column mean.
      5. Bin GDP per capita into four equal-sized groups (quartiles).

    Args:
        df: DataFrame with ``country``, ``year``, ``GDP per capita`` and
            ``GHG Emissions per Capita`` columns, plus any other indicator
            columns.

    Returns:
        DataFrame with one row per country, the two ``(log)`` columns and a
        categorical ``Development Category`` column.

    Raises:
        ValueError: propagated from ``pd.qcut`` when the GDP values do not
            yield four distinct quantile edges.
    """
    # Most recent non-null value per indicator for each country
    latest = (
        df.sort_values('year', ascending=False)
        .groupby('country')
        .first()
        .reset_index()
    )

    # The label cannot be derived for countries without GDP data
    latest = latest.dropna(subset=['GDP per capita']).reset_index(drop=True)

    # Log transform GDP and GHG emissions, masking values that have no finite log
    for column in ('GDP per capita', 'GHG Emissions per Capita'):
        positive_only = latest[column].where(latest[column] > 0)
        latest[f'{column} (log)'] = np.log(positive_only)

    # Handle missing values with a per-column mean
    numeric_columns = latest.select_dtypes(include=[np.number]).columns
    latest[numeric_columns] = latest[numeric_columns].fillna(latest[numeric_columns].mean())

    # Create development categories based on GDP per capita quartiles
    latest['Development Category'] = pd.qcut(
        latest['GDP per capita'],
        q=4,
        labels=['Low', 'Lower-Middle', 'Upper-Middle', 'High']
    )

    return latest

# Preprocess the data
df_processed = preprocess_data(df)

# Display basic information about the processed dataset
print("\nProcessed Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emits a stray "None" line.
df_processed.info()

print("\nSample of processed data:")
print(df_processed.head())

## Data Analysis and Visualization

### 1. Exploratory Data Analysis

In [None]:
def plot_exploratory_analysis(df):
    """Show a 2x2 grid of exploratory plots for the processed country table.

    Panels: (1) correlation heatmap of all numeric columns, (2) log GDP per
    capita against life expectancy, (3) life expectancy box plots per
    development category, (4) labor force participation against military
    expenditure. Both scatter panels colour points by development category
    and size them by GHG emissions per capita.

    Args:
        df: DataFrame containing the columns named in the plots below.
    """
    fig = plt.figure(figsize=(20, 15))

    # Encoding shared by the two scatter panels
    bubble_encoding = dict(
        hue='Development Category',
        size='GHG Emissions per Capita',
        sizes=(50, 400),
        alpha=0.6,
    )

    # 1. Correlation Heatmap
    heatmap_ax = fig.add_subplot(2, 2, 1)
    numeric_corr = df.select_dtypes(include=[np.number]).corr()
    sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Matrix of Numeric Features')

    # 2. GDP vs Life Expectancy by Development Category
    gdp_ax = fig.add_subplot(2, 2, 2)
    sns.scatterplot(
        data=df,
        x='GDP per capita (log)',
        y='Life Expectancy',
        ax=gdp_ax,
        **bubble_encoding,
    )
    gdp_ax.set_title('GDP vs Life Expectancy by Development Category')

    # 3. Distribution of Life Expectancy by Development Category
    box_ax = fig.add_subplot(2, 2, 3)
    sns.boxplot(data=df, x='Development Category', y='Life Expectancy', ax=box_ax)
    box_ax.set_title('Life Expectancy Distribution by Development Category')
    plt.setp(box_ax.get_xticklabels(), rotation=45)

    # 4. Labor Force Participation vs Military Expenditure
    labor_ax = fig.add_subplot(2, 2, 4)
    sns.scatterplot(
        data=df,
        x='Labor Force Participation',
        y='Military Expenditure',
        ax=labor_ax,
        **bubble_encoding,
    )
    labor_ax.set_title('Labor Force Participation vs Military Expenditure')

    plt.tight_layout()
    plt.show()

# Plot exploratory analysis
# Uses the one-row-per-country table, which already carries the log columns
# and the 'Development Category' label the plots refer to.
plot_exploratory_analysis(df_processed)

## Algorithm Implementation

### 1. Prepare Data for KNN

In [None]:
def prepare_knn_data(df):
    """Select the KNN feature columns and make a stratified 80/20 split.

    Args:
        df: DataFrame containing the six feature columns listed below and a
            'Development Category' column used as the target.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, features)`` where
        ``features`` is the list of feature column names.
    """
    # NOTE(review): in this notebook the target is a quartile binning of
    # GDP per capita (see preprocess_data) and the log of that same value is
    # a feature, so reported accuracy is likely dominated by it - confirm
    # this is intended.
    features = [
        'GDP per capita (log)',
        'Life Expectancy',
        'GHG Emissions per Capita (log)',
        'Labor Force Participation',
        'Military Expenditure',
        'Debt Service',
    ]

    feature_matrix = df[features]
    target = df['Development Category']

    # Stratify on the target so both partitions keep the class proportions
    split_parts = train_test_split(
        feature_matrix,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )

    return (*split_parts, features)

# Prepare data for KNN
# These names are module-level on purpose: implement_knn() reads X_train,
# y_train and X_test as globals, and later cells read `features`.
X_train, X_test, y_train, y_test, features = prepare_knn_data(df_processed)

### 2. KNN Model Implementation and Tuning

In [None]:
def implement_knn():
    """Grid-search a scaled KNN classifier and predict the held-out set.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test``. The
    estimator is a pipeline of ``StandardScaler`` followed by
    ``KNeighborsClassifier``; the search covers neighbor count, weighting
    scheme and distance metric, scored by accuracy with ``cv=5``.

    Returns:
        Tuple ``(best_model, y_pred, grid_search)``: the search's
        ``best_estimator_``, its predictions for ``X_test``, and the fitted
        ``GridSearchCV`` object.
    """
    # Hyperparameters are addressed through the 'knn' pipeline step
    search_space = {
        'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['euclidean', 'manhattan'],
    }

    # Scaling lives inside the pipeline so it is fitted together with the
    # classifier on whatever data the search passes in
    knn_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ])

    grid_search = GridSearchCV(
        estimator=knn_pipeline,
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    return best_model, best_model.predict(X_test), grid_search

# Implement KNN
# Runs the grid search on the training split and predicts the test split.
best_model, y_pred, grid_search = implement_knn()

# Print results
# best_score_ is the cross-validated accuracy measured during the search on
# the training split; the test-set accuracy below is computed separately
# from the held-out y_test / y_pred pair.
print("Best parameters:", grid_search.best_params_)
print("\nBest cross-validation score:", grid_search.best_score_)
print("\nTest set accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Results and Interpretation

### 1. Visualize Results

In [None]:
def visualize_results(model, X_test, y_test, y_pred):
    """Plot a labelled confusion matrix and a mean-substitution feature importance chart.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: DataFrame of held-out features; its columns are the features
            that get scored.
        y_test: True labels for ``X_test``.
        y_pred: Labels predicted by ``model`` for ``X_test``.

    The importance of a feature is the drop in test accuracy observed when
    that column is replaced by its test-set mean; a value at or below zero
    means accuracy did not fall without that feature's variation.
    """
    # 1. Confusion Matrix
    # Fix the label order explicitly and reuse it for the tick labels, so the
    # axes show class names instead of bare row/column indices.
    class_labels = sorted(set(y_test) | set(y_pred))
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # 2. Feature Importance (based on model performance degradation)
    # Take the feature names from the data passed in rather than from a
    # notebook-level global, so the function works on its own arguments.
    feature_names = list(X_test.columns)
    baseline_score = accuracy_score(y_test, y_pred)

    importances = []
    for feature in feature_names:
        X_neutral = X_test.copy()
        # A constant column carries no information about the class
        X_neutral[feature] = X_neutral[feature].mean()
        neutral_score = accuracy_score(y_test, model.predict(X_neutral))
        importances.append(baseline_score - neutral_score)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=importances, y=feature_names)
    plt.title('Feature Importance')
    plt.xlabel('Importance Score')
    plt.show()

# Visualize results
# Evaluated on the held-out split only, using the predictions already
# produced by implement_knn().
visualize_results(best_model, X_test, y_test, y_pred)

### 2. Analyze Country Groupings

In [None]:
def analyze_country_groups(df, model):
    """Predict a category for every row of ``df`` and list the disagreements.

    Reads the notebook-level ``features`` list to select the model inputs.
    Rows whose predicted category differs from 'Development Category' are
    printed in full.

    Args:
        df: DataFrame with 'country', 'Development Category', the feature
            columns, and the raw GDP, life expectancy and GHG columns.
        model: Fitted estimator exposing ``predict``.

    Returns:
        DataFrame with one row per input row: country, actual and predicted
        category, and three context indicators.
    """
    # NOTE(review): when df is the full processed table it also contains
    # the rows the model was trained on, so this list is not a held-out
    # error estimate.
    context_columns = ('GDP per capita', 'Life Expectancy', 'GHG Emissions per Capita')

    analysis_df = pd.DataFrame({
        'Country': df['country'],
        'Actual Category': df['Development Category'],
        'Predicted Category': model.predict(df[features]),
        **{column: df[column] for column in context_columns},
    })

    # Rows where the model and the GDP-based label disagree
    disagreement = analysis_df['Actual Category'] != analysis_df['Predicted Category']
    misclassified = analysis_df.loc[disagreement]

    print("Interesting Cases (Misclassifications):")
    print(misclassified.to_string())

    return analysis_df

# Analyze country groups
# Predicts on the full processed table (all countries, not just the test split).
analysis_df = analyze_country_groups(df_processed, best_model)

# Save results
# Written to the current working directory; index=False keeps the
# DataFrame's row index out of the file.
analysis_df.to_csv('knn_analysis_results.csv', index=False)
print("\nResults saved to 'knn_analysis_results.csv'")