# CSC2042S Assignment 1: Data Preprocessing

This notebook implements Part 1 (Data Preprocessing) of the assignment. It expects the World Development Indicators (WDI) dataset at `CSC2042S-Assignment1-Data/WDICSV.csv`, relative to the notebook's working directory.

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

DATA_DIR = Path('CSC2042S-Assignment1-Data')
WDI_PATH = DATA_DIR / 'WDICSV.csv'

def load_wdi_dataset(path: Path) -> pd.DataFrame:
    """Read the WDI CSV and reshape it to one row per country-year.

    The raw file is wide: one row per (country, indicator) pair with one
    column per year. The result has the identifier columns
    ``Country Name``, ``Country Code`` and ``Year`` followed by one column
    per indicator code. Country-year/indicator cells with no observation
    are NaN in the result.
    """
    wide = pd.read_csv(path)

    key_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
    # Year columns are the headers made up solely of digits; this skips the
    # identifier columns and any other non-year column in the file.
    years = [name for name in wide.columns if name.isdigit()]

    long_form = wide.melt(
        id_vars=key_columns,
        value_vars=years,
        var_name='Year',
        value_name='Value',
    )
    # Discard unobserved cells before pivoting.
    long_form = long_form.dropna(subset=['Value'])
    long_form['Year'] = long_form['Year'].astype(int)

    table = long_form.pivot_table(
        index=['Country Name', 'Country Code', 'Year'],
        columns='Indicator Code',
        values='Value',
    )
    return table.reset_index()

def preprocess(df: pd.DataFrame, feature_thresh: float=0.3, sample_thresh: float=0.7):
    """
    Drop sparse features and sparse samples, then mean-impute what is left.

    Steps, in order:
      1. Drop every column whose fraction of missing values exceeds
         ``feature_thresh``.
      2. Drop every row whose fraction of non-missing values, measured over
         the columns kept in step 1, is below ``sample_thresh``.
      3. Fill the remaining gaps in numeric columns with that column's mean,
         computed over the rows kept in step 2.

    feature_thresh: maximum allowed fraction of missing values per feature.
    sample_thresh: minimum required fraction of available features per sample.

    Note: all retained columns count toward a row's coverage, including
    non-numeric identifier columns (e.g. country names) that are never
    missing. Non-numeric columns are passed through without imputation.

    Returns a new DataFrame; the input frame is not modified.
    """
    feature_missing = df.isna().mean()
    keep_features = feature_missing[feature_missing <= feature_thresh].index
    df = df[keep_features]
    sample_coverage = df.notna().mean(axis=1)
    df = df.loc[sample_coverage >= sample_thresh]
    # numeric_only=True is required: with string identifier columns present,
    # a plain df.mean() raises TypeError on pandas >= 2.0 (and silently
    # dropped those columns, with a warning, on older versions).
    df = df.fillna(df.mean(numeric_only=True))
    return df

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every numeric column min-max scaled.

    Each numeric column is rescaled independently with scikit-learn's
    ``MinMaxScaler`` (default feature range). Non-numeric columns are
    carried over unchanged. Numeric identifier columns (such as a year
    column) are scaled too; drop them beforehand if that is not wanted.

    The input frame is left untouched. If ``df`` has no numeric columns or
    no rows, there is nothing to fit and an unscaled copy is returned.
    """
    # Work on a copy so the caller's frame is not overwritten in place.
    result = df.copy()
    numeric = result.select_dtypes(include=[np.number])
    # MinMaxScaler rejects input with zero rows or zero columns.
    if numeric.empty:
        return result
    scaler = MinMaxScaler()
    result[numeric.columns] = scaler.fit_transform(numeric)
    return result

def visualize_tsne(df: pd.DataFrame, perplexity: float=30.0, random_state: int=0):
    """Project the numeric columns of ``df`` to 2-D with t-SNE and plot them.

    perplexity: forwarded to ``TSNE``.
    random_state: forwarded to ``TSNE`` as its seed.

    Displays a scatter plot with one point per row of ``df``; returns nothing.
    """
    reducer = TSNE(
        n_components=2,
        perplexity=perplexity,
        random_state=random_state,
    )
    # Only numeric columns are embedded; any other column is ignored.
    features = df.select_dtypes(include=[np.number])
    coords = reducer.fit_transform(features)
    xs, ys = coords[:, 0], coords[:, 1]

    plt.figure(figsize=(6, 5))
    plt.scatter(xs, ys, s=5)
    plt.title('t-SNE Visualization')
    plt.xlabel('Dim 1')
    plt.ylabel('Dim 2')
    plt.show()

# Example usage: run the whole pipeline once (load -> filter/impute -> scale -> plot).
# The intermediate frames are kept as module-level names (data, processed, normalized).
try:
    # Reshape the raw CSV into one row per country-year.
    data = load_wdi_dataset(WDI_PATH)
    print(f"Loaded dataset with shape {data.shape}")
    # Keep features with at most 30% missing values and rows with at least 70% coverage.
    processed = preprocess(data, feature_thresh=0.3, sample_thresh=0.7)
    print(f"After preprocessing: {processed.shape}")
    # Scale the numeric columns.
    normalized = normalize(processed)
    # Drop the identifier columns so only indicator values are embedded.
    visualize_tsne(normalized.drop(columns=['Country Name','Country Code','Year']))
except FileNotFoundError:
    # The CSV is not at WDI_PATH.
    print('Dataset not found. Please ensure WDICSV.csv is present in CSC2042S-Assignment1-Data.')
except Exception as e:
    # Top-level catch-all: report any other failure as a one-line message instead of a traceback.
    print(f'Unable to process dataset: {e}')
