In [2]:
!pip install pandas
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.0-cp311-cp311-macosx_12_0_arm64.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[

In [3]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Seed passed as random_state to the train/test split and the random forest
SEED = 42

# Label columns; they are excluded whenever feature lists are built
target = ["PCIAT-PCIAT_Total", "sii"]

In [5]:
# Resolve every path from the notebook's working directory so the dataset
# folder is always addressed through an absolute path.
current_dir = os.getcwd()
dataset_dir = os.path.abspath(os.path.join(current_dir, 'dataset'))
train_csv_path = os.path.join(dataset_dir, 'train.csv')

# Load the tabular training data (no column is promoted to the index)
train = pd.read_csv(
    train_csv_path,
    index_col=None,
)

#print(train.info())
#print(train.describe())

In [6]:
# Merge the two PAQ totals into a single "PAQ_Total" column:
# take the PAQ_A value where present, otherwise fall back to PAQ_C.
paq_source_columns = ["PAQ_A-PAQ_A_Total", "PAQ_C-PAQ_C_Total"]
paq_total = train[paq_source_columns[0]].combine_first(train[paq_source_columns[1]])

# Replace the two source columns with the merged one (appended as the last column)
train = train.drop(columns=paq_source_columns).assign(PAQ_Total=paq_total)

#print(train.info())
#print(train.describe())

In [7]:
# Feature Selection
# Percentage of missing entries per column (input to step 2)
missing_rate = train.isnull().mean() * 100

# Step 1: columns whose name contains "season" (case-insensitive)
exclude_season = [col for col in train.columns if "season" in col.lower()]

# Step 2: columns with more than 70% missing values
exclude_missing = missing_rate.index[missing_rate > 70].tolist()

# Step 3: categorical columns derived from other features (e.g., "FGC-FGC_CU_Zone"),
# identified by "zone" in the name (case-insensitive)
exclude_zone = [col for col in train.columns if "zone" in col.lower()]

# Step 4: the PCIAT season plus the individual PCIAT item columns, i.e. names
# ending in two digits; "PCIAT-PCIAT_Total" is kept for ridge regression
exclude_PCIAT = ["PCIAT-Season"]
exclude_PCIAT.extend(
    col for col in train.columns
    if "PCIAT-PCIAT_" in col and col[-2:].isdigit()
)

# Drop everything collected above in one pass
columns_to_drop = [*exclude_season, *exclude_missing, *exclude_zone, *exclude_PCIAT]
train = train.drop(columns=columns_to_drop)

#print(train.info())
#print(train.describe())

In [None]:
# Handling Outliers
# Rows with these ids are treated as outliers and removed
outliers_id = ['cedf96c5', 'e252dcb6', '83525bbe']
is_outlier = train['id'].isin(outliers_id)

# Keep the remaining rows; the id column is no longer needed afterwards
train = train.loc[~is_outlier].drop(columns='id')

#print(train.info())
#print(train.describe())

In [11]:
# Handling Missing Data
# Step 1: Keep only the rows whose "sii" label is present
has_label = train["sii"].notna()
train = train.loc[has_label]

# Step 2: Imputation:
# 1) Method1: Imputed missing values in other columns using regression based on age
def fill_missing_values_with_regression(train, features, predictor):
    """Impute missing values with a per-column linear regression on one predictor.

    For every column in ``features`` that contains missing values, a
    ``LinearRegression`` is fitted on the rows where both the column and the
    predictor are observed, and its predictions fill the column's missing
    entries.

    Parameters
    ----------
    train : pandas.DataFrame
        Input data. It is not modified; a copy is imputed and returned.
    features : iterable of str
        Columns to impute. Columns without missing values are skipped, and so
        is ``predictor`` itself if it is listed.
    predictor : str
        Column used as the single regressor.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with imputed values. Entries whose predictor value is
        missing cannot be predicted and are left missing. A column is left
        untouched (and a message is printed) when fewer than two rows are
        available to fit the regression.
    """
    train_imputed = train.copy()

    # The predictor is never imputed here, so this mask stays valid for the whole loop.
    predictor_known = train_imputed[predictor].notna()

    for feature in features:
        # A column cannot be regressed on itself.
        if feature == predictor:
            continue

        # Skip columns with no missing values
        missing_mask = train_imputed[feature].isna()
        if not missing_mask.any():
            continue

        # Fit only on rows where target and predictor are both observed;
        # LinearRegression does not accept missing values.
        fit_mask = ~missing_mask & predictor_known
        if fit_mask.sum() < 2:  # Skip if insufficient data
            print(f"Skipping feature '{feature}' due to insufficient data.")
            continue

        model = LinearRegression()
        model.fit(
            train_imputed.loc[fit_mask, [predictor]],
            train_imputed.loc[fit_mask, feature],
        )

        # Only rows with a known predictor value can be predicted.
        fill_mask = missing_mask & predictor_known
        if fill_mask.any():
            train_imputed.loc[fill_mask, feature] = model.predict(
                train_imputed.loc[fill_mask, [predictor]]
            )

    return train_imputed

# 2) Method2: Imputed missing values in continuous variables - KNN; missing values in categorical variables - random forest
def fill_missing_values_with_knn(train, continuous_features, n_neighbors=5):
    """Impute missing values in continuous columns with KNN on standardized data.

    The selected columns are standardized with ``StandardScaler``, imputed with
    ``KNNImputer`` and transformed back to their original scale.

    Parameters
    ----------
    train : pandas.DataFrame
        Input data. It is not modified; a copy is imputed and returned.
    continuous_features : list of str
        Columns to impute.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` whose imputable continuous columns are replaced by
        the imputed values. Columns without a single observed value are left
        as they are, and the copy is returned unchanged when no column can be
        imputed.
    """
    train_imputed = train.copy()

    # KNNImputer discards columns that have no observed value at fit time, which
    # would make the output narrower than the input and break the
    # inverse-transform / assignment below, so such columns are excluded up front.
    imputable = [col for col in continuous_features if train[col].notna().any()]
    if not imputable:
        return train_imputed

    # Scale continuous features so every column contributes comparably to distances
    scaler = StandardScaler()
    train_continuous_scaled = scaler.fit_transform(train[imputable])

    # Apply KNN imputation
    imputer = KNNImputer(n_neighbors=n_neighbors)
    train_continuous_imputed = imputer.fit_transform(train_continuous_scaled)

    # Replace the continuous features in the dataset with the imputed values,
    # mapped back to the original scale
    train_imputed[imputable] = scaler.inverse_transform(train_continuous_imputed)

    return train_imputed

def fill_missing_values_with_random_forest(train, category_features, continuous_features,
                                           mi_threshold=0.08, corr_threshold=0.8):
    """Impute missing categorical values with a random forest per column.

    For every categorical column that has missing values:

    1. Mutual information between the column and each candidate predictor
       (all continuous features and all other categorical features) is
       estimated on the rows where the column is observed.
    2. Predictors with mutual information above ``mi_threshold`` are kept.
    3. Among predictors whose absolute pairwise correlation exceeds
       ``corr_threshold``, the one with the lower mutual information is dropped.
    4. A ``RandomForestClassifier`` is fitted on the remaining predictors and
       its predictions fill the missing entries.

    Columns are processed in the order given, so values imputed for an earlier
    column are visible when a later column is processed.

    Parameters
    ----------
    train : pandas.DataFrame
        Input data. It is not modified; a copy is imputed and returned.
    category_features : list of str
        Categorical columns to impute.
    continuous_features : list of str
        Continuous candidate predictors.
        NOTE(review): presumably these must already be free of missing values
        (the notebook passes the KNN-imputed frame) - confirm for other callers.
    mi_threshold : float, default 0.08
        Minimum mutual information for a predictor to be used.
    corr_threshold : float, default 0.8
        Absolute correlation above which two predictors count as collinear.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with imputed categorical values. A column is left
        untouched (and a message is printed) when it has fewer than two
        observed values or when no predictor passes the selection.
    """
    train_imputed = train.copy()

    for feature in category_features:
        # Skip features with no missing values
        missing_rows = train_imputed[feature].isna()
        if not missing_rows.any():
            continue

        # Rows where the feature is observed: the only rows usable for fitting
        labelled = train_imputed.dropna(subset=[feature])
        if len(labelled) < 2:
            print(f"Skipping feature '{feature}' due to insufficient data.")
            continue

        mi_scores = {}

        # Mutual information with continuous features. The estimator adds random
        # noise to continuous inputs, so it is seeded to keep the predictor
        # selection reproducible.
        if len(continuous_features) > 0:
            mi_continuous = mutual_info_classif(
                labelled[continuous_features],
                labelled[feature],
                discrete_features=False,
                random_state=SEED,
            )
            mi_scores.update(zip(continuous_features, mi_continuous))

        # Mutual information with the other categorical features, estimated on
        # rows where all of them are observed. Skipped when there is nothing to
        # score, because the estimator rejects empty input.
        other_category_features = [col for col in category_features if col != feature]
        if other_category_features:
            complete = labelled.dropna(subset=other_category_features)
            if not complete.empty:
                mi_category = mutual_info_classif(
                    complete[other_category_features],
                    complete[feature],
                    discrete_features=True,
                )
                mi_scores.update(zip(other_category_features, mi_category))

        # Rank candidates by mutual information (highest first) and apply the threshold
        ranked = sorted(mi_scores, key=mi_scores.get, reverse=True)
        filtered_predictors = [name for name in ranked if mi_scores[name] > mi_threshold]
        if not filtered_predictors:
            print(f"Skipping feature '{feature}' because no predictor passed selection.")
            continue

        # Handle multicollinearity: the candidates are ordered by decreasing
        # mutual information, so in every highly correlated pair the later one
        # is the weaker predictor. Visiting each pair exactly once guarantees
        # that two predictors with equal mutual information are never both dropped.
        corr_matrix = labelled[filtered_predictors].corr().abs()
        candidates = list(corr_matrix.columns)
        features_to_drop = set()
        for position, stronger in enumerate(candidates):
            for weaker in candidates[position + 1:]:
                if corr_matrix.loc[weaker, stronger] > corr_threshold:
                    features_to_drop.add(weaker)
        final_predictors = [col for col in filtered_predictors if col not in features_to_drop]
        if not final_predictors:
            print(f"Skipping feature '{feature}' because no predictor passed selection.")
            continue

        # Train a Random Forest classifier.
        # NOTE(review): the 20% hold-out is never evaluated; the split is kept so
        # the fitted model stays the same as before - consider scoring on it or
        # fitting on all labelled rows.
        X = labelled[final_predictors]
        y = labelled[feature]
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=SEED)
        model = RandomForestClassifier(random_state=SEED)
        model.fit(X_train, y_train)

        # Predict missing values in train.
        # NOTE(review): categorical predictors that are not imputed yet can still
        # contain missing values here; whether the forest accepts them depends on
        # the installed scikit-learn version - verify.
        train_imputed.loc[missing_rows, feature] = model.predict(
            train_imputed.loc[missing_rows, final_predictors]
        )

    return train_imputed

# Split the feature columns into categorical and continuous ones.
# A field counts as categorical when its "Type" entry in the data dictionary
# contains "categorical" (case-insensitive); every other non-target column of
# train is treated as continuous.
data_dict = pd.read_csv("dataset/data_dictionary.csv")
is_categorical = data_dict["Type"].str.contains("categorical", case=False, na=False)
category_fields = data_dict.loc[is_categorical, "Field"]

feature_columns = [col for col in train.columns if col not in target]
category_features = [col for col in feature_columns if col in category_fields.values]
continuous_features = [col for col in feature_columns if col not in category_fields.values]

# Imputation method 1: regress every non-target column on age
regression_features = [col for col in train.columns if col not in target]
train_re = fill_missing_values_with_regression(train, regression_features, "Basic_Demos-Age")

# Imputation method 2: KNN for the continuous columns first, then a random
# forest for the categorical columns on top of the KNN-imputed frame
train_knn = fill_missing_values_with_knn(train, continuous_features)
train_rf = fill_missing_values_with_random_forest(train_knn, category_features, continuous_features)

# Save both imputed datasets (without the index column)
for imputed_frame, output_path in (
    (train_re, 'dataset/train_re.csv'),
    (train_rf, 'dataset/train_rf.csv'),
):
    imputed_frame.to_csv(output_path, index=False)

#print(train_re.info())
#print(train_rf.info())