In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and test CSV files from the current working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Learn the class-name -> integer-code mapping from the training target.
# NOTE: this cell overwrites 'prognosis' in place, so re-running it without reloading
# df_train would fit the encoder on the already-encoded integers.
le = LabelEncoder().fit(df_train["prognosis"])

# Replace the textual 'prognosis' values with their integer codes
df_train["prognosis"] = le.transform(df_train["prognosis"])

# Keep the original class names; the position of a name in this array is its integer code,
# which is what lets predictions be converted back to text later on
target_names = le.classes_

In [4]:
# Preview the first rows of the training frame; 'prognosis' now holds the LabelEncoder's integer codes
df_train.head()

Unnamed: 0,id,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
2,2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,3
3,3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,6


In [5]:
# Show the original class names; a name's position in this array is its encoded integer label
target_names

array(['Chikungunya', 'Dengue', 'Japanese_encephalitis', 'Lyme_disease',
       'Malaria', 'Plague', 'Rift_Valley_fever', 'Tungiasis',
       'West_Nile_fever', 'Yellow_Fever', 'Zika'], dtype=object)

In [6]:
# Build the model inputs: every training column except the identifier and the target
X = df_train.drop(columns=["id", "prognosis"])

# The (already label-encoded) target
y = df_train["prognosis"]

# Test features: everything except the identifier
X_test = df_test.drop(columns="id")

In [7]:
# List the feature columns that remain after dropping 'id' and 'prognosis'
X.columns

Index(['sudden_fever', 'headache', 'mouth_bleed', 'nose_bleed', 'muscle_pain',
       'joint_pain', 'vomiting', 'rash', 'diarrhea', 'hypotension',
       'pleural_effusion', 'ascites', 'gastro_bleeding', 'swelling', 'nausea',
       'chills', 'myalgia', 'digestion_trouble', 'fatigue', 'skin_lesions',
       'stomach_pain', 'orbital_pain', 'neck_pain', 'weakness', 'back_pain',
       'weight_loss', 'gum_bleed', 'jaundice', 'coma', 'diziness',
       'inflammation', 'red_eyes', 'loss_of_appetite', 'urination_loss',
       'slow_heart_rate', 'abdominal_pain', 'light_sensitivity', 'yellow_skin',
       'yellow_eyes', 'facial_distortion', 'microcephaly', 'rigor',
       'bitter_tongue', 'convulsion', 'anemia', 'cocacola_urine',
       'hypoglycemia', 'prostraction', 'hyperpyrexia', 'stiff_neck',
       'irritability', 'confusion', 'tremor', 'paralysis', 'lymph_swells',
       'breathing_restriction', 'toe_inflammation', 'finger_inflammation',
       'lips_irritation', 'itchiness', 'ulcers',

In [8]:
def average_precision(y_true_row, pred_indices, k=3):
    """
    Calculate the average precision at k (AP@k) of the predicted labels for a single sample.

    Args:
    y_true_row (numpy.ndarray): A 1D binary array representing the true target labels
                                for a single sample, where 1 indicates a correct class.
    pred_indices (sequence of int): Indices of the predicted classes in descending order of
                                    probability. Only the first k entries are scored; any
                                    further entries are ignored.
    k (int, optional): The number of top predictions to consider for the average precision
                       calculation. Default is 3.

    Returns:
    float: The average precision for the given sample. Returns 0.0 when the sample has no
           positive label or when k is not positive (instead of dividing by zero).
    """

    # Without any relevant label (or with a non-positive cutoff) nothing can be scored
    num_relevant = np.count_nonzero(y_true_row)
    if num_relevant == 0 or k <= 0:
        return 0.0

    num_correct = 0
    precision_sum = 0.0

    # Score only the first k predictions so that hits beyond rank k never contribute
    for rank, idx in enumerate(pred_indices[:k], start=1):

        # A hit adds the precision measured at its rank: hits so far / predictions so far
        if y_true_row[idx] == 1:
            num_correct += 1
            precision_sum += num_correct / rank

    # Normalize by the best achievable number of hits within the top k
    return precision_sum / min(k, num_relevant)

In [9]:
def evaluate_model(name, model, X, y, X_test, n_splits=10):
    """
    Cross-validate a classifier and report MAP@3 together with top-1 precision, recall and F1.

    The model is re-fitted on every fold of a shuffled StratifiedKFold split. No files are
    written and the test data is not touched.

    Args:
    name (str): Model name. Kept for call compatibility; not used by this function.
    model (sklearn.base.BaseEstimator): A classifier exposing ``fit``, ``predict_proba`` and
                                        ``classes_``. It is left fitted on the last fold.
    X (pandas.DataFrame): The training input data.
    y (pandas.Series): The training target data.
    X_test (pandas.DataFrame): The test input data. Kept for call compatibility; not used.
    n_splits (int, optional): The number of cross-validation splits to be performed. Default is 10.

    Returns:
    tuple: ``(map3_scores, mean_precision, mean_recall, mean_f1)`` where ``map3_scores`` is the
           list of per-fold MAP@3 values rounded to 5 decimal places and the other three are
           the fold averages of the weighted top-1 precision, recall and F1 score.
    """

    # Per-fold metric values
    map3s, precisions, recalls, f1s = [], [], [], []

    # Stratified folds keep the class proportions similar in every split
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=500)

    for train_index, test_index in skf.split(X, y):

        # Split the data into training and validation parts for the current fold
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        # Train on the fold's training part and score its validation part
        model.fit(X_train_cv, y_train_cv)
        y_pred_proba = model.predict_proba(X_test_cv)

        # The columns of predict_proba follow model.classes_, so both the one-hot truth
        # matrix and the predicted labels are derived from that same ordering. Building the
        # one-hot matrix from the fold's own labels would shift columns whenever a class is
        # missing from the validation part.
        y_true = y_test_cv.to_numpy()
        class_labels = np.asarray(model.classes_).astype(y_true.dtype, copy=False)
        y_true_one_hot = (y_true[:, None] == class_labels[None, :]).astype(int)

        # Column positions of the 3 most probable classes, best first
        top_3_indices = np.argsort(-y_pred_proba, axis=1)[:, :3]

        # The single most probable class, expressed as a label rather than a column position
        top_1_labels = class_labels[top_3_indices[:, 0]]

        # Weighted top-1 classification metrics
        precision = precision_score(y_test_cv, top_1_labels, average="weighted")
        recall = recall_score(y_test_cv, top_1_labels, average="weighted")
        f1 = f1_score(y_test_cv, top_1_labels, average="weighted")

        # Average precision per validation sample; a sample whose label the model never saw
        # has an all-zero truth row and scores 0
        average_precisions = np.array([
            average_precision(true_row, predicted_row) if true_row.any() else 0.0
            for true_row, predicted_row in zip(y_true_one_hot, top_3_indices)
        ])

        # Mean average precision (MAP@3) of the fold
        map3 = average_precisions.mean()

        map3s.append(map3)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # Per-fold MAP@3 rounded to 5 decimal places, plus the fold averages of the other metrics
    return [round(value, 5) for value in map3s], np.mean(precisions), np.mean(recalls), np.mean(f1s)

In [10]:
# Initialize the models

# Number of input features
input_dim = X.shape[1]

# One shared seed so that every stochastic model is reproducible and the value is set in one place
RANDOM_STATE = 500

# Candidate classifiers keyed by the name used in the report and in the submission file name.
# KNeighborsClassifier takes no seed.
models = {
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(silent=True, random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(random_state=RANDOM_STATE),
}

In [11]:
# For every candidate model: cross-validate, refit on all training data, write a submission
# file with the 3 most probable classes per test row, and print the cross-validation metrics.
for name, model in models.items():
    map3_scores, precision, recall, f1 = evaluate_model(name, model, X, y, X_test)
    mean_map3 = np.mean(map3_scores)
    std = np.std(map3_scores)

    # Refit on the full training set before predicting the test set
    model.fit(X, y)

    # Predict probabilities for the test set
    y_test_pred_proba = model.predict_proba(X_test)

    # Column positions of the 3 most probable classes, best first
    top3_indices = np.argsort(-y_test_pred_proba, axis=1)[:, :3]

    # predict_proba columns follow model.classes_, so translate column positions into the
    # integer-encoded labels before handing them to the LabelEncoder
    encoded_top3 = np.asarray(model.classes_).astype(int)[top3_indices]

    # Convert the encoded labels back to the original target names in a single call
    transformed_labels = le.inverse_transform(encoded_top3.ravel()).reshape(encoded_top3.shape)

    # Create a new DataFrame with the id and the space-separated top 3 targets
    results_df = pd.DataFrame({"id": df_test["id"], "prognosis": [" ".join(targets) for targets in transformed_labels]})

    # Save the output DataFrame to a CSV file
    results_df.to_csv(f"submissions_{name}.csv", index=False)

    print(f"Model: {name}")
    print(f"MAP@3 Scores: {map3_scores}")
    print(f"Average MAP@3: {mean_map3:.5f}")
    print(f"Std. Deviation: {std:.5f}")
    print(f"Precision Score: {precision:.5f}")
    print(f"Recall Score: {recall:.5f}")
    print(f"F1 Score: {f1:.5f}")
    print()

Model: LightGBM
MAP@3 Scores: [0.36854, 0.3615, 0.46479, 0.42019, 0.38263, 0.39906, 0.37559, 0.39762, 0.37857, 0.45952]
Average MAP@3: 0.40080
Std. Deviation: 0.03461
Precision Score: 0.25412
Recall Score: 0.27577
F1 Score: 0.25703

Model: XGBoost
MAP@3 Scores: [0.35915, 0.40845, 0.45775, 0.40376, 0.40845, 0.38263, 0.38028, 0.39762, 0.41429, 0.45238]
Average MAP@3: 0.40648
Std. Deviation: 0.02896
Precision Score: 0.25799
Recall Score: 0.27730
F1 Score: 0.26070

Model: CatBoost
MAP@3 Scores: [0.44131, 0.41549, 0.51174, 0.40845, 0.46009, 0.47887, 0.33568, 0.35476, 0.41429, 0.48571]
Average MAP@3: 0.43064
Std. Deviation: 0.05356
Precision Score: 0.29863
Recall Score: 0.31256
F1 Score: 0.29010

Model: RandomForest
MAP@3 Scores: [0.39437, 0.42488, 0.47653, 0.43192, 0.40376, 0.50469, 0.40845, 0.39762, 0.41905, 0.46667]
Average MAP@3: 0.43279
Std. Deviation: 0.03554
Precision Score: 0.27262
Recall Score: 0.30692
F1 Score: 0.28031

Model: KNN
MAP@3 Scores: [0.34742, 0.34977, 0.41784, 0.35915, 