In [15]:
# Import all the tools you will need

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import average_precision_score

In [3]:
# Load the data
df = pd.read_csv("train.csv")
df.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
sudden_fever,1.0,0.0,0.0,0.0,0.0
headache,1.0,0.0,1.0,0.0,0.0
mouth_bleed,0.0,0.0,1.0,1.0,0.0
nose_bleed,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...
ulcers,0.0,0.0,0.0,0.0,1.0
toenail_loss,0.0,0.0,1.0,0.0,1.0
speech_problem,0.0,0.0,1.0,0.0,0.0
bullseye_rash,0.0,0.0,1.0,0.0,0.0


In [4]:
# The prognosis column can be turned into a categories 
# check if there are any missing values
df.isna().sum()

id                0
sudden_fever      0
headache          0
mouth_bleed       0
nose_bleed        0
                 ..
ulcers            0
toenail_loss      0
speech_problem    0
bullseye_rash     0
prognosis         0
Length: 66, dtype: int64

In [5]:
df.dtypes

id                  int64
sudden_fever      float64
headache          float64
mouth_bleed       float64
nose_bleed        float64
                   ...   
ulcers            float64
toenail_loss      float64
speech_problem    float64
bullseye_rash     float64
prognosis          object
Length: 66, dtype: object

In [6]:
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

prognosis


In [7]:
# Turn the prognosis column into categories
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype("category").cat.as_ordered()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 66 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   id                     707 non-null    int64   
 1   sudden_fever           707 non-null    float64 
 2   headache               707 non-null    float64 
 3   mouth_bleed            707 non-null    float64 
 4   nose_bleed             707 non-null    float64 
 5   muscle_pain            707 non-null    float64 
 6   joint_pain             707 non-null    float64 
 7   vomiting               707 non-null    float64 
 8   rash                   707 non-null    float64 
 9   diarrhea               707 non-null    float64 
 10  hypotension            707 non-null    float64 
 11  pleural_effusion       707 non-null    float64 
 12  ascites                707 non-null    float64 
 13  gastro_bleeding        707 non-null    float64 
 14  swelling               707 non-null    flo

In [29]:
df.prognosis.cat.categories

Index(['Chikungunya', 'Dengue', 'Japanese_encephalitis', 'Lyme_disease',
       'Malaria', 'Plague', 'Rift_Valley_fever', 'Tungiasis',
       'West_Nile_fever', 'Yellow_Fever', 'Zika'],
      dtype='object')

In [9]:
# Split the data into X and Y
X = df.drop("prognosis", axis=1)
y = df.prognosis

In [10]:
# Split into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
# Import all the models
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier

In [12]:
# Create a models dictionary
models = {"LinearSVC": LinearSVC(),
          "LogisticRegression": LogisticRegression(),
          "KNN": KNeighborsClassifier(),
          "RandomForestCls": RandomForestClassifier(),
          "AdaBoostClf": AdaBoostClassifier()}
baseline_results = {}

Use this article to fix the bug below: 

https://towardsdatascience.com/mean-average-precision-at-k-map-k-clearly-explained-538d8e032d2

I used code from this tutorial here: 

https://www.kaggle.com/code/nandeshwar/mean-average-precision-map-k-metric-explained-code/notebook

In [30]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [31]:
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train); 
    # Score the model
    predictions = model.predict(X_test)
    baseline_results[name] = mapk(y_test, predictions)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
baseline_results

{'LinearSVC': 0.24884114209063904,
 'LogisticRegression': 0.3087650373138301,
 'KNN': 0.20048680974737312,
 'RandomForestCls': 0.3361952535317726,
 'AdaBoostClf': 0.2747225415987991}

Now, since we have established some baseline results we can focus on improving them. We will start with improving RandomForestClassifier. 

In [None]:
%%time
# Create a GridSearchCV for RandomForestClassifier
random_forest_clf_grid = {"n_estimators": np.arange(100, 2000, 100),
                          "max_depth": (5, 8, 15, 25, 30),
                          "min_samples_split": (2, 5, 10, 15, 100),
                          "min_samples_leaf": (1, 2, 5, 10)}
gs_random_forest_clf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                                    param_grid=random_forest_clf_grid,
                                    cv=5,
                                    verbose=True,
                                    n_jobs=-1)
gs_random_forest_clf.fit(X_train, y_train)