# MEDHACK Beginner Kaggle Competition

In [33]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


pd.set_option('display.max_columns', None)

In [19]:
df = pd.read_csv('train_data.csv')
X_test = pd.read_csv('test_data.csv')

In [18]:
X = df.drop('state_label', axis=1)  # Features (input data)
y = df['state_label']  # Target (output data)

# Perform train-test split (80% train, 20% test by default)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=7)

# Print the shapes of the resulting datasets
print(f"Training data (features) shape: {X_train.shape}")
print(f"Testing data (features) shape: {X_val.shape}")
print(f"Training data (target) shape: {y_train.shape}")
print(f"Testing data (target) shape: {y_val.shape}")

Training data (features) shape: (9644544, 15)
Testing data (features) shape: (2411136, 15)
Training data (target) shape: (9644544,)
Testing data (target) shape: (2411136,)


In [23]:
training_features = X_train[
    [
        'age',
        'gender',
        'diastolic_bp',
        'systolic_bp',
        'heart_rate',
        'respiratory_rate',
        'oxygen_saturation'
    ]
]

# Display the data types of the selected columns
print(training_features.dtypes)

age                    int64
gender                object
diastolic_bp         float64
systolic_bp          float64
heart_rate           float64
respiratory_rate     float64
oxygen_saturation    float64
dtype: object


In [35]:
# Step 2: Define numeric and categorical features
numeric_features = ['age', 'diastolic_bp', 'systolic_bp', 'heart_rate', 'respiratory_rate', 'oxygen_saturation']
categorical_features = ['gender']

# Step 3: Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),  # Handle missing numeric values
            ('scaler', StandardScaler())  # Standardize numeric data
        ]), numeric_features),
        
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical values
            ('onehot', OneHotEncoder(drop='if_binary'))  # Encode binary categorical columns
        ]), categorical_features)
    ]
)

# Step 4: Define models (lightweight and quick to train)
models = {
    'Dummy Classifier': DummyClassifier(strategy='most_frequent'),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(32,), max_iter=200),  # Small hidden layer, quick training
    'Decision Tree': DecisionTreeClassifier(),
    # 'KNN': KNeighborsClassifier(n_neighbors=5)  # Default is 5 neighbors
}

In [36]:
# Step 5: Train models and evaluate
results = []

for model_name, model in models.items():
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred = pipeline.predict(X_val)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    
    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall
    })

# Step 6: Convert results to a DataFrame
results_df = pd.DataFrame(results)
print("\nModel Performance Metrics:")
print(results_df)

# Step 7: Plot the metrics
metrics_to_plot = ['Accuracy', 'F1 Score', 'Precision', 'Recall']

for metric in metrics_to_plot:
    plt.figure(figsize=(8, 5))
    plt.bar(results_df['Model'], results_df[metric], color='skyblue')
    plt.title(f'Model Comparison: {metric}')
    plt.ylabel(metric)
    plt.xlabel('Model')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.show()

  _warn_prf(average, modifier, msg_start, len(result))
