<a href="https://colab.research.google.com/github/ishakaran808/Covid-19-predictor/blob/main/Covid_19_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Load the dataset
df = pd.read_csv('dataset.csv')

# Drop unnecessary columns
df = df.drop(columns=['Patient ID'])

# Separate features and target
X = df.drop(columns=['SARS-Cov-2 exam result'])
y = df['SARS-Cov-2 exam result']

# Handle missing values
numeric_cols = X.select_dtypes(include=['number']).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

categorical_cols = X.select_dtypes(include=['object']).columns
X[categorical_cols] = X[categorical_cols].fillna(X[categorical_cols].mode().iloc[0])

# Encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[categorical_cols])
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Combine encoded columns with the original DataFrame
X = X.drop(columns=categorical_cols)
X = pd.concat([X, X_encoded_df], axis=1)

# Replace 'not_detected' with NaN and fill missing values
X.replace('not_detected', np.nan, inplace=True)
X.fillna(X.mean(), inplace=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balance the dataset using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Encode target labels
label_encoder = LabelEncoder()
y_resampled_encoded = label_encoder.fit_transform(y_resampled)

# Feature selection using RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_resampled, y_resampled)
selector = SelectFromModel(rf, prefit=True, threshold='mean')
X_selected = selector.transform(X_resampled)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_resampled_encoded, test_size=0.2, random_state=42)

# Model Training with Hyperparameter Tuning
xgb = XGBClassifier(eval_metric='logloss')

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'learning_rate': [0.01, 0.1, 1]
}

grid_search = GridSearchCV(xgb, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Predict on test data
y_pred_encoded = best_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Decode y_test for evaluation
y_test_decoded = label_encoder.inverse_transform(y_test)

# Evaluate the model
accuracy = accuracy_score(y_test_decoded, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print('Classification Report:')
print(classification_report(y_test_decoded, y_pred))

print('Confusion Matrix:')
print(confusion_matrix(y_test_decoded, y_pred))

# Cross-Validation Score
cross_val_scores = cross_val_score(best_model, X_selected, y_resampled_encoded, cv=5)
print(f'Cross-Validation Accuracy: {cross_val_scores.mean() * 100:.2f}%')

Accuracy: 83.11%
Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.86      0.84       307
    positive       0.84      0.80      0.82       297

    accuracy                           0.83       604
   macro avg       0.83      0.83      0.83       604
weighted avg       0.83      0.83      0.83       604

Confusion Matrix:
[[263  44]
 [ 58 239]]
Cross-Validation Accuracy: 82.13%


In [None]:
# Adjust the SMOTE strategy
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN()
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Re-encode the target variable
y_resampled_encoded = label_encoder.fit_transform(y_resampled)

# Split the data again
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled_encoded, test_size=0.2, random_state=42)

# Updated Hyperparameter Tuning with a wider grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 5, 10]  # Important for handling class imbalance
}

grid_search = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Predict on test data
y_pred_encoded = best_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Decode y_test for evaluation
y_test_decoded = label_encoder.inverse_transform(y_test)

# Evaluate the model
accuracy = accuracy_score(y_test_decoded, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print('Classification Report:')
print(classification_report(y_test_decoded, y_pred))

print('Confusion Matrix:')
print(confusion_matrix(y_test_decoded, y_pred))

# Cross-Validation Score
cross_val_scores = cross_val_score(best_model, X_resampled, y_resampled_encoded, cv=5, scoring='f1_weighted')
print(f'Cross-Validation Accuracy: {cross_val_scores.mean() * 100:.2f}%')


Fitting 5 folds for each of 3888 candidates, totalling 19440 fits
