# COVID-19 Pediatric Dataset Analysis using Logistic Regression

This notebook applies logistic regression, a supervised learning technique, to the COVID-19 Pediatric Dataset to predict the SARS-CoV-2 exam result (positive or negative) from the dataset's numeric features.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

## Data Loading and Initial Exploration

In [None]:
# Load the dataset from the Excel workbook (the path is resolved relative
# to the notebook's working directory)
data = pd.read_excel('COVID19 Pediatric Dataset.xlsx')

# Display basic information about the dataset
print("Dataset Shape:", data.shape)
print("\nFirst few rows:")
display(data.head())

print("\nColumn information:")
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would render a stray "None".
data.info()

## Data Preprocessing

In [None]:
# Name of the label column, referenced several times in this cell
TARGET_COL = 'SARS-Cov-2 exam result'

# Check for missing values
print("Missing values in each column:")
display(data.isnull().sum())

# Drop rows with missing target variable. The explicit copy makes the
# column assignment below write to an independent frame instead of a
# possible view of the original (avoids chained-assignment warnings).
data = data.dropna(subset=[TARGET_COL]).copy()

# Encode target variable (positive: 1, negative: 0).
# The dtype guard keeps this cell idempotent: on a re-run the column is
# already 0/1, and mapping it through the string table again would turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(data[TARGET_COL]):
    # Normalise case and surrounding whitespace so 'Positive' or
    # ' negative ' are encoded the same way as the canonical spellings.
    labels = data[TARGET_COL].astype(str).str.strip().str.lower()
    encoded = labels.map({'positive': 1, 'negative': 0})

    # Fail early with a clear message rather than passing NaN labels on
    # to the model, where the error would be harder to trace.
    unmapped = sorted(labels[encoded.isna()].unique())
    if unmapped:
        raise ValueError(
            f"Unexpected values in '{TARGET_COL}': {unmapped}"
        )
    data[TARGET_COL] = encoded.astype('int64')

# Select features and target
# Only numeric columns are used as features; the encoded target is itself
# numeric, so it is removed from the feature matrix explicitly.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
X = data[numeric_columns].drop(columns=TARGET_COL, errors='ignore')
y = data[TARGET_COL]

# Columns with no observed values have a NaN mean, so mean-imputation
# would leave them entirely NaN and model fitting would fail; drop them.
X = X.dropna(axis=1, how='all')

# Fill remaining missing values with the column mean.
# NOTE: the means are computed over all rows, i.e. before any train/test
# split, so held-out rows contribute to the imputed values.
X = X.fillna(X.mean())

print("\nFeatures used for prediction:")
display(X.columns)

## Data Visualization

In [None]:
# Pairwise correlations between the prepared feature columns
feature_corr = X.corr()

# Render the correlation matrix as an annotated heatmap with the colour
# scale centred on zero
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

## Model Training and Evaluation

In [None]:
# Split the data into training and testing sets.
# Stratifying on the target keeps the positive/negative ratio the same in
# both folds. Stratification needs at least two classes with two samples
# each; otherwise fall back to a plain random split.
class_counts = y.value_counts()
can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Scale the features. The scaler is fitted on the training fold only and
# then applied to the test fold, so test statistics do not leak into it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred = log_reg.predict(X_test_scaled)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Create confusion matrix.
# Passing labels explicitly keeps the matrix 2x2 (rows/columns ordered
# negative, positive) even if one class is absent from the test fold.
class_labels = [0, 1]
class_names = ['Negative (0)', 'Positive (1)']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## Feature Importance Analysis

In [None]:
# Rank features by the magnitude of their fitted coefficient; the sign
# (direction of the effect) is discarded by taking the absolute value.
coef_magnitudes = np.abs(log_reg.coef_[0])
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': coef_magnitudes})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart, largest coefficient magnitude first
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance in Logistic Regression Model')
fig.tight_layout()
plt.show()