In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
df = pd.read_csv('monkeypox.csv')

In [4]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [5]:
# Explore the dataset
print("Shape of the dataset:\n", df.shape)
print("Info of the dataset:\n", df.info())
print("Summary statistics:\n", df.describe())
print("Checking for missing values:\n", df.isnull().sum())

Shape of the dataset:
 (25000, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Patient_ID                      25000 non-null  object
 1   Systemic Illness                18784 non-null  object
 2   Rectal Pain                     25000 non-null  bool  
 3   Sore Throat                     25000 non-null  bool  
 4   Penile Oedema                   25000 non-null  bool  
 5   Oral Lesions                    25000 non-null  bool  
 6   Solitary Lesion                 25000 non-null  bool  
 7   Swollen Tonsils                 25000 non-null  bool  
 8   HIV Infection                   25000 non-null  bool  
 9   Sexually Transmitted Infection  25000 non-null  bool  
 10  MonkeyPox                       25000 non-null  object
dtypes: bool(8), object(3)
memory usage: 781.4+ KB
Info of the dataset:
 Non

In [6]:
# Handle missing values if any (example: dropping missing values)
df.dropna(inplace=True)

In [7]:
df.shape

(18784, 11)

In [8]:
df = df.drop('Patient_ID', axis = 1)

In [9]:
# Encode categorical variables if any
# Assuming 'target' is the name of the target column and other categorical columns need encoding
label_encoders = {}
for column in df.select_dtypes(include=['object', 'bool']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [10]:
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
1,0,1,0,1,1,0,0,1,0,1
2,0,0,1,1,0,0,0,1,0,1
4,2,1,1,1,0,0,1,1,0,1
5,2,0,1,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,1


In [11]:
X = df.drop('MonkeyPox', axis = 1)
y = df['MonkeyPox']

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Function to train and evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return accuracy

In [14]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}

In [15]:
# Evaluate each model
model_accuracies = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name}")
    accuracy = evaluate_model(model, X_train, y_train, X_test, y_test)
    model_accuracies[model_name] = accuracy

Evaluating Random Forest
Accuracy: 0.6888474846952356
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.30      0.37      1173
           1       0.73      0.87      0.79      2584

    accuracy                           0.69      3757
   macro avg       0.62      0.58      0.58      3757
weighted avg       0.66      0.69      0.66      3757

Confusion Matrix:
 [[ 348  825]
 [ 344 2240]]
Evaluating Support Vector Machine
Accuracy: 0.7104072398190046
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.22      0.33      1173
           1       0.73      0.93      0.82      2584

    accuracy                           0.71      3757
   macro avg       0.66      0.58      0.57      3757
weighted avg       0.69      0.71      0.66      3757

Confusion Matrix:
 [[ 263  910]
 [ 178 2406]]
Evaluating K-Nearest Neighbors


In [None]:
# Display model accuracies
print("Model Accuracies:", model_accuracies)

Model Accuracies: {'Random Forest': 0.6909768432259782, 'Support Vector Machine': 0.7104072398190046, 'K-Nearest Neighbors': 0.660367314346553, 'Logistic Regression': 0.7000266169816343, 'Decision Tree': 0.6861857865318073}


In [None]:
# Hyperparameter tuning for the best model
best_model = SVC()

In [None]:
# Define hyperparameters for tuning
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [None]:
# Perform Grid Search
grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.5s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   3.9s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  37.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  46.5s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  41.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  56.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.0s
[CV] END ...................

In [None]:
# Display the best parameters
print("Best Parameters found: ", grid_search.best_params_)

NameError: name 'grid_search' is not defined

In [None]:
# Evaluate the best model with optimized hyperparameters
best_model = grid_search.best_estimator_
evaluate_model(best_model, X_train, y_train, X_test, y_test)

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(scores, annot=True, cmap='viridis')
plt.title('Hyperparameter Tuning Results')
plt.xlabel('Gamma')
plt.ylabel('C')
plt.show()