In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
file_path = '/kaggle/input/dengue/dengue_dataset.csv'
dengue_data = pd.read_csv(file_path)
dengue_data.head()

Unnamed: 0,Age,Sex,Month Affected,Residence Address,Duration of fever (days),Current body temperature (°C),White Blood Cell (WBC) Count,Back pain in eyes,Joint pain,Muscle pain,...,Rash,Pleural effusion,Ascites,Bleeding,Slow heart rate,Igg,Igm,NS1,Travelling history,Headache
0,13,Male,May,"595 Dennis Fields Apt. 416\nGilbertfurt, MT 61961",11,38.2,11257,Low,Medium,High,...,Medium,Medium,Medium,Medium,Medium,Negative,Positive,Positive,Yes,High
1,4,Female,January,Unit 4051 Box 4377\nDPO AP 53265,8,38.7,9824,Low,Low,High,...,Medium,Medium,High,Low,Medium,Negative,Negative,Positive,Yes,High
2,68,Female,October,"6835 Eric Cliffs Apt. 831\nNew Rebeccamouth, K...",14,38.9,4731,High,Low,Medium,...,High,Low,Medium,Low,Low,Negative,Negative,Negative,Yes,Medium
3,73,Male,July,"5947 Tyler Forges\nPageview, ND 63430",11,37.8,5055,Medium,High,Medium,...,Low,High,High,Medium,High,Positive,Positive,Positive,No,Medium
4,31,Female,April,"73008 Brown Springs\nMichaelville, WI 90268",4,38.4,5755,High,Medium,Medium,...,Low,Medium,High,Low,High,Negative,Negative,Negative,Yes,Low


In [3]:
numerical_cols = ['Age', 'Duration of fever (days)', 'Current body temperature (°C)', 'White Blood Cell (WBC) Count']
desc_stats = dengue_data[numerical_cols].describe()
gender_distribution = dengue_data['Sex'].value_counts()
monthly_distribution = dengue_data['Month Affected'].value_counts()
symptoms = ['Back pain in eyes', 'Joint pain', 'Muscle pain', 'Rash']
symptoms_distribution = dengue_data[symptoms].apply(pd.value_counts)
print(desc_stats)
print(gender_distribution)
print(monthly_distribution)
print(symptoms_distribution)

              Age  Duration of fever (days)  Current body temperature (°C)  \
count  300.000000                 300.00000                     300.000000   
mean    48.100000                   7.91000                      38.336333   
std     28.702662                   4.20088                       0.987165   
min      1.000000                   1.00000                      36.500000   
25%     21.750000                   5.00000                      37.500000   
50%     46.500000                   8.00000                      38.300000   
75%     73.000000                  12.00000                      39.200000   
max     98.000000                  15.00000                      40.000000   

       White Blood Cell (WBC) Count  
count                    300.000000  
mean                    7443.440000  
std                     2639.825925  
min                     3084.000000  
25%                     5045.500000  
50%                     7423.000000  
75%                     9765.50

A brief overview of the first few entries in the dataset:

- **Age**: The age of the patient.
- **Sex**: The gender of the patient (Male/Female).
- **Month Affected**: The month in which the patient was affected.
- **Residence Address**: Address of the patient.
- **Duration of fever (days)**: How long the patient had a fever.
- **Current body temperature (°C)**: The current body temperature of the patient.
- **White Blood Cell (WBC) Count**: WBC count of the patient.
- **Symptoms and conditions**: Various columns such as 'Back pain in eyes', 'Joint pain', 'Muscle pain', etc., indicating the severity of these symptoms (Low/Medium/High).

1. Descriptive statistics of numerical features like age, duration of fever, body temperature, and WBC count.
2. Distribution of patients by gender.
3. Distribution of cases over months.
4. Frequency of different symptoms.

summary of the basic analysis of the Dengue dataset:

### Descriptive Statistics
- **Age**: Ranges from 1 to 98 years, with an average of approximately 48 years.
- **Duration of Fever**: Varies between 1 to 15 days, averaging around 7.9 days.
- **Body Temperature**: Ranges from 36.5°C to 40.0°C, with a mean of 38.3°C.
- **WBC Count**: Fluctuates between 3084 to 11976, with an average count of 7443.

### Gender Distribution
- **Male**: 162 cases
- **Female**: 138 cases

### Distribution Over Months
- The cases are fairly distributed throughout the year, with August having the highest number of cases (31) and September and July having the lowest (21 each).

### Frequency of Symptoms
- **Back Pain in Eyes**: High in 97 cases, Medium in 102 cases, Low in 101 cases.
- **Joint Pain**: High in 98 cases, Medium in 93 cases, Low in 109 cases.
- **Muscle Pain**: High in 100 cases, Medium in 96 cases, Low in 104 cases.
- **Rash**: High in 102 cases, Medium in 94 cases, Low in 104 cases.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
categorical_cols = dengue_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    dengue_data[col] = label_encoder.fit_transform(dengue_data[col])

y = dengue_data['Igm']
X = dengue_data.drop('Igm', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)

0.6166666666666667
              precision    recall  f1-score   support

           0       0.77      0.49      0.60        35
           1       0.53      0.80      0.63        25

    accuracy                           0.62        60
   macro avg       0.65      0.64      0.62        60
weighted avg       0.67      0.62      0.61        60



In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
categorical_cols = dengue_data.select_dtypes(include=['object']).columns

In [6]:
# Fit the encoder on the entire dataset
encoder = OneHotEncoder(sparse=False)
encoder.fit(dengue_data[categorical_cols])


X_train_categorical = encoder.transform(X_train[categorical_cols])
X_test_categorical = encoder.transform(X_test[categorical_cols])

# Create DataFrames from the encoded features
X_train_categorical_df = pd.DataFrame(X_train_categorical, columns=encoder.get_feature_names_out(categorical_cols))
X_test_categorical_df = pd.DataFrame(X_test_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns and concatenate the new one-hot encoded columns
X_train_encoded = X_train.drop(categorical_cols, axis=1).reset_index(drop=True)
X_train_encoded = pd.concat([X_train_encoded, X_train_categorical_df], axis=1)

X_test_encoded = X_test.drop(categorical_cols, axis=1).reset_index(drop=True)
X_test_encoded = pd.concat([X_test_encoded, X_test_categorical_df], axis=1)

# Ensure the train and test sets have the same features
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='inner', axis=1)

X_train_encoded.shape, X_test_encoded.shape



((240, 24), (60, 24))

# 1. Decision Tree Classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_encoded, y_train)
y_pred_dt = dt_classifier.predict(X_test_encoded)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(accuracy_dt)
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)

0.55
0.55
              precision    recall  f1-score   support

           0       0.63      0.54      0.58        35
           1       0.47      0.56      0.51        25

    accuracy                           0.55        60
   macro avg       0.55      0.55      0.55        60
weighted avg       0.56      0.55      0.55        60



# 2. Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Applying Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
# Training the model
rf_classifier.fit(X_train_encoded, y_train)
# Predicting on the test set
y_pred_rf = rf_classifier.predict(X_test_encoded)
# Classification Report
classification_report_rf = classification_report(y_test, y_pred_rf)

# Confusion Matrix
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print(classification_report_rf)
print(confusion_matrix_rf)

              precision    recall  f1-score   support

           0       0.74      0.40      0.52        35
           1       0.49      0.80      0.61        25

    accuracy                           0.57        60
   macro avg       0.61      0.60      0.56        60
weighted avg       0.63      0.57      0.55        60

[[14 21]
 [ 5 20]]


# 3. Support Vector Machine Classifier

In [9]:
from sklearn.svm import SVC

svm_classifier = SVC()

svm_classifier.fit(X_train_encoded, y_train)
y_pred_svm = svm_classifier.predict(X_test_encoded)
classification_report_svm = classification_report(y_test, y_pred_svm)
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)
print(classification_report_svm)
print(confusion_matrix_svm)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.42      1.00      0.59        25

    accuracy                           0.42        60
   macro avg       0.21      0.50      0.29        60
weighted avg       0.17      0.42      0.25        60

[[ 0 35]
 [ 0 25]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 4. KNeighbors Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier()

knn_classifier.fit(X_train_encoded, y_train)
y_pred_knn = knn_classifier.predict(X_test_encoded)
classification_report_knn = classification_report(y_test, y_pred_knn)
confusion_matrix_knn = confusion_matrix(y_test, y_pred_knn)
print(classification_report_knn)
print(confusion_matrix_knn)

              precision    recall  f1-score   support

           0       0.66      0.54      0.59        35
           1       0.48      0.60      0.54        25

    accuracy                           0.57        60
   macro avg       0.57      0.57      0.56        60
weighted avg       0.58      0.57      0.57        60

[[19 16]
 [10 15]]


# 5. XGBClassifier

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_classifier.fit(X_train_encoded, y_train_encoded)
y_pred_xgb = xgb_classifier.predict(X_test_encoded)
y_pred_xgb = label_encoder.inverse_transform(y_pred_xgb)

classification_report_xgb = classification_report(y_test, y_pred_xgb)
confusion_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

print(classification_report_xgb)
print(confusion_matrix_xgb)

              precision    recall  f1-score   support

           0       0.64      0.46      0.53        35
           1       0.46      0.64      0.53        25

    accuracy                           0.53        60
   macro avg       0.55      0.55      0.53        60
weighted avg       0.56      0.53      0.53        60

[[16 19]
 [ 9 16]]


# 6. Linear Discriminant Analysis

In [12]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix

pca = PCA(n_components=24)  
X_train_pca = pca.fit_transform(X_train_encoded)
X_test_pca = pca.transform(X_test_encoded)

lda_classifier = LinearDiscriminantAnalysis()
lda_classifier.fit(X_train_pca, y_train)
y_pred_lda = lda_classifier.predict(X_test_pca)

classification_report_lda = classification_report(y_test, y_pred_lda)
confusion_matrix_lda = confusion_matrix(y_test, y_pred_lda)

print(classification_report_lda)
print(confusion_matrix_lda)

              precision    recall  f1-score   support

           0       0.67      0.51      0.58        35
           1       0.48      0.64      0.55        25

    accuracy                           0.57        60
   macro avg       0.58      0.58      0.57        60
weighted avg       0.59      0.57      0.57        60

[[18 17]
 [ 9 16]]


# 7. Quadratic Discriminant Analysis

In [13]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix

pca = PCA(n_components=24)  
X_train_pca = pca.fit_transform(X_train_encoded)
X_test_pca = pca.transform(X_test_encoded)


qda_classifier = QuadraticDiscriminantAnalysis()
qda_classifier.fit(X_train_pca, y_train)
y_pred_qda = qda_classifier.predict(X_test_pca)

classification_report_qda = classification_report(y_test, y_pred_qda)
confusion_matrix_qda = confusion_matrix(y_test, y_pred_qda)

print(classification_report_qda)
print(confusion_matrix_qda)


              precision    recall  f1-score   support

           0       0.61      0.54      0.58        35
           1       0.45      0.52      0.48        25

    accuracy                           0.53        60
   macro avg       0.53      0.53      0.53        60
weighted avg       0.54      0.53      0.54        60

[[19 16]
 [12 13]]


# 8. Gaussian Process Classifier

In [14]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import classification_report, confusion_matrix

gp_classifier = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))

# Training the model
gp_classifier.fit(X_train_encoded, y_train)

# Predicting on the test set
y_pred_gp = gp_classifier.predict(X_test_encoded)

# Classification Report and Confusion Matrix
classification_report_gp = classification_report(y_test, y_pred_gp)
confusion_matrix_gp = confusion_matrix(y_test, y_pred_gp)

print(classification_report_gp)
print(confusion_matrix_gp)

              precision    recall  f1-score   support

           0       0.58      1.00      0.74        35
           1       0.00      0.00      0.00        25

    accuracy                           0.58        60
   macro avg       0.29      0.50      0.37        60
weighted avg       0.34      0.58      0.43        60

[[35  0]
 [25  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 9. AdaBoostClassifier

In [15]:
from sklearn.ensemble import AdaBoostClassifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


file_path = '/kaggle/input/dengue/dengue_dataset.csv'
dengue_data = pd.read_csv(file_path)

categorical_cols = dengue_data.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse=False)
X_categorical = encoder.fit_transform(dengue_data[categorical_cols])


X_categorical_df = pd.DataFrame(X_categorical, columns=encoder.get_feature_names_out(categorical_cols))

X_encoded = dengue_data.drop(categorical_cols, axis=1)
X_encoded = pd.concat([X_encoded, X_categorical_df], axis=1)


y = dengue_data['Igm']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(X_train_encoded, y_train_encoded)
y_pred_ada = ada_classifier.predict(X_test_encoded)
classification_report_ada = classification_report(y_test_encoded, y_pred_ada)
confusion_matrix_ada = confusion_matrix(y_test_encoded, y_pred_ada)

print(classification_report_ada)
print(confusion_matrix_ada)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        25

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

[[35  0]
 [ 0 25]]


# 10. LGBMClassifier

In [16]:
import re
def clean_column_names(column_name):
    return re.sub(r'\W', '_', column_name).strip('_')

X_train_encoded.columns = [clean_column_names(col) for col in X_train_encoded.columns]
X_test_encoded.columns = [clean_column_names(col) for col in X_test_encoded.columns]

lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train_encoded, y_train)
y_pred_lgbm = lgbm_classifier.predict(X_test_encoded)
classification_report_lgbm = classification_report(y_test, y_pred_lgbm)
confusion_matrix_lgbm = confusion_matrix(y_test, y_pred_lgbm)

print(classification_report_lgbm)
print(confusion_matrix_lgbm)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        25

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

[[35  0]
 [ 0 25]]


# 11. CatBoostClassifier

In [17]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

catboost_classifier = CatBoostClassifier(verbose=0)
catboost_classifier.fit(X_train_encoded, y_train)
y_pred_catboost = catboost_classifier.predict(X_test_encoded)

classification_report_catboost = classification_report(y_test, y_pred_catboost)
confusion_matrix_catboost = confusion_matrix(y_test, y_pred_catboost)

print(classification_report_catboost)
print(confusion_matrix_catboost)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        25

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

[[35  0]
 [ 0 25]]


# 12. Nearest Centroid Classifier

In [18]:
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report, confusion_matrix
nc_classifier = NearestCentroid()

nc_classifier.fit(X_train_encoded, y_train)

y_pred_nc = nc_classifier.predict(X_test_encoded)
classification_report_nc = classification_report(y_test, y_pred_nc)
confusion_matrix_nc = confusion_matrix(y_test, y_pred_nc)

print(classification_report_nc)
print(confusion_matrix_nc)

              precision    recall  f1-score   support

           0       0.75      0.60      0.67        35
           1       0.56      0.72      0.63        25

    accuracy                           0.65        60
   macro avg       0.66      0.66      0.65        60
weighted avg       0.67      0.65      0.65        60

[[21 14]
 [ 7 18]]


# 13. Ridge Classifier

In [19]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix

ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train_encoded, y_train)
y_pred_ridge = ridge_classifier.predict(X_test_encoded)
classification_report_ridge = classification_report(y_test, y_pred_ridge)
confusion_matrix_ridge = confusion_matrix(y_test, y_pred_ridge)
print(classification_report_ridge)
print(confusion_matrix_ridge)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        25

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

[[35  0]
 [ 0 25]]


# 14. PassiveAggressiveClassifier

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your dataset
file_path = '/kaggle/input/dengue/dengue_dataset.csv'
data = pd.read_csv(file_path)

label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

y = data['Igm']
X = data.drop('Igm', axis=1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

pa_classifier = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
pa_classifier.fit(X_train, y_train)

y_pred_pa = pa_classifier.predict(X_test)
classification_report_pa = classification_report(y_test, y_pred_pa)
confusion_matrix_pa = confusion_matrix(y_test, y_pred_pa)
print(classification_report_pa)
print(confusion_matrix_pa)

              precision    recall  f1-score   support

           0       0.62      0.57      0.60        35
           1       0.46      0.52      0.49        25

    accuracy                           0.55        60
   macro avg       0.54      0.55      0.54        60
weighted avg       0.56      0.55      0.55        60

[[20 15]
 [12 13]]


# 15. Decision Stump

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

file_path = '/kaggle/input/dengue/dengue_dataset.csv'
data = pd.read_csv(file_path)

label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

y = data['Igm']
X = data.drop('Igm', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
decision_stump = DecisionTreeClassifier(max_depth=1, random_state=42)

decision_stump.fit(X_train, y_train)
y_pred_stump = decision_stump.predict(X_test)
classification_report_stump = classification_report(y_test, y_pred_stump)
confusion_matrix_stump = confusion_matrix(y_test, y_pred_stump)

print(classification_report_stump)
print(confusion_matrix_stump)


              precision    recall  f1-score   support

           0       0.60      0.51      0.55        35
           1       0.43      0.52      0.47        25

    accuracy                           0.52        60
   macro avg       0.52      0.52      0.51        60
weighted avg       0.53      0.52      0.52        60

[[18 17]
 [12 13]]


# 16. Bagging (Bootstrap Aggregating)

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

file_path = '/kaggle/input/dengue/dengue_dataset.csv'
data = pd.read_csv(file_path)

label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

y = data['Igm']
X = data.drop('Igm', axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
bagging_classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(), 
                                       n_estimators=10000, random_state=42)
bagging_classifier.fit(X_train, y_train)

y_pred_bagging = bagging_classifier.predict(X_test)

classification_report_bagging = classification_report(y_test, y_pred_bagging)
confusion_matrix_bagging = confusion_matrix(y_test, y_pred_bagging)

print(classification_report_bagging)
print(confusion_matrix_bagging)




              precision    recall  f1-score   support

           0       0.80      0.46      0.58        35
           1       0.53      0.84      0.65        25

    accuracy                           0.62        60
   macro avg       0.66      0.65      0.61        60
weighted avg       0.69      0.62      0.61        60

[[16 19]
 [ 4 21]]


# 17. Elastic Net

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop('Igm', axis=1))
y = data['Igm']

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
elastic_net_classifier = ElasticNet()
elastic_net_classifier.fit(X_train, y_train)
y_pred_en = elastic_net_classifier.predict(X_test)
threshold = 0.512 
y_pred_en_class = (y_pred_en > threshold).astype(int)

classification_report_en = classification_report(y_test, y_pred_en_class)
confusion_matrix_en = confusion_matrix(y_test, y_pred_en_class)

print(classification_report_en)
print(confusion_matrix_en)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.42      1.00      0.59        25

    accuracy                           0.42        60
   macro avg       0.21      0.50      0.29        60
weighted avg       0.17      0.42      0.25        60

[[ 0 35]
 [ 0 25]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 18. Gaussian Naive Bayes

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

y = data['Igm']
X = data.drop('Igm', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred_gnb = gnb_classifier.predict(X_test)

classification_report_gnb = classification_report(y_test, y_pred_gnb)
confusion_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)

print(classification_report_gnb)
print(confusion_matrix_gnb)

              precision    recall  f1-score   support

           0       0.74      0.40      0.52        35
           1       0.49      0.80      0.61        25

    accuracy                           0.57        60
   macro avg       0.61      0.60      0.56        60
weighted avg       0.63      0.57      0.55        60

[[14 21]
 [ 5 20]]


# 19. Multinomial Naive Bayes

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(data.drop('Igm', axis=1))
y = data['Igm']
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)
y_pred_mnb = mnb_classifier.predict(X_test)
classification_report_mnb = classification_report(y_test, y_pred_mnb)
confusion_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
print(classification_report_mnb)
print(confusion_matrix_mnb)

              precision    recall  f1-score   support

           0       0.68      0.37      0.48        35
           1       0.46      0.76      0.58        25

    accuracy                           0.53        60
   macro avg       0.57      0.57      0.53        60
weighted avg       0.59      0.53      0.52        60

[[13 22]
 [ 6 19]]


# 20. Bernoulli Naive Bayes

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

y = data['Igm']
X = data.drop('Igm', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bnb_classifier = BernoulliNB()
bnb_classifier.fit(X_train, y_train)
y_pred_bnb = bnb_classifier.predict(X_test)
classification_report_bnb = classification_report(y_test, y_pred_bnb)
confusion_matrix_bnb = confusion_matrix(y_test, y_pred_bnb)

print(classification_report_bnb)
print(confusion_matrix_bnb)

              precision    recall  f1-score   support

           0       0.61      0.31      0.42        35
           1       0.43      0.72      0.54        25

    accuracy                           0.48        60
   macro avg       0.52      0.52      0.48        60
weighted avg       0.54      0.48      0.47        60

[[11 24]
 [ 7 18]]


# 21. Categorical Naive Bayes

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = encoder.fit_transform(data[categorical_cols])

y = data['Igm']
X = data.drop('Igm', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


cnb_classifier = CategoricalNB()
cnb_classifier.fit(X_train, y_train)
y_pred_cnb = cnb_classifier.predict(X_test)

classification_report_cnb = classification_report(y_test, y_pred_cnb)
confusion_matrix_cnb = confusion_matrix(y_test, y_pred_cnb)

print(classification_report_cnb)
print(confusion_matrix_cnb)

              precision    recall  f1-score   support

           0       0.50      0.26      0.34        35
           1       0.38      0.64      0.48        25

    accuracy                           0.42        60
   macro avg       0.44      0.45      0.41        60
weighted avg       0.45      0.42      0.40        60

[[ 9 26]
 [ 9 16]]


# 22. Stochastic Gradient Descent (SGD) Classifier

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler


y = data['Igm']
X = data.drop('Igm', axis=1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
sgd_classifier = SGDClassifier(max_iter=10, random_state=4200)
sgd_classifier.fit(X_train, y_train)


y_pred_sgd = sgd_classifier.predict(X_test)
classification_report_sgd = classification_report(y_test, y_pred_sgd)
confusion_matrix_sgd = confusion_matrix(y_test, y_pred_sgd)
print(classification_report_sgd)
print(confusion_matrix_sgd)

              precision    recall  f1-score   support

           0       0.62      0.66      0.64        35
           1       0.48      0.44      0.46        25

    accuracy                           0.57        60
   macro avg       0.55      0.55      0.55        60
weighted avg       0.56      0.57      0.56        60

[[23 12]
 [14 11]]




# 23. IsolationForest

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

y = data['Igm']
X = data.drop('Igm', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train)

y_pred_iso = iso_forest.predict(X_test)
y_pred_iso = (y_pred_iso > 0.5).astype(int)

classification_report_iso = classification_report(y_test, y_pred_iso)
confusion_matrix_iso = confusion_matrix(y_test, y_pred_iso)

print(classification_report_iso)
print(confusion_matrix_iso)


              precision    recall  f1-score   support

           0       0.58      1.00      0.74        35
           1       0.00      0.00      0.00        25

    accuracy                           0.58        60
   macro avg       0.29      0.50      0.37        60
weighted avg       0.34      0.58      0.43        60

[[35  0]
 [25  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 24. One-Class SVM

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
oc_svm = OneClassSVM(gamma='auto')
oc_svm.fit(X_train[y_train == 1])
y_pred_ocsvm = oc_svm.predict(X_test)
y_pred_ocsvm = (y_pred_ocsvm > 0).astype(int)
classification_report_ocsvm = classification_report(y_test, y_pred_ocsvm)
confusion_matrix_ocsvm = confusion_matrix(y_test, y_pred_ocsvm)
print(classification_report_ocsvm)
print(confusion_matrix_ocsvm)


              precision    recall  f1-score   support

           0       0.67      0.83      0.74        35
           1       0.65      0.44      0.52        25

    accuracy                           0.67        60
   macro avg       0.66      0.63      0.63        60
weighted avg       0.66      0.67      0.65        60

[[29  6]
 [14 11]]


# 25. Principal Component Regression (PCR)

In [31]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

y = data['Igm']
X = data.drop('Igm', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_components = 2 

pcr = make_pipeline(StandardScaler(), PCA(n_components=n_components), LinearRegression())
cv_scores = cross_val_score(pcr, X_train, y_train, cv=5)
cv_scores_mean = cv_scores.mean()
cv_scores_std = cv_scores.std()

print(f'Cross-Validation Mean: {cv_scores_mean}, Standard Deviation: {cv_scores_std}')

Cross-Validation Mean: -0.016918495376177532, Standard Deviation: 0.007474817410499675


In [32]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

y = data['Igm']
X = data.drop('Igm', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_components = 24
pcr_logistic = make_pipeline(StandardScaler(), PCA(n_components=n_components), LogisticRegression())
y_pred_pcr = cross_val_predict(pcr_logistic, X_train, y_train, cv=5)
classification_report_pcr = classification_report(y_train, y_pred_pcr)
confusion_matrix_pcr = confusion_matrix(y_train, y_pred_pcr)
print(classification_report_pcr)
print(confusion_matrix_pcr)

              precision    recall  f1-score   support

           0       0.50      0.46      0.48       114
           1       0.54      0.58      0.56       126

    accuracy                           0.53       240
   macro avg       0.52      0.52      0.52       240
weighted avg       0.52      0.53      0.52       240

[[53 61]
 [53 73]]


# 26. Canonical Correlation Analysis (CCA)

In [33]:
import pandas as pd
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

midpoint = X.shape[1] // 2
X1 = X.iloc[:, :midpoint]
X2 = X.iloc[:, midpoint:]

n_components = 12  

cca = CCA(n_components=n_components)
cca.fit(X1, X2)
X1_c, X2_c = cca.transform(X1, X2)
X_combined = pd.concat([pd.DataFrame(X1_c), pd.DataFrame(X2_c)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(report)
print(conf_matrix)


              precision    recall  f1-score   support

           0       0.67      0.51      0.58        35
           1       0.48      0.64      0.55        25

    accuracy                           0.57        60
   macro avg       0.58      0.58      0.57        60
weighted avg       0.59      0.57      0.57        60

[[18 17]
 [ 9 16]]


# 27. Radial Basis Function (RBF) Kernel

In [34]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svc_rbf = SVC(kernel='rbf', random_state=4200)
svc_rbf.fit(X_train_scaled, y_train)
y_pred_svc_rbf = svc_rbf.predict(X_test_scaled)
classification_report_rbf = classification_report(y_test, y_pred_svc_rbf)
confusion_matrix_rbf = confusion_matrix(y_test, y_pred_svc_rbf)
print(classification_report_rbf)
print(confusion_matrix_rbf)

              precision    recall  f1-score   support

           0       0.62      0.37      0.46        35
           1       0.44      0.68      0.53        25

    accuracy                           0.50        60
   macro avg       0.53      0.53      0.50        60
weighted avg       0.54      0.50      0.49        60

[[13 22]
 [ 8 17]]


# 28. Particle Swarm Optimization (PSO)

In [35]:
!pip install pyswarm


Collecting pyswarm
  Downloading pyswarm-0.6.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: pyswarm
  Building wheel for pyswarm (setup.py) ... [?25l- \ done
[?25h  Created wheel for pyswarm: filename=pyswarm-0.6-py3-none-any.whl size=4464 sha256=4deec6d1551ae3b73cd042fd30e4063a227b6d65bb6ffae78037520f73447207
  Stored in directory: /root/.cache/pip/wheels/71/67/40/62fa158f497f942277cbab8199b05cb61c571ab324e67ad0d6
Successfully built pyswarm
Installing collected packages: pyswarm
Successfully installed pyswarm-0.6


In [36]:
import numpy as np
from pyswarm import pso
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def svm_objective_function(hyperparameters):
    C, gamma = hyperparameters
    model = SVC(C=C, gamma=gamma)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return -accuracy_score(y_test, predictions)  # Negative accuracy, because PSO minimizes the function

# Define the bounds for C and gamma (this may vary and needs tuning)
lb = [0.001, 0.0001]
ub = [1000, 1]


optimal_hyperparameters, _ = pso(svm_objective_function, lb, ub, swarmsize=50, maxiter=100)

optimal_C, optimal_gamma = optimal_hyperparameters
optimal_model = SVC(C=optimal_C, gamma=optimal_gamma)
optimal_model.fit(X_train, y_train)
optimal_predictions = optimal_model.predict(X_test)


from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, optimal_predictions))
print(confusion_matrix(y_test, optimal_predictions))

Stopping search: maximum iterations reached --> 100
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.42      1.00      0.59        25

    accuracy                           0.42        60
   macro avg       0.21      0.50      0.29        60
weighted avg       0.17      0.42      0.25        60

[[ 0 35]
 [ 0 25]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
