In [1]:
# Base Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Analysis Libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

# Machine Learning Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# İgnore Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
patients = pd.read_table('symbipredict_2022.csv', sep=',')
patients.head(5)

Unnamed: 0,Itching,SkinRash,NodalSkinEruptions,ContinuousSneezing,Shivering,Chills,JointPain,StomachPain,Acidity,UlcersOnTongue,...,Blackheads,Scurring,SkinPeeling,SilverLikeDusting,SmallDentsInNails,InflammatoryNails,Blister,RedSoreAroundNose,YellowCrustOoze,Prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [3]:
X = patients.iloc[:,0:130].values
y = patients.iloc[:,131].values

print('Type of x: ', type(X))
print('Type of y: ', type(y))

print("Shape of x:", X.shape)
print("Shape of y:", y.shape)

Type of x:  <class 'numpy.ndarray'>
Type of y:  <class 'numpy.ndarray'>
Shape of x: (4961, 130)
Shape of y: (4961,)


In [4]:
unique_values = np.unique(y)

# Using len to count unique values
total_unique_values = len(unique_values)
print("Total Number of Unique Values:", total_unique_values)

# Alternatively, using .size for the same purpose
total_unique_values = unique_values.size
print("Total Number of Unique Values:", total_unique_values)


Total Number of Unique Values: 41
Total Number of Unique Values: 41


In [5]:
# Split the data into training and hold-out sets with stratified sampling
X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.3, random_state=1)

# Further split the hold-out set into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_hold, y_hold, test_size=0.5, random_state=1)

# Print the shapes of the training, validation, and test feature arrays to verify correct split
print(f'{"Training Features Shape: ":<30} {X_train.shape}')
print(f'{"Validation Features Shape: ":<30} {X_valid.shape}')
print(f'{"Test Features Shape: ":<30} {X_test.shape}')

Training Features Shape:       (3472, 130)
Validation Features Shape:     (744, 130)
Test Features Shape:           (745, 130)


In [6]:
# Initialize and train a Logistic Regression model
lr_mod = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='multinomial', penalty=None)
lr_mod.fit(X_train, y_train)

# Print the training and validation accuracy of the Logistic Regression model
print(f'{"Training Accuracy: ":<30} {round(lr_mod.score(X_train, y_train), 4)}')
print(f'{"Validation Accuracy: ":<30} {round(lr_mod.score(X_valid, y_valid), 4)}')

Training Accuracy:             1.0
Validation Accuracy:           1.0


In [7]:
# Initialize lists to store training and validation accuracy for Decision Trees
dt_train_acc = []
dt_valid_acc = []
depth_range = range(2,30)

# Loop through the depth range and train a Decision Tree for each depth
for d in depth_range:
    temp_tree = DecisionTreeClassifier(max_depth=d, random_state=1)
    temp_tree.fit(X_train, y_train)
    dt_train_acc.append(temp_tree.score(X_train, y_train))
    dt_valid_acc.append(temp_tree.score(X_valid, y_valid))

# Find the optimal max_depth based on validation accuracy
dt_idx = np.argmax(dt_valid_acc)
dt_opt_depth = depth_range[dt_idx]

# Print the optimal max_depth and corresponding training and validation accuracy
print(f'{"Optimal value for max_depth: ":<40} {dt_opt_depth}')
print(f'{"Training Accuracy for Optimal Model: ":<40} {round(dt_train_acc[dt_idx], 4)}')
print(f'{"Validation Accuracy for Optimal Model: ":<40} {round(dt_valid_acc[dt_idx], 4)}')

Optimal value for max_depth:             29
Training Accuracy for Optimal Model:     0.8378
Validation Accuracy for Optimal Model:   0.8024


In [8]:
# Initialize lists to store training and validation accuracy for Random Forests
rf_train_acc = []
rf_valid_acc = []

# Loop through the depth range and train a Random Forest for each depth
for d in depth_range:
    temp_forest = RandomForestClassifier(n_estimators=100, max_depth=d, random_state=1)
    temp_forest.fit(X_train, y_train)
    rf_train_acc.append(temp_forest.score(X_train, y_train))
    rf_valid_acc.append(temp_forest.score(X_valid, y_valid))

# Find the optimal max_depth based on validation accuracy for Random Forests
rf_idx = np.argmax(rf_valid_acc)
rf_opt_depth = depth_range[rf_idx]

# Print the optimal max_depth and corresponding training and validation accuracy for Random Forests
print(f'{"Optimal value for max_depth: ":<40} {rf_opt_depth}')
print(f'{"Training Accuracy for Optimal Model: ":<40} {round(rf_train_acc[rf_idx], 4)}')
print(f'{"Validation Accuracy for Optimal Model: ":<40} {round(rf_valid_acc[rf_idx], 4)}')

Optimal value for max_depth:             9
Training Accuracy for Optimal Model:     1.0
Validation Accuracy for Optimal Model:   1.0


In [9]:
rf = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=1)
rf.fit(X_train, y_train)

In [10]:
x0 = [[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]

pred_0_m1 = rf.predict(x0)
print(pred_0_m1)

prob_0_m1 = rf.predict_proba(x0)
print(np.round(prob_0_m1, 4))

['Psoriasis']
[[0.022  0.0256 0.0109 0.0235 0.0188 0.0217 0.0215 0.0119 0.0222 0.008
  0.0166 0.0101 0.0144 0.0201 0.0263 0.0193 0.0236 0.0217 0.02   0.0144
  0.0213 0.0227 0.0083 0.0215 0.0163 0.0143 0.0077 0.0166 0.0228 0.0239
  0.0159 0.0215 0.022  0.0175 0.0125 0.2787 0.0063 0.0203 0.0175 0.0182
  0.0216]]


Prediction is correct. 

In [12]:
print(f'{"Testing Accuracy for Final Model: ":<40} {round(rf.score(X_test, y_test), 4)}')

Testing Accuracy for Final Model:        1.0


In [78]:
symptoms = ["Itching", "SkinRash", "NodalSkinEruptions", "ContinuousSneezing", "Shivering", "Chills", "JointPain", "StomachPain", "Acidity", "UlcersOnTongue", "MuscleWasting", "Vomiting",
            "BurningMicturition", "SpottingUrination", "Fatigue", "WeightGain", "Anxiety", "ColdHandsAndFeet", "MoodSwings", "WeightLoss", "Restlessness", "Lethargy", "PatchesInThroat",
            "IrregularSugarLevel", "Cough", "HighFever", "SunkenEyes", "Breathlessness", "Sweating", "Dehydration", "Indigestion", "Headache", "YellowishSkin", "DarkUrine", "Nausea",
            "LossOfAppetite", "PainBehindTheEyes", "BackPain", "Constipation", "AbdominalPain", "Diarrhea", "MildFever", "YellowUrine", "YellowingOfEyes", "AcuteLiverFailure",
            "SwellingOfStomach", "SwelledLymphNodes", "Malaise", "BlurredAndDistortedVision", "Phlegm", "ThroatIrritation", "RednessOfEyes", "SinusPressure", "RunnyNose", "Congestion",
            "ChestPain", "WeaknessInLimbs", "FastHeartRate", "PainDuringBowelMovements", "PainInAnalRegion", "BloodyStool", "IrritationInAnus", "NeckPain", "Dizziness", "Cramps", "Bruising",
            "Obesity", "SwollenLegs", "SwollenBloodVessels", "PuffyFaceAndEyes", "EnlargedThyroid", "BrittleNails", "SwollenExtremeties", "ExcessiveHunger", "ExtraMaritalContacts",
            "DryingAndTinglingLips", "SlurredSpeech", "KneePain", "HipJointPain", "MuscleWeakness", "StiffNeck", "SwellingJoints", "MovementStiffness", "SpinningMovements", "LossOfBalance",
            "Unsteadiness", "WeaknessOfOneBodySide", "LossOfSmell", "BladderDiscomfort", "FoulSmellOfUrine", "ContinuousFeelOfUrine", "PassageOfGases", "InternalItching", "ToxicLook_Typhos",
            "Depression", "Irritability", "MusclePain", "AlteredSensorium", "RedSpotsOverBody", "BellyPain", "AbnormalMenstruation", "DischromicPatches", "WateringFromEyes",
            "IncreasedAppetite", "Polyuria", "FamilyHistory", "MucoidSputum", "RustySputum", "LackOfConcentration", "VisualDisturbances", "ReceivingBloodTransfusion",
            "ReceivingUnsterileInjections", "Coma", "StomachBleeding", "DistentionOfAbdomen", "HistoryOfAlcoholConsumption", "fluid_overload", "BloodInSputum", "ProminentVeinsOnCalf",
            "Palpitations", "PainfulWalking", "PusFilledPimples", "Blackheads", "Scurring", "SkinPeeling", "SilverLikeDusting", "SmallDentsInNails", "InflammatoryNails", "Blister",
            "RedSoreAroundNose", "YellowCrustOoze"]

for symptom in symptoms:
    print(f'{symptom}, ')


Itching, 
SkinRash, 
NodalSkinEruptions, 
ContinuousSneezing, 
Shivering, 
Chills, 
JointPain, 
StomachPain, 
Acidity, 
UlcersOnTongue, 
MuscleWasting, 
Vomiting, 
BurningMicturition, 
SpottingUrination, 
Fatigue, 
WeightGain, 
Anxiety, 
ColdHandsAndFeet, 
MoodSwings, 
WeightLoss, 
Restlessness, 
Lethargy, 
PatchesInThroat, 
IrregularSugarLevel, 
Cough, 
HighFever, 
SunkenEyes, 
Breathlessness, 
Sweating, 
Dehydration, 
Indigestion, 
Headache, 
YellowishSkin, 
DarkUrine, 
Nausea, 
LossOfAppetite, 
PainBehindTheEyes, 
BackPain, 
Constipation, 
AbdominalPain, 
Diarrhea, 
MildFever, 
YellowUrine, 
YellowingOfEyes, 
AcuteLiverFailure, 
SwellingOfStomach, 
SwelledLymphNodes, 
Malaise, 
BlurredAndDistortedVision, 
Phlegm, 
ThroatIrritation, 
RednessOfEyes, 
SinusPressure, 
RunnyNose, 
Congestion, 
ChestPain, 
WeaknessInLimbs, 
FastHeartRate, 
PainDuringBowelMovements, 
PainInAnalRegion, 
BloodyStool, 
IrritationInAnus, 
NeckPain, 
Dizziness, 
Cramps, 
Bruising, 
Obesity, 
SwollenLegs, 
Swoll