In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from lazypredict.Supervised import LazyClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import joblib
# import shap

In [3]:
data = pd.read_csv('Training.csv')

In [4]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
data.iloc[:, :-1] = imputer.fit_transform(data.iloc[:, :-1])

In [5]:
data.describe()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,...,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0
mean,0.14,0.16,0.02,0.05,0.02,0.16,0.14,0.05,0.05,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
std,0.34,0.37,0.15,0.21,0.15,0.37,0.35,0.21,0.21,0.15,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
print(data.isnull())

      itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0       False      False                 False                False   
1       False      False                 False                False   
2       False      False                 False                False   
3       False      False                 False                False   
4       False      False                 False                False   
...       ...        ...                   ...                  ...   
4915    False      False                 False                False   
4916    False      False                 False                False   
4917    False      False                 False                False   
4918    False      False                 False                False   
4919    False      False                 False                False   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0         False   False       False         False    False         

In [7]:
# Encode the 'prognosis' column
label_encoder = LabelEncoder()
data['prognosis'] = label_encoder.fit_transform(data['prognosis'])

In [8]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [9]:
# # Feature selection
# k = 20  # Adjust the number of features as needed
# selector = SelectKBest(score_func=f_classif, k=k)
# X_new = selector.fit_transform(X, y)
# selected_features = selector.get_support(indices=True)
# feature_names = X.columns[selected_features]

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

In [11]:

# # Train and test models
# models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# # Display the results
# print(models)

In [12]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

In [14]:
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   0.0s
[CV] END ....................C=0.1, gamma=0.1, 

In [15]:
# Get the best parameters and train the model
best_params = grid_search.best_params_
best_svc_model = SVC(**best_params)
best_svc_model.fit(X_train, y_train)

In [16]:
# Make predictions and evaluate the model
y_pred_best = best_svc_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy with best parameters: {accuracy_best * 100:.2f}%")
print(classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))

Accuracy with best parameters: 100.00%
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.0

In [17]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))


Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [18]:
# Cross-validation
cv_scores = cross_val_score(best_svc_model, X, y, cv=10)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0


In [19]:
# # Explain the model's predictions using SHAP
# explainer = shap.KernelExplainer(best_svc_model.predict, X_train)
# shap_values = explainer.shap_values(X_test, nsamples=20)

# # Plot SHAP summary plot
# shap.summary_plot(shap_values, X_test, feature_names=feature_names)

In [20]:
# # Initialize and train the SVC model
# svc_model = SVC(kernel='linear', random_state=42)
# svc_model.fit(X_train, y_train)
# y_pred = svc_model.predict(X_test)
# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy * 100:.2f}%")
# # Display the classification report
# print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
# # Display the confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:")
# print(conf_matrix)

In [21]:

# Save the model to a file
joblib.dump(best_svc_model, open('svc_model.pkl','wb'))

# Load the model from the file (for future use)
loaded_model = joblib.load(open('svc_model.pkl','rb'))
y_pred_loaded = loaded_model.predict(X_test)
print(f"Accuracy with loaded model: {accuracy_score(y_test, y_pred_loaded) * 100:.2f}%")

Accuracy with loaded model: 100.00%


In [22]:
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [31]:
#test 1
print("Predicted Disease: ", loaded_model.predict(X_test.iloc[0].reshape(1,-1)))
print("Actual Disease: ",y_test[0])

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'