## Import Libraries

In [1]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

## Load the Dataset

In [2]:
# Location of the processed symptoms/disease CSV
DATA_URL = 'https://raw.githubusercontent.com/ktxdev/symp-check/main/backend/data/processed/symptoms_disease.csv'

# Read the CSV into a dataframe
df = pd.read_csv(DATA_URL)
# Preview the first rows
df.head()

Unnamed: 0,diseases,symptoms
0,panic disorder,"anxiety and nervousness ,shortness of breath ,..."
1,panic disorder,"shortness of breath ,depressive or psychotic s..."
2,panic disorder,"anxiety and nervousness ,depression ,shortness..."
3,panic disorder,"anxiety and nervousness ,depressive or psychot..."
4,panic disorder,"anxiety and nervousness ,depression ,insomnia ..."


## Split the data into training and testing sets

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train, test = train_test_split(df, random_state=42, test_size=0.2)

# Features are the symptom text, labels are the disease names
X_train, y_train = train['symptoms'], train['diseases']
X_test, y_test = test['symptoms'], test['diseases']

## Text Vectorization

In [4]:
# TF-IDF features; min_df=10 ignores terms found in fewer than 10 documents
tfidf_vectorizer = TfidfVectorizer(min_df=10)
# Learn the vocabulary and IDF weights from the training text only ...
X_train_bow = tfidf_vectorizer.fit_transform(X_train)
# ... then encode the held-out text with that same fitted vocabulary
X_test_bow = tfidf_vectorizer.transform(X_test)

# Show the (rows, features) shape of each matrix, one per line
print(X_train_bow.shape, X_test_bow.shape, sep='\n')

(395112, 340)
(98778, 340)


## Model Selection and Cross Validation
### Support Vector Machine

In [5]:
# Linear-kernel SVM fitted on the full training matrix
svm_model = SVC(kernel='linear').fit(X_train_bow, y_train)

# 5-fold cross-validation scores on the training data, using all CPU cores.
# cross_val_score fits clones of the estimator, so svm_model stays as fitted above.
svm_model_acc = cross_val_score(svm_model, X_train_bow, y_train, cv=5, n_jobs=-1)
svm_model_acc



array([0.86428002, 0.86462169, 0.86120321, 0.86363291, 0.86383539])

### Logistic Regression

In [6]:
# Logistic regression with default settings, fitted on the full training matrix
lg_model = LogisticRegression().fit(X_train_bow, y_train)

# 5-fold cross-validation scores on the training data, using all CPU cores.
# cross_val_score fits clones of the estimator, so lg_model stays as fitted above.
lg_model_acc = cross_val_score(lg_model, X_train_bow, y_train, cv=5, n_jobs=-1)
lg_model_acc



array([0.86234387, 0.86164787, 0.85943155, 0.86127914, 0.86165878])

### Decision Tree Classifier

In [7]:
# Decision trees permute candidate features randomly at each split, so an
# unseeded tree gives slightly different scores on every run. Seed it with the
# same value used for the train/test split so the results are reproducible.
dtc_model = DecisionTreeClassifier(random_state=42)
dtc_model.fit(X_train_bow, y_train)

# 5-fold cross-validation scores on the training data, using all CPU cores.
# cross_val_score fits clones of the estimator (which inherit the seed), so
# dtc_model stays as fitted above.
dtc_model_acc = cross_val_score(estimator=dtc_model, X=X_train_bow, y=y_train, cv=5, n_jobs=-1)
dtc_model_acc



array([0.83662984, 0.83680701, 0.83412214, 0.8357546 , 0.83476753])

### Multinomial Naive Bayes

In [8]:
# Multinomial naive Bayes with default settings, fitted on the full training matrix
mnb_model = MultinomialNB().fit(X_train_bow, y_train)

# 5-fold cross-validation scores on the training data, using all CPU cores.
# cross_val_score fits clones of the estimator, so mnb_model stays as fitted above.
mnb_model_acc = cross_val_score(mnb_model, X_train_bow, y_train, cv=5, n_jobs=-1)
mnb_model_acc



array([0.79803348, 0.79622388, 0.79699324, 0.79581635, 0.79634785])

## Evaluation

In [9]:
# Held-out accuracy of each fitted model on the vectorized test set.
# All four estimators accept the sparse TF-IDF matrix directly, so it is passed
# as is; densifying it with .toarray() for naive Bayes only costs memory.
print("Support Vector Machine Accuracy:\t", svm_model.score(X_test_bow, y_test))
print("Decision Tree Classifier Accuracy:\t", dtc_model.score(X_test_bow, y_test))
print("Multinomial Naive Bayes Accurracy:\t", mnb_model.score(X_test_bow, y_test))
print("Logistic Regression Accuracy:\t\t", lg_model.score(X_test_bow, y_test))

Support Vector Machine Accuracy:	 0.8644839944117111
Decision Tree Classifier Accuracy:	 0.841381684180688
Multinomial Naive Bayes Accurracy:	 0.8022029196784709
Logistic Regression Accuracy:		 0.863066674765636


## Tuning Hyperparameters

In [10]:
# Grid of regularisation strengths to try, all with the liblinear solver
params = {'C': [0.1, 1, 10], 'solver': ['liblinear']}

# GridSearchCV clones the estimator for every candidate and, with the default
# refit=True, retrains the best one on the full training set. Fitting the base
# estimator beforehand is therefore wasted work, so an unfitted instance is
# passed in. This also leaves the `lg_model` fitted in the Logistic Regression
# section untouched instead of rebinding it.
gscv = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=-1)
gscv.fit(X_train_bow, y_train)

print("Best Params:\t", gscv.best_params_)
# Test-set score of the refitted best estimator
print("Accurracy:\t\t", gscv.score(X_test_bow, y_test))



Best Params:	 {'C': 10, 'solver': 'liblinear'}
Accurracy:		 0.8639373139767964
