## Import Libraries

In [2]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

## Load the Dataset

In [4]:
# Load the dataframe from csv
df = pd.read_csv('../data/processed/symptoms_disease.csv')
# Display dataframe
df.head()

Unnamed: 0,diseases,symptoms
0,panic disorder,"anxiety and nervousness ,shortness of breath ,..."
1,panic disorder,"shortness of breath ,depressive or psychotic s..."
2,panic disorder,"anxiety and nervousness ,depression ,shortness..."
3,panic disorder,"anxiety and nervousness ,depressive or psychot..."
4,panic disorder,"anxiety and nervousness ,depression ,insomnia ..."


## Split the data into a training and testing set

In [5]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Spliting the train and test data
X_train = train['symptoms']
X_test = test['symptoms']
y_train = train['diseases'] 
y_test = test['diseases']

## Text Vectorization

In [6]:
# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(min_df=10)
# Fit the training data
X_train_bow = tfidf_vectorizer.fit_transform(X_train)
# Transform the test data
X_test_bow = tfidf_vectorizer.transform(X_test)

print(X_train_bow.shape)
print(X_test_bow.shape)

(395112, 340)
(98778, 340)


## Model Selection and Cross Validation
### Support Vector Machine

In [27]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_bow, y_train)

svm_model_acc = cross_val_score(estimator=svm_model, X = X_train_bow, y = y_train, cv = 5, n_jobs = -1)
svm_model_acc



array([0.86428002, 0.86462169, 0.86120321, 0.86363291, 0.86383539])

### Random Forest Classifier 

In [7]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train_bow, y_train)

rfc_model_acc = cross_val_score(estimator=rfc_model, X = X_train_bow, y = y_train, cv = 5, n_jobs = -1)
rfc_model_acc



: 