In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset
df = pd.read_csv("symptom-disease-train-dataset.csv")

# Step 2: Inspect and clean the data
print(df.head())

# Assume the columns are 'text' and 'label'. Rename if needed:
df.columns = ['text', 'label']  # Adjust this if your actual column names differ

# Step 3: Vectorize the text using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

# Intermittent Step: Transform the label into bins.

def binner(scores):
    output = []
    for score in scores:
        if score > 900:
            output.append(1)
        elif score > 650:
            output.append(2)
        elif score > 400:
            output.append(3)
        elif score > 200:
            output.append(4)
        else:
            output.append(5)
    return output

# Step 4: Define labels
df['label'] = binner(df['label'])
y = df['label']

# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.275, random_state=42)

# Step 6: Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 7: Predict on test set
y_pred = model.predict(X_test)

# Step 8: Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


                                                text  label
0  I have been having migraines and headaches. I ...    308
1  I have asthma and I get wheezing and breathing...     35
2  Signs and symptoms of primary ovarian insuffic...    798
3  cough,high_fever,breathlessness,family_history...    149
4  chills,vomiting,high_fever,sweating,headache,n...    596
Accuracy: 0.7903225806451613
Classification Report:
              precision    recall  f1-score   support

           1       0.80      0.67      0.73       172
           2       0.61      0.74      0.67       231
           3       0.91      0.85      0.88       549
           4       0.77      0.72      0.74       264
           5       0.76      0.86      0.81       334

    accuracy                           0.79      1550
   macro avg       0.77      0.77      0.77      1550
weighted avg       0.80      0.79      0.79      1550



In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset
df = pd.read_csv("symptom-disease-train-dataset.csv")

# Step 2: Inspect and clean the data
print(df.head())

# Assume the columns are 'text' and 'label'. Rename if needed:
df.columns = ['text', 'label']  # Adjust this if your actual column names differ

# Step 3: Vectorize the text using CountVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

# Intermittent Step: Transform the label into bins.

def binner(scores):
    output = []
    for score in scores:
        if score > 900:
            output.append(1)
        elif score > 650:
            output.append(2)
        elif score > 400:
            output.append(3)
        elif score > 200:
            output.append(4)
        else:
            output.append(5)
    return output

# Step 4: Define labels
df['label'] = binner(df['label'])
y = df['label']

# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.275, random_state=42)

# Step 6: Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 7: Predict on test set
y_pred = model.predict(X_test)

# Step 8: Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

                                                text  label
0  I have been having migraines and headaches. I ...    308
1  I have asthma and I get wheezing and breathing...     35
2  Signs and symptoms of primary ovarian insuffic...    798
3  cough,high_fever,breathlessness,family_history...    149
4  chills,vomiting,high_fever,sweating,headache,n...    596
Accuracy: 0.8038709677419354
Classification Report:
              precision    recall  f1-score   support

           1       0.97      0.62      0.76       172
           2       0.64      0.71      0.67       231
           3       0.85      0.93      0.89       549
           4       0.85      0.65      0.73       264
           5       0.77      0.89      0.83       334

    accuracy                           0.80      1550
   macro avg       0.82      0.76      0.78      1550
weighted avg       0.81      0.80      0.80      1550



In [13]:
sandwich = ["I am hungry and want sandwiches.","I am not hungry, I am sleepy."]
wich = ["hungry,want_sandwich", "not_hungry,sleepy"]

x1 = CountVectorizer().fit_transform(sandwich)
x2 = CountVectorizer().fit_transform(wich)
print(x1)

  (0, 0)	1
  (0, 2)	1
  (0, 1)	1
  (0, 6)	1
  (0, 4)	1
  (1, 0)	2
  (1, 2)	1
  (1, 3)	1
  (1, 5)	1
