In [1]:
import json
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Ensure NLTK data packages are available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Load the JSON data with error handling
try:
    with open('am_ill.json') as file:
        data = json.load(file)
except FileNotFoundError:
    print("Error: JSON file not found.")
    exit()
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
    exit()

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Prepare training data
X_train = []
y_train = []

# Process illnesses and symptoms with data validation
for illness in data['illnesses']:
    if not isinstance(illness['symptoms'], list) or not isinstance(illness['illness'], str):
        print(f"Warning: Invalid data format for illness '{illness['illness']}'")
        continue
    for symptom in illness['symptoms']:
        # Tokenize and lemmatize the symptom text
        words = word_tokenize(symptom)
        lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words]
        X_train.append(" ".join(lemmatized_words))
        y_train.append(illness['illness'])

# Check for empty training data before vectorization
if not X_train:
    print("Error: No training data for vectorization.")
else:
    # Encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_train_encoded = to_categorical(y_train_encoded)  # Convert to one-hot encoding

    # Vectorize the symptoms using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train).toarray()

    # Define a more complex neural network model
    model = Sequential()
    model.add(Dense(128, input_shape=(X_train_vectorized.shape[1],), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train_vectorized, y_train_encoded, epochs=20, batch_size=8, validation_split=0.2)

    # Save the model using the recommended.h5 file extension
    model.save('chatbot_model.h5')
    print('Model training complete and saved as "chatbot_model.h5"')

[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.0554 - loss: 3.0009 - val_accuracy: 0.0000e+00 - val_loss: 3.0686
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0704 - loss: 2.9567 - val_accuracy: 0.0000e+00 - val_loss: 3.1189
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0191 - loss: 2.9663 - val_accuracy: 0.0000e+00 - val_loss: 3.1707
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1770 - loss: 2.8834 - val_accuracy: 0.0000e+00 - val_loss: 3.2271
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1002 - loss: 2.8886 - val_accuracy: 0.0000e+00 - val_loss: 3.2895
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1615 - loss: 2.8412 - val_accuracy: 0.0000e+00 - val_loss: 3.3685
Epoch 7/20




Model training complete and saved as "chatbot_model.h5"
