In [1]:

import os
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical

# Fetch the NLTK resources used by this notebook's text preprocessing:
#   - 'punkt'     : tokenizer models read by nltk.word_tokenize on older NLTK releases
#   - 'punkt_tab' : the replacement format that newer NLTK releases load instead;
#                   without it word_tokenize raises LookupError on a fresh install
#   - 'wordnet'   : corpus backing WordNetLemmatizer
# nltk.download skips packages that are already up to date, so re-running is cheap.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Single shared lemmatizer instance, used by the preprocessing function below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KASHISH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KASHISH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Root folder that is searched recursively for CSV files.
DATA_DIR = "data"

# Collect the paths first, in sorted order. os.walk yields directory entries in
# whatever order the OS lists them, and the row order of `data` determines which
# rows the seeded train/test split selects, so an unsorted walk makes the split
# differ between machines.
csv_paths = []
for main, subfolders, filenames in os.walk(DATA_DIR):
    subfolders.sort()  # in-place sort makes os.walk descend in a stable order
    csv_paths.extend(
        os.path.join(main, file) for file in sorted(filenames) if file.endswith('.csv')
    )

# os.walk silently yields nothing for a missing folder; fail with a clear message
# instead of pd.concat's "No objects to concatenate".
if not csv_paths:
    raise FileNotFoundError(f"No .csv files found under {os.path.abspath(DATA_DIR)!r}")

# Read every file, then stack them into one frame with a fresh 0..n-1 index.
all_df = [pd.read_csv(path) for path in csv_paths]
data = pd.concat(all_df, ignore_index=True)

# The later cells index data['Question'] and data['topic']; check for them here
# so a schema problem surfaces next to the load step.
missing_columns = {'Question', 'topic'} - set(data.columns)
if missing_columns:
    raise KeyError(f"Loaded CSV data is missing required column(s): {sorted(missing_columns)}")

In [3]:
def preprocess(text):
    """Lower-case, tokenize and lemmatize a single question.

    Parameters
    ----------
    text : str or missing value
        Raw question text. ``None`` and float NaN (what pandas stores for a
        blank CSV cell) are treated as an empty question. Any other non-string
        value is converted with ``str()`` before tokenizing.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # Missing cells would otherwise fail on ``.lower()``. NaN is the only
    # float that compares unequal to itself, hence the ``text != text`` test.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    words = nltk.word_tokenize(str(text).lower())
    return ' '.join(lemmatizer.lemmatize(word) for word in words)
# Replace each raw question with its normalized form (element-wise over the column).
data['Question'] = data['Question'].map(preprocess)

In [4]:
# Targets: topic strings -> integer class ids -> one-hot rows for the softmax output.
label_encoder = LabelEncoder().fit(data['topic'])
y_int = label_encoder.transform(data['topic'])
y = to_categorical(y_int)

# Features: TF-IDF matrix learned from, and computed over, every preprocessed question.
# NOTE(review): the vectorizer is fit on all rows before the train/test split in the
# next cell, so its vocabulary/IDF weights also reflect the held-out questions.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Question'])

In [5]:
# Reproducibility: one call seeds the Python, NumPy and TensorFlow RNGs, so weight
# initialisation, dropout masks and batch shuffling repeat between runs (as far
# as the backend's ops are deterministic).
SEED = 42
set_random_seed(SEED)

# Hold out 20% of the rows; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Feed-forward classifier: TF-IDF vector -> 128 -> 64 -> one softmax unit per topic.
# The input width is declared with an explicit Input layer; passing `input_shape=`
# to the first Dense layer makes Keras emit a warning asking for this form instead.
model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax'),
])

# One-hot targets pair with categorical cross-entropy; accuracy is tracked per epoch.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
# NOTE(review): the held-out split doubles as validation_data here, so it is not
# an untouched test set when it is scored again afterwards.
model.fit(X_train, y_train, epochs=15, batch_size=16, validation_data=(X_test, y_test))

# Save the model together with the fitted vectorizer and label encoder, which are
# needed to turn new text into features and predictions back into topic names.
model.save('chatbot_model.h5')
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.7499 - loss: 0.7270 - val_accuracy: 0.8601 - val_loss: 0.3463
Epoch 2/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 18ms/step - accuracy: 0.8734 - loss: 0.3274 - val_accuracy: 0.8703 - val_loss: 0.2797
Epoch 3/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.8879 - loss: 0.2671 - val_accuracy: 0.8816 - val_loss: 0.2581
Epoch 4/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 17ms/step - accuracy: 0.8983 - loss: 0.2357 - val_accuracy: 0.8859 - val_loss: 0.2486
Epoch 5/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step - accuracy: 0.9017 - loss: 0.2163 - val_accuracy: 0.8889 - val_loss: 0.2365
Epoch 6/15
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 13ms/step - accuracy: 0.9022 - loss: 0.2064 - val_accuracy: 0.8834 - val_loss: 0.2321
Epoch 7



In [6]:
# Score the trained model on the held-out split. The model was compiled with a
# single 'accuracy' metric, so evaluate() yields the loss followed by accuracy.
# This is the same split that served as validation_data during training.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Model accuracy: {accuracy * 100:.2f}%")

Model accuracy: 88.22%
