In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
import joblib

# Read the question/answer pairs from disk; the rest of the script uses the
# 'Question' and 'Answer' columns of this frame.
df = pd.read_csv(filepath_or_buffer='QandA.csv')

# Preprocess the text data
def preprocess_text(text):
    """Normalize a piece of text: lowercase it and strip punctuation.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is removed; whitespace itself is kept as-is.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas produces for
            an empty CSV cell) are treated as empty text. Any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized string (``''`` for missing input).
    """
    # Missing values would otherwise crash on ``.lower()``. NaN is the only
    # float that compares unequal to itself, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Lowercase first, then drop anything that is not a word char or space.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalize every question (lowercased, punctuation removed) up front so the
# label encoding and the TF-IDF vectorizer below both see the same text.
df['Question'] = df['Question'].apply(preprocess_text)

# Integer-encode the questions with a throwaway encoder. It is deliberately
# NOT the encoder that gets saved: sharing one instance across both columns
# would make the saved file depend on which column happened to be fit last.
df['Question_encoded'] = LabelEncoder().fit_transform(df['Question'])

# Encode the answers with their own encoder. This is the encoder that is
# persisted, so its classes_ are the distinct answer values and
# inverse_transform() maps an answer id back to the answer text.
label_encoder = LabelEncoder()
df['Answer'] = label_encoder.fit_transform(df['Answer'])

# Save the fitted answer encoder
joblib.dump(label_encoder, 'label_encoder.joblib')

# Build TF-IDF features from the questions: English stop words are removed,
# terms appearing in more than 90% of questions (max_df) or in fewer than
# two questions (min_df) are ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Question'])

# Save the fitted vectorizer so new text can be projected into the same
# feature space later.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Index the question vectors for nearest-neighbour lookup: brute-force
# search with cosine distance, returning the single closest question.
knn_model = NearestNeighbors(n_neighbors=1, metric='cosine', algorithm='brute')
knn_model.fit(tfidf_matrix)

# Save kNN model
joblib.dump(knn_model, 'knn_model.joblib')

print("Model retraining completed.")


Model retraining completed.
