In [4]:
import nltk
import sklearn

print("NLTK version:", nltk.__version__)
print("Scikit-learn version:", sklearn.__version__)

NLTK version: 3.9.1
Scikit-learn version: 1.5.1


In [5]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

print("Libraries are working fine!")

Libraries are working fine!


In [6]:
import json

# Load the dataset
with open("new_dataset.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Print sample data
print(json.dumps(data, indent=4))

{
    "intents": [
        {
            "tag": "fir_registration",
            "patterns": [
                "Where to register an FIR? (18)",
                "How can I file an FIR? (29)",
                "How can I file an FIR? (22)",
                "Where to register an FIR? (3)",
                "FIR kaise karein? (12)",
                "How can I file an FIR? (15)",
                "Where to register an FIR? (13)",
                "FIR kaise karein? (19)",
                "How can I file an FIR? (9)",
                "FIR kaise karein? (24)",
                "How can I file an FIR? (11)",
                "FIR kaise karein? (14)",
                "Where to register an FIR? (6)",
                "Where to register an FIR? (5)",
                "Where to register an FIR? (25)",
                "Where to register an FIR? (15)",
                "How can I file an FIR? (1)",
                "How can I file an FIR? (12)",
                "Where to register an FIR? (17)",
              

In [10]:
# Function to convert text to lowercase
def to_lowercase(text):
    return text.lower()

# Apply to dataset
for intent in data["intents"]:  # Using 'data' instead of 'intents'
    intent["patterns"] = [to_lowercase(pattern) for pattern in intent["patterns"]]

print("âœ… Step 2.1: Text converted to lowercase!")

âœ… Step 2.1: Text converted to lowercase!


In [12]:
# This code will remove the useless words from the sentence entered by the user like (is,the,of)

import nltk
from nltk.corpus import stopwords

# Download stopwords (only needed once)
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Function to remove stop words from text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]  # Remove stop words
    return " ".join(filtered_words)

# Apply stopword removal to all patterns in dataset
for intent in data["intents"]:
    intent["patterns"] = [remove_stopwords(pattern) for pattern in intent["patterns"]]

print("âœ… Step 2.2: Stop words removed successfully!")

âœ… Step 2.2: Stop words removed successfully!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
print(data["intents"][0]["patterns"])  # Check first intent's patterns

['register fir? (18)', 'file fir? (29)', 'file fir? (22)', 'register fir? (3)', 'fir kaise karein? (12)', 'file fir? (15)', 'register fir? (13)', 'fir kaise karein? (19)', 'file fir? (9)', 'fir kaise karein? (24)', 'file fir? (11)', 'fir kaise karein? (14)', 'register fir? (6)', 'register fir? (5)', 'register fir? (25)', 'register fir? (15)', 'file fir? (1)', 'file fir? (12)', 'register fir? (17)', 'file fir? (16)']


In [16]:
#TF-IDF vectorization converts words into numerical values based on how often they appear, but with a twistâ€”it also considers how unique a word is across all queries.
#ðŸ”¹ How It Works
#âœ… Term Frequency (TF) â†’ Counts how many times a word appears in a pattern.
#âœ… Inverse Document Frequency (IDF) â†’ Reduces the importance of words that appear too often across multiple queries.
#âœ… Final TF-IDF Score â†’ Words with higher scores are more relevant for identifying intent.


from sklearn.feature_extraction.text import TfidfVectorizer

# Extract patterns for vectorization
patterns = [pattern for intent in data["intents"] for pattern in intent["patterns"]]

# Apply TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(patterns)

print("âœ… Step 2.3: TF-IDF Vectorization applied successfully!")
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")  # Shows number of processed patterns & features

âœ… Step 2.3: TF-IDF Vectorization applied successfully!
Shape of TF-IDF matrix: (320, 97)


In [18]:
TfidfVectorizer(min_df=1) 

In [20]:
print(tfidf_vectorizer.get_feature_names_out())

['10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23'
 '24' '25' '26' '27' '28' '29' '30' '31' '32' '33' '34' '35' '36' '37'
 '38' '39' '40' 'agreement' 'apply' 'bail' 'banayein' 'case' 'category'
 'challan' 'check' 'complaint' 'contract' 'court' 'crime' 'cyber'
 'dekhein' 'dekhna' 'disputes' 'divorce' 'do' 'encroached' 'file' 'filing'
 'fine' 'fir' 'fraud' 'get' 'hai' 'hearing' 'immediately' 'income' 'india'
 'itr' 'jameen' 'ka' 'kab' 'kaise' 'karein' 'kiraye' 'kya' 'land' 'lein'
 'lene' 'make' 'marriage' 'online' 'payment' 'portal' 'procedure'
 'process' 'property' 'register' 'registration' 'rent' 'report' 'resolve'
 'return' 'shaadi' 'status' 'suljhaayein' 'synthetic' 'talaq' 'tax' 'tell'
 'topic' 'traffic' 'vivad' 'work']


# Model Training

In [107]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Create the NaÃ¯ve Bayes model with TF-IDF
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Extract patterns and their corresponding intent labels
patterns = [pattern for intent in data["intents"] for pattern in intent["patterns"]]
labels = [intent["tag"] for intent in data["intents"] for _ in intent["patterns"]]

# Train the model
model.fit(patterns, labels)

print("âœ… Step 3.2: NaÃ¯ve Bayes model trained successfully!")

âœ… Step 3.2: NaÃ¯ve Bayes model trained successfully!


In [109]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split data into training & testing sets (80% train, 20% test)
patterns_train, patterns_test, labels_train, labels_test = train_test_split(patterns, labels, test_size=0.2, random_state=42)

# Retrain model on training data
model.fit(patterns_train, labels_train)

# Predict on test data
predictions = model.predict(patterns_test)

# Evaluate accuracy
accuracy = accuracy_score(labels_test, predictions)
print(f"âœ… Model Accuracy: {accuracy * 100:.2f}%")  # Display accuracy percentage

# Detailed classification report
print("\nðŸ“Œ Classification Report:")
print(classification_report(labels_test, predictions))

âœ… Model Accuracy: 67.19%

ðŸ“Œ Classification Report:
                       precision    recall  f1-score   support

         bail_process       0.30      1.00      0.46         3
 court_hearing_status       1.00      1.00      1.00         5
          cyber_crime       0.09      1.00      0.17         1
    divorce_procedure       1.00      1.00      1.00         7
     fir_registration       1.00      1.00      1.00         5
    income_tax_return       1.00      1.00      1.00         6
marriage_registration       0.83      1.00      0.91         5
     property_dispute       1.00      1.00      1.00         4
     rental_agreement       0.67      1.00      0.80         4
 synthetic_category_1       0.00      0.00      0.00         1
synthetic_category_11       0.00      0.00      0.00         1
synthetic_category_14       0.00      0.00      0.00         1
synthetic_category_15       0.00      0.00      0.00         1
 synthetic_category_2       0.00      0.00      0.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [111]:
print(model.predict(["How can I file an FIR?"]))  # Example query

['fir_registration']


In [127]:
print(model.predict(["Can I register an FIR online?"])) 
print(model.predict(["can i get bail for a non-bailable offense"])) 
print(model.predict(["How do I register a lease contract?"])) 
print(model.predict(["Can I file a case for encroachment on my land?"])) 

['fir_registration']
['bail_process']
['marriage_registration']
['property_dispute']
