In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from train_model import NaiveBayesImplementation, tokenize

In [4]:
# Load the caption dataset into a DataFrame; the path is relative to the notebook's working directory.
# NOTE(review): the displayed frame shows an "Unnamed: 0" column, which suggests the CSV was
# saved together with its index at some point -- consider index_col=0 if that column is not wanted; confirm.
df = pd.read_csv("../Dataset/Caption Data/stray_animal_captions_dataset.csv")

In [3]:
# Collapse the two variant spellings of the label into the canonical
# 'Illegal Activities' category; every other value is left as it is.
df['Category'] = df['Category'].replace(
    ['Illegal', 'Illegal Activity'],
    'Illegal Activities',
)

In [5]:
# Write the DataFrame to a CSV in the current working directory, leaving out the index.
# NOTE(review): this is not the "../Dataset/Caption Data/" file that was read above, so the
# source CSV itself is not updated -- confirm that is intended.
df.to_csv('stray_animal_captions_dataset.csv', index=False)

In [6]:
# Preview the first rows (head() returns five rows by default).
df.head()

Unnamed: 0,Caption,Category
0,Wounded animal lying on the road at Itahari. V...,Help
1,Puppy crying in pain at Pokhara. Please help!,Help
2,Wounded animal lying on the road at Butwal. Vo...,Help
3,Abandoned kitten found in Itahari. Looking for...,Help
4,Injured stray dog spotted near Butwal. Immedia...,Help


In [7]:
# Caption texts and their category labels, each converted to a plain list of str.
X, Y = (df[column].astype(str).tolist() for column in ("Caption", "Category"))

In [8]:
# Run tokenize() over every caption, keeping the results in caption order.
X_tokens = list(map(tokenize, X))

In [9]:
# Hold out 20% of the tokenized captions (and their labels) for evaluation.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_tokens, Y, test_size=0.2, random_state=42)

In [10]:
# Instantiate the project's Naive Bayes classifier from train_model.
# alpha is presumably the additive (Laplace) smoothing strength -- confirm in train_model.
model = NaiveBayesImplementation(alpha=1.0)

In [11]:
# Train on the tokenized training captions and their category labels.
model.fit(X_train,y_train)

In [12]:
# Run the trained model on the held-out tokenized captions; the result is
# compared element-wise with y_test when accuracy is computed.
y_pred = model.predict(X_test)

In [13]:
# Fraction of held-out captions whose predicted label equals the true label.
accuracy = float(
    np.mean([predicted == actual for predicted, actual in zip(y_pred, y_test)])
)
accuracy

1.0

In [14]:
# Serialize before opening the file: opening with "w" truncates the file
# immediately, so a failure inside to_json() would otherwise leave an empty
# model file behind (and destroy any previously saved model).
model_json = model.to_json()
with open("naive_bayes_model.json", "w", encoding="utf-8") as f:
    f.write(model_json)

In [15]:
import json
import math
import re

In [16]:
# Read the model back as UTF-8, the encoding it was written with, so that
# loading does not depend on the platform's default text encoding.
with open("naive_bayes_model.json", "r", encoding="utf-8") as f:
    loaded_model = json.load(f)


In [17]:
# Pull the pieces predict() needs out of the loaded JSON. Going by the key
# names these are log-space priors and likelihoods plus the vocabulary --
# confirm against NaiveBayesImplementation.to_json().
class_priors, token_likelihoods, vocab = (
    loaded_model[field]
    for field in ("class_priors_log", "likelihoods_log", "vocab")
)
vocab_size = len(vocab)

In [18]:
# Words that tokenize() discards. Membership is all that matters, so the
# entries are grouped by rough word class purely for readability.
STOPWORDS = {
    # articles and demonstratives
    "a", "an", "the", "this", "that", "these", "those",
    # conjunctions
    "and", "or", "if", "but", "as", "than", "while", "so",
    # prepositions and particles
    "in", "on", "of", "for", "to", "from", "at", "by", "with",
    "about", "into", "over", "after", "before", "up", "down", "out",
    # forms of "be"
    "is", "are", "was", "were", "be", "been", "being",
    # auxiliaries and modals
    "do", "does", "did", "doing", "have", "has", "had", "having",
    "will", "would", "should", "could", "may", "might", "can", "cannot",
    # pronouns and possessives
    "it", "its", "we", "you", "your", "yours", "our", "ours",
    "they", "them", "their", "theirs", "he", "she", "his", "her",
    "i", "me", "my", "mine",
    # negations, adverbs and fillers
    "no", "not", "too", "very", "also", "then", "there", "here",
    "just", "like",
}

In [19]:
def tokenize(text):
    """Turn a caption string into a list of lowercase word tokens.

    Steps, in order: lowercase the text, blank out URLs (anything starting
    with ``http`` or ``www.``), blank out ``@mentions`` and ``#hashtags``,
    then collect runs of the letters a-z. Digits, punctuation and
    non-ASCII letters therefore never appear in a token.

    Args:
        text: Caption string to tokenize.

    Returns:
        The tokens in their original order, excluding entries found in
        ``STOPWORDS`` and single-character tokens.
    """
    # NOTE(review): this definition shadows the tokenize imported from
    # train_model at the top of the notebook; confirm the two agree, otherwise
    # prediction-time tokens can differ from the ones the model was trained on.
    cleaned = re.sub(r"http\S+|www\.\S+", " ", text.lower())
    cleaned = re.sub(r"[@#]\w+", " ", cleaned)

    kept = []
    for word in re.findall(r"[a-z]+", cleaned):
        if len(word) > 1 and word not in STOPWORDS:
            kept.append(word)
    return kept


In [20]:
def predict(text):
    """Return the class label with the highest Naive Bayes score for ``text``.

    The caption is tokenized with ``tokenize`` and scored against the
    notebook-level ``class_priors``, ``token_likelihoods`` and ``vocab``
    objects taken from the reloaded model JSON. Each class starts from its
    prior and adds one term per token, so the stored values are treated as
    additive (presumably log-probabilities, going by the JSON key names --
    confirm against train_model).

    Args:
        text: Caption string to classify.

    Returns:
        The key of ``class_priors`` whose accumulated score is largest; on a
        tie, the first such key in iteration order.

    Raises:
        ValueError: If ``class_priors`` is empty (raised by ``max``).
    """
    # The vocabulary lookup depends only on the token, never on the class, so
    # resolve it once here instead of once per class. None marks a token that
    # is not in the model's vocabulary.
    token_indices = [
        vocab[token] if token in vocab else None
        for token in tokenize(text)
    ]

    # Flat penalty for out-of-vocabulary tokens. Every class receives the same
    # penalty for the same tokens, so it lowers all scores together without
    # favouring any class.
    unseen_penalty = math.log(1e-6)

    scores = {}
    for cls in class_priors:
        score = class_priors[cls]
        for word_index in token_indices:
            if word_index is None:
                score += unseen_penalty
            else:
                score += token_likelihoods[cls][word_index]
        scores[cls] = score

    return max(scores, key=scores.get)

In [32]:
# Hand-written caption used to spot-check the reloaded model.
sample = "A boy fed poison to a dog"


In [33]:
# Classify the spot-check caption with the JSON-backed predict() and print the winning label.
print(predict(sample))

Illegal Activities


In [22]:
# Display the DataFrame; pandas abbreviates long frames in the rendered output.
df

Unnamed: 0,Caption,Category
0,Wounded animal lying on the road at Itahari. V...,Help
1,Puppy crying in pain at Pokhara. Please help!,Help
2,Wounded animal lying on the road at Butwal. Vo...,Help
3,Abandoned kitten found in Itahari. Looking for...,Help
4,Injured stray dog spotted near Butwal. Immedia...,Help
...,...,...
19995,Free rabies vaccination for street dogs in Pok...,Vaccination
19996,Urgent help needed for injured stray puppy #9996,Help
19997,Looking to adopt a 2-month-old vaccinated kitt...,Adoption
19998,Local butcher illegally killed a street dog. A...,Illegal Activities


In [23]:
from collections import Counter

# Count how many training samples carry each label, to spot class imbalance.
class_counts = Counter(y_train)
print("Class distribution in training data:")
for cls in class_counts:
    count = class_counts[cls]
    print(f"{cls}: {count} samples")

Class distribution in training data:
Illegal Activities: 4003 samples
Adoption: 3986 samples
Vaccination: 3989 samples
Help: 4022 samples
