In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from train_model import NaiveBayesImplementation, tokenize

In [2]:
# Load the raw caption dataset; the path is relative to the notebook's working directory.
# NOTE(review): the frame displayed further down has an "Unnamed: 0" column, which
# suggests the CSV was saved together with its index — consider index_col=0; confirm.
df = pd.read_csv("../Dataset/Caption Data/stray_animal_captions_dataset.csv")

In [14]:
# Keep only the first row for every distinct caption string (exact-match comparison),
# rebinding `df` to the filtered frame rather than mutating it in place.
df = df.drop_duplicates(subset=["Caption"])

In [3]:
# Fold the alternative spellings of the illegal-activity label into one canonical value.
category_aliases = dict.fromkeys(('Illegal', 'Illegal Activity'), 'Illegal Activities')
df['Category'] = df['Category'].replace(category_aliases)

In [5]:
# Write the current frame to a CSV in the working directory, omitting the index.
# Note this is a different location from the "../Dataset/Caption Data/" file read above.
df.to_csv('stray_animal_captions_dataset.csv', index=False)

In [6]:
# Preview the first five rows of the frame.
df.head(5)

Unnamed: 0,Caption,Category
0,Wounded animal lying on the road at Itahari. V...,Help
1,Puppy crying in pain at Pokhara. Please help!,Help
2,Wounded animal lying on the road at Butwal. Vo...,Help
3,Abandoned kitten found in Itahari. Looking for...,Help
4,Injured stray dog spotted near Butwal. Immedia...,Help


In [15]:
# Inputs are the caption texts and targets the category labels, both as plain
# Python lists of strings (astype(str) also stringifies any non-string cell).
caption_series = df["Caption"].astype(str)
category_series = df["Category"].astype(str)
X, Y = caption_series.tolist(), category_series.tolist()

In [16]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified by class — consider stratify=Y if the
# class proportions should be preserved in both partitions; confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
# Run every caption of both partitions through tokenize(); order is preserved,
# so position i still lines up with y_train[i] / y_test[i].
X_train_tokenized = list(map(tokenize, X_train))
X_test_tokenized = list(map(tokenize, X_test))


In [18]:
# Instantiate the project's Naive Bayes classifier (imported from train_model).
# NOTE(review): alpha=1.0 is presumably the additive-smoothing constant — confirm in train_model.
model2 = NaiveBayesImplementation(alpha=1.0)

In [19]:
# Train on the tokenized training captions and their category labels.
model2.fit(X_train_tokenized,y_train)

In [20]:
# Predict a category for every tokenized caption in the held-out test partition.
y_pred = model2.predict(X_test_tokenized)

In [21]:
def _fraction_correct(predicted, expected):
    """Return the share of positions at which the prediction equals the true label."""
    return np.mean([p == t for p, t in zip(predicted, expected)])


# Score the held-out partition first, then re-predict the training partition so the
# two accuracies can be compared side by side.
test_accuracy = _fraction_correct(y_pred, y_test)
y_train_pred = model2.predict(X_train_tokenized)
train_accuracy = _fraction_correct(y_train_pred, y_train)
print(f"Test accuracy: {test_accuracy:.3f}")
print(f"Training accuracy: {train_accuracy:.3f}")

Test accuracy: 0.999
Training accuracy: 0.999


In [22]:
from sklearn.metrics import confusion_matrix, classification_report

In [23]:
# Evaluate per-class performance on the held-out partition.
#
# The class order is pinned explicitly via `labels=` in both calls. Without it,
# scikit-learn orders rows by the sorted labels it observes and applies
# `target_names` positionally, so the names would be attached to the wrong rows
# whenever model2.classes_ is not in sorted order, and the call would fail if a
# class happened to be absent from the test split.
class_labels = list(model2.classes_)
class_to_idx = {cls: i for i, cls in enumerate(class_labels)}
y_test_idx = [class_to_idx[cls] for cls in y_test]
y_pred_idx = [class_to_idx[cls] for cls in y_pred]
print("Confusion Matrix:")
# Rows are true classes, columns predicted classes, both in class_labels order;
# passing the full index range keeps the matrix square even if a class is missing.
print(confusion_matrix(y_test_idx, y_pred_idx, labels=list(range(len(class_labels)))))
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_labels))

Confusion Matrix:
[[684   0   0   0]
 [  2 483   0   0]
 [  0   0 511   0]
 [  0   0   0 514]]
Classification Report:
                    precision    recall  f1-score   support

          Adoption       1.00      1.00      1.00       684
              Help       1.00      1.00      1.00       485
Illegal Activities       1.00      1.00      1.00       511
       Vaccination       1.00      1.00      1.00       514

          accuracy                           1.00      2194
         macro avg       1.00      1.00      1.00      2194
      weighted avg       1.00      1.00      1.00      2194



In [24]:
# Sanity checks for leakage between the training and test partitions.
print("=== DATA INTEGRITY CHECK ===")
print(f"Raw training samples: {len(X_train)}")
print(f"Raw test samples: {len(X_test)}")
print(f"Tokenized training samples: {len(X_train_tokenized)}")
print(f"Tokenized test samples: {len(X_test_tokenized)}")

# Compare the partitions on their tokenized form: captions that differ only in
# characters the tokenizer discards collapse to the same key. Because these are
# sets, the count below is the number of DISTINCT shared texts, not of rows.
train_texts = {' '.join(tokens) for tokens in X_train_tokenized}
test_texts = {' '.join(tokens) for tokens in X_test_tokenized}
overlap = train_texts & test_texts
print(f"Overlapping samples between train and test: {len(overlap)}")  # Should be 0
if overlap:
    # Make a failed check visible instead of relying on the reader to spot the number.
    print("WARNING: train and test share tokenized captions; the test accuracy is optimistic.")

# Eyeball a few predictions next to their true labels. Slicing (rather than
# indexing 0..4) keeps this working when fewer than five test samples exist.
print("First 5 test predictions:")
for true_label, predicted_label in zip(y_test[:5], y_pred[:5]):
    print(f"True: {true_label}, Pred: {predicted_label}")

=== DATA INTEGRITY CHECK ===
Raw training samples: 8775
Raw test samples: 2194
Tokenized training samples: 8775
Tokenized test samples: 2194
Overlapping samples between train and test: 4
First 5 test predictions:
True: Adoption, Pred: Adoption
True: Illegal Activities, Pred: Illegal Activities
True: Vaccination, Pred: Vaccination
True: Adoption, Pred: Adoption
True: Adoption, Pred: Adoption


In [25]:
# Check for near-duplicates (similar texts)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize the captions with TF-IDF and compute all pairwise cosine similarities.
# Captions are cast to str the same way the model inputs (X) are built.
# NOTE: cosine_sim is a dense n x n array, so memory grows quadratically with len(df).
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Caption'].astype(str))
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Find highly similar pairs (cosine similarity > 0.9).
# Only columns to the right of the diagonal are inspected so each unordered pair is
# reported once and self-similarity is skipped. The comparison is vectorized per row
# instead of visiting every (i, j) cell from Python; pairs come out in the same
# (i ascending, j ascending) order as a nested loop would produce.
highly_similar = []
for i, row in enumerate(cosine_sim):
    for offset in np.flatnonzero(row[i + 1:] > 0.9):
        j = i + 1 + int(offset)
        highly_similar.append((i, j, row[j]))

print(f"Highly similar caption pairs (cosine > 0.9): {len(highly_similar)}")

# Show some examples. Rows are looked up positionally (iloc) because the TF-IDF
# matrix rows follow the frame's row order, not its index labels.
for i, j, sim in highly_similar[:5]:
    print(f"\nSimilarity: {sim:.3f}")
    print(f"Caption 1: {df.iloc[i]['Caption']}")
    print(f"Caption 2: {df.iloc[j]['Caption']}")
    print(f"Category 1: {df.iloc[i]['Category']}")
    print(f"Category 2: {df.iloc[j]['Category']}")

Highly similar caption pairs (cosine > 0.9): 283

Similarity: 0.930
Caption 1: Wounded animal lying on the road at Itahari. Volunteers needed urgently!
Caption 2: Wounded animal lying on the road at Butwal. Volunteers needed urgently!
Category 1: Help
Category 2: Help

Similarity: 0.930
Caption 1: Wounded animal lying on the road at Itahari. Volunteers needed urgently!
Caption 2: Wounded animal lying on the road at Lalitpur. Volunteers needed urgently!
Category 1: Help
Category 2: Help

Similarity: 0.929
Caption 1: Wounded animal lying on the road at Itahari. Volunteers needed urgently!
Caption 2: Wounded animal lying on the road at Biratnagar. Volunteers needed urgently!
Category 1: Help
Category 2: Help

Similarity: 0.957
Caption 1: Wounded animal lying on the road at Itahari. Volunteers needed urgently!
Caption 2: Wounded animal lying on the road at Pokhara. Volunteers needed urgently!
Category 1: Help
Category 2: Help

Similarity: 0.930
Caption 1: Wounded animal lying on the road a

In [14]:
# Persist the fitted classifier as JSON so it can be reloaded without retraining.
# The classifier trained in this notebook is bound to `model2`; no `model` name is
# defined anywhere in the notebook, so referencing it fails on a fresh kernel.
with open("naive_bayes_model.json", "w", encoding="utf-8") as f:
    f.write(model2.to_json())

In [15]:
import json
import math
import re

In [16]:
# Read the serialized model back. The file is written as UTF-8, so it is read with
# the same explicit encoding instead of the platform default.
with open("naive_bayes_model.json", "r", encoding="utf-8") as f:
    loaded_model = json.load(f)


In [17]:
# Pull the three parts of the deserialized model that predict() reads:
# per-class prior scores, per-class token scores, and the token -> index vocabulary.
# NOTE(review): the key names suggest log-space values — confirm against to_json().
class_priors, token_likelihoods, vocab = (
    loaded_model[key]
    for key in ("class_priors_log", "likelihoods_log", "vocab")
)
vocab_size = len(vocab)

In [18]:
# Lower-case function words that tokenize() drops before classification.
STOPWORDS = set(
    """
    a an the and or if in on of for to from is are was were
    be been being it its this that these those as at by with but
    about into over after before while so no not too very can cannot
    we you your yours our ours they them their theirs he she his her
    i me my mine do does did doing have has had having will would
    should could may might also than then there here up down out
    just like
    """.split()
)

In [19]:
def tokenize(text):
    """Split a caption into lower-case alphabetic tokens.

    URLs (``http...`` / ``www....``) and ``@mention`` / ``#hashtag`` words are
    blanked out first; the remaining text is reduced to runs of the letters
    a-z. Tokens that are in STOPWORDS or only one character long are dropped.

    NOTE(review): this definition rebinds the ``tokenize`` name imported from
    ``train_model`` at the top of the notebook — confirm both implementations
    agree, since the model was fitted with whichever one was bound at that time.

    Args:
        text: The caption string to tokenize.

    Returns:
        The list of surviving tokens, in their original order.
    """
    cleaned = re.sub(r"http\S+|www\.\S+", " ", text.lower())
    cleaned = re.sub(r"[@#]\w+", " ", cleaned)
    kept = []
    for word in re.findall(r"[a-z]+", cleaned):
        if len(word) > 1 and word not in STOPWORDS:
            kept.append(word)
    return kept


In [20]:
def predict(text):
    """Return the class with the highest accumulated score for a caption.

    The caption is tokenized, and for every class in ``class_priors`` the score
    starts at that class's prior and has one term added per token: the class's
    entry in ``token_likelihoods`` at the token's ``vocab`` index when the token
    is known, otherwise a fixed ``log(1e-6)`` penalty.

    Args:
        text: The caption string to classify.

    Returns:
        The key of ``class_priors`` whose total score is largest.
    """
    # The same penalty is applied for every out-of-vocabulary token in every class.
    unseen_penalty = math.log(1e-6)
    words = tokenize(text)

    totals = {}
    for label, prior in class_priors.items():
        running = prior
        for word in words:
            if word not in vocab:
                running += unseen_penalty
                continue
            running += token_likelihoods[label][vocab[word]]
        totals[label] = running

    return max(totals, key=totals.get)

In [32]:
# Hand-written caption used as a smoke test for predict().
sample = "A boy fed poison to a dog"


In [33]:
# Classify the sample caption with the reloaded model parameters and show the label.
print(predict(sample))

Illegal Activities


In [22]:
# Display the frame; the rendered output elides the middle rows with "...".
# NOTE(review): the recorded output shows index labels up to 19998, far more than the
# 8775 + 2194 samples used for the split — it may come from a different kernel
# session or from before de-duplication; confirm, and prefer df.head()/df.sample().
df

Unnamed: 0,Caption,Category
0,Wounded animal lying on the road at Itahari. V...,Help
1,Puppy crying in pain at Pokhara. Please help!,Help
2,Wounded animal lying on the road at Butwal. Vo...,Help
3,Abandoned kitten found in Itahari. Looking for...,Help
4,Injured stray dog spotted near Butwal. Immedia...,Help
...,...,...
19995,Free rabies vaccination for street dogs in Pok...,Vaccination
19996,Urgent help needed for injured stray puppy #9996,Help
19997,Looking to adopt a 2-month-old vaccinated kitt...,Adoption
19998,Local butcher illegally killed a street dog. A...,Illegal Activities


In [3]:
from collections import Counter

# Tally how many training samples carry each category label to spot class imbalance.
# Requires y_train from the train/test split cell to exist in the kernel.
class_counts = Counter(y_train)
print("Class distribution in training data:")
for label, n_samples in class_counts.items():
    print(f"{label}: {n_samples} samples")

NameError: name 'y_train' is not defined