In [5]:
import pandas as pd
import re
import os
import warnings
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Mute FutureWarning and UserWarning messages for the rest of the session so
# they do not clutter the cell outputs.
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=UserWarning)

# === Load data ===
# Labelled training tweets and the unlabelled competition test tweets.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
train_df = pd.read_csv(train_csv_path)
kaggle_test_df = pd.read_csv(test_csv_path)

print("\nBelow is the training data:\n")
# Bare last expression: the notebook renders the frame as the cell output.
train_df


Below is the training data:



Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
# === Preprocessing ===
def clean_text(text):
    """Normalize raw text into lowercase alphanumeric tokens separated by single spaces.

    Steps, in order: drop @mentions, drop URLs beginning with "http", drop
    HTML-style tags, delete every character that is not an ASCII letter,
    digit or whitespace, collapse whitespace runs to one space, then strip
    and lowercase. Punctuation is deleted rather than replaced by a space,
    so "e-bike" becomes "ebike".

    Args:
        text: The raw text. A missing value (None or a float NaN) is treated
            as empty text; any other non-string value is converted with str().

    Returns:
        The cleaned string, or "" when the input is missing or nothing
        survives cleaning.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r"@\w+", "", text)     # Remove @mentions
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"<.*?>", "", text)    # Remove HTML tags
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove special chars
    text = re.sub(r"\s+", " ", text)     # Remove extra whitespace
    return text.strip().lower()

def _build_model_text(df):
    """Fill missing keywords and rebuild df["text"] as the cleaned "keyword text" string.

    The untouched text is stashed in a "text_raw" column the first time this
    runs, and "text" is always rebuilt from that copy. Without the stash,
    re-running this cell would prepend the keyword to text that already
    contains it. Mutates df in place and returns it.
    """
    if "text_raw" not in df.columns:
        df["text_raw"] = df["text"]
    df["keyword"] = df["keyword"].fillna("")
    # A missing tweet body would turn the whole concatenation into NaN,
    # so treat it as empty text instead.
    combined = df["keyword"] + " " + df["text_raw"].fillna("")
    df["text"] = combined.apply(clean_text)
    return df


# Combine keyword and text, fill missing keywords
# NOTE(review): keyword values may be URL-encoded (e.g. "%20" for a space);
# clean_text would drop the "%" and leave "20" glued into the token.
# Confirm against the data before relying on keyword tokens.
train_df = _build_model_text(train_df)
kaggle_test_df = _build_model_text(kaggle_test_df)

# === Train-test split ===
# Hold out 10% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(train_df["text"],
                                                  train_df["target"],
                                                  test_size=0.1,
                                                  random_state=42)

In [7]:
# === TF-IDF + Naive Bayes Pipeline ===
# The vectorizer is fitted on the training split only; the validation split
# is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# fit() returns the estimator itself, so construction and training chain.
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# === Evaluation ===
val_preds = nb_model.predict(X_val_vec)
val_report = classification_report(y_val, val_preds)
print("\nValidation Results:\n")
print(val_report)


Validation Results:

              precision    recall  f1-score   support

           0       0.76      0.89      0.82       426
           1       0.83      0.65      0.73       336

    accuracy                           0.79       762
   macro avg       0.80      0.77      0.78       762
weighted avg       0.79      0.79      0.78       762



In [8]:
# === Predict on Kaggle test data ===
# Reuse the already-fitted vectorizer and model on the test text.
X_test_kaggle = vectorizer.transform(kaggle_test_df["text"])
test_preds = nb_model.predict(X_test_kaggle)

# Two-column frame: the test ids alongside the predicted target.
submission = kaggle_test_df[["id"]].assign(target=test_preds)
submission.to_csv("submission.csv", index=False)

# Persist the model together with its vectorizer so both can be reloaded later.
os.makedirs("naive_bayes_model", exist_ok=True)
for _artifact, _artifact_path in (
    (nb_model, "naive_bayes_model/model.pkl"),
    (vectorizer, "naive_bayes_model/vectorizer.pkl"),
):
    joblib.dump(_artifact, _artifact_path)
print("✅ Saved model and vectorizer to naive_bayes_model/")

✅ Saved model and vectorizer to naive_bayes_model/
