In [5]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the two NLTK packages this notebook requests, in the same order.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Read the SMS CSV, keep only columns v1/v2, and give them readable names.
df = (
    pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
    [['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# English stop-word list from NLTK, stored as a set; preprocess() tests
# each token for membership in it.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn a raw message into a list of cleaned, lowercase tokens.

    The text is lowercased and split with ``word_tokenize``. A token is kept
    only if it is purely alphabetic and is not in ``stop_words``.

    Args:
        text: Message string to clean.

    Returns:
        List of kept tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip anything containing digits, punctuation, or other non-letters.
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Load pretrained word vectors from a binary word2vec-format file.
# NOTE(review): the file name suggests 300-dimensional GoogleNews vectors — confirm.
w2v = KeyedVectors.load_word2vec_format("/kaggle/input/gnewsvector/GoogleNews-vectors-negative300.bin", binary=True)

def vectorize(text):
    """Embed a message as the mean of its in-vocabulary word vectors.

    The text is tokenized with ``preprocess``; tokens that are not present in
    the module-level ``w2v`` model are skipped.

    Args:
        text: Raw message string.

    Returns:
        1-D NumPy array of length ``w2v.vector_size``: the element-wise mean
        of the word vectors found, or all zeros when no token is in the
        vocabulary.
    """
    vectors = [w2v[token] for token in preprocess(text) if token in w2v]
    if not vectors:
        # Take the fallback length from the model instead of a literal 300,
        # so every row has the same width whatever embedding file is loaded.
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)

# Feature matrix: one vectorize() result per message, in DataFrame row order.
X = np.array(list(map(vectorize, df['Message'])))

# Target vector: 'ham' becomes 0 and 'spam' becomes 1.
y = (
    df['Label']
    .map({'ham': 0, 'spam': 1})
    .values
)

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so clf is the trained model.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.9426008968609866


In [11]:
def predict_message_class(model, w2v_model, message):
    """Classify a single message as ``'spam'`` or ``'ham'``.

    The message is tokenized with ``preprocess`` and embedded as the mean of
    the word vectors found in ``w2v_model``. That embedding is passed to
    ``model.predict`` as a one-row batch.

    Args:
        model: Fitted classifier exposing ``predict``; a truthy prediction
            is reported as spam.
        w2v_model: Word-vector model supporting ``in``, item lookup, and a
            ``vector_size`` attribute.
        message: Raw message string.

    Returns:
        ``'spam'`` if the predicted label is truthy, otherwise ``'ham'``.
    """
    vectors = [w2v_model[token] for token in preprocess(message) if token in w2v_model]
    if vectors:
        vec = np.mean(vectors, axis=0)
    else:
        # No known tokens: fall back to a zero vector whose length comes from
        # the supplied model rather than a hard-coded 300.
        vec = np.zeros(w2v_model.vector_size)
    # predict() expects a 2-D batch, so reshape to a single row.
    predicted_label = model.predict(vec.reshape(1, -1))[0]
    return 'spam' if predicted_label else 'ham'