In [2]:
!pip install pandas nltk gensim scikit-learn


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m86.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download stopwords and punkt tokenizer
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['Label', 'Message']

# Preprocessing
stop_words = set(stopwords.words('english'))

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['Tokens'] = df['Message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Label,Message,Tokens
0,ham,"Go until jurong point, crazy.. Available only ...","Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,U dun say early hor... U c already say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I think goes usf, lives around though"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,This 2nd time tried 2 contact u. U å£750 Pound...
5568,ham,Will Ì_ b going to esplanade fr home?,Will Ì_ b going esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...","Pity, * mood that. So...any suggestions?"
5570,ham,The guy did some bitching but I acted like i'd...,The guy bitching I acted like interested buyin...


In [15]:
import gensim.downloader as api

# Load the pre-trained Google News Word2Vec model (this downloads and caches it)
model = api.load("word2vec-google-news-300")



In [46]:
# Load Google News Word2Vec model (binary format)
def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]


def vectorize(tokens, model, dim=300):
    vectors = [model[word] for word in tokens if word in model]
    if len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

df['Vector'] = df['Tokens'].apply(lambda x: vectorize(x,model))
X = np.vstack(df['Vector'].values)
y = df['Label'].map({'ham': 0, 'spam': 1})


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.9686


In [48]:
def predict_message_class(model, w2v_model, message):
    tokens = preprocess(message)
    vec = vectorize(tokens, w2v_model).reshape(1, -1)
    prediction = model.predict(vec)[0]
    return "spam" if prediction == 1 else "ham"


In [51]:
example = "Exclusive offer! Act now to get a free iPhone 15. Limited stock, click here to register!"
print("Prediction:", predict_message_class(clf, model, example))


Prediction: spam
