In [70]:
!pip install gensim
import pandas as pd



In [71]:
messages=pd.read_csv('/SMSSpamCollection.txt',sep='\t',names=['label','message'])

In [72]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [73]:
messages.shape

(5572, 2)

In [74]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [75]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [77]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()

    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)


In [78]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
x=cv.fit_transform(corpus).toarray()


In [80]:
x.shape

(5572, 2500)

In [81]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y = y.astype(int)


In [82]:

y

array([0, 0, 1, ..., 0, 0, 0])

In [83]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [84]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model=MultinomialNB().fit(x_train,y_train)

In [85]:
y_pred = spam_detect_model.predict(x_test)


In [86]:
spam_detect_model.predict_proba(x_test)


array([[9.99999999e-01, 1.08062171e-09],
       [4.92978526e-03, 9.95070215e-01],
       [8.68297061e-01, 1.31702939e-01],
       ...,
       [9.60144335e-01, 3.98556654e-02],
       [4.53823719e-04, 9.99546176e-01],
       [8.68297061e-01, 1.31702939e-01]])

In [87]:
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_pred)
print(score)

0.9721973094170404


In [88]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.81      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [89]:
from sklearn.ensemble import RandomForestClassifier
spam_detect_model=RandomForestClassifier().fit(x_train,y_train)

In [90]:
spam_detect_model.predict_proba(x_test)

array([[0.99669006, 0.00330994],
       [0.02      , 0.98      ],
       [0.97703783, 0.02296217],
       ...,
       [0.95952928, 0.04047072],
       [0.        , 1.        ],
       [0.97703783, 0.02296217]])

In [91]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [92]:
corpus1=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
    review.lower()
    review.split()

    review={lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')}
    review=' '.join(review)
    corpus1.append(review)

In [93]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [95]:
nltk.download('punkt_tab')
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [96]:
words

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli',
  'over'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'r

In [97]:
import gensim

In [98]:
from gensim.models import Word2Vec
model=Word2Vec(words,window=5,min_count=2)

In [99]:
model.wv.index_to_key

['call',
 'go',
 'get',
 'ur',
 'gt',
 'lt',
 'come',
 'day',
 'ok',
 'free',
 'know',
 'love',
 'like',
 'time',
 'want',
 'good',
 'got',
 'text',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'take',
 'stop',
 'see',
 'home',
 'think',
 'repli',
 'lor',
 'sorri',
 'still',
 'tell',
 'mobil',
 'back',
 'da',
 'dont',
 'make',
 'phone',
 'pleas',
 'week',
 'hi',
 'say',
 'new',
 'work',
 'pl',
 'later',
 'miss',
 'hope',
 'ask',
 'co',
 'msg',
 'min',
 'meet',
 'messag',
 'dear',
 'night',
 'wait',
 'happi',
 'well',
 'give',
 'tri',
 'thing',
 'much',
 'great',
 'oh',
 'claim',
 'wat',
 'hey',
 'number',
 'friend',
 'thank',
 'ye',
 'way',
 'www',
 'prize',
 'let',
 'feel',
 'right',
 'even',
 'tomorrow',
 'win',
 'pick',
 'alreadi',
 'care',
 'cash',
 'said',
 'amp',
 'im',
 'leav',
 'yeah',
 'tone',
 'realli',
 'find',
 'babe',
 'life',
 'morn',
 'sleep',
 'last',
 'uk',
 'servic',
 'keep',
 'year',
 'sure',
 'nokia',
 'com',
 'would',
 'buy',
 'use',
 'anyth',
 'contact',
 'start',

In [100]:
model.corpus_count

5565

In [114]:
model.corpus_total_words

47789

In [102]:
model.epochs

5

In [103]:
model.wv.most_similar('prize')

[('claim', 0.9994415044784546),
 ('award', 0.9991815090179443),
 ('guarante', 0.9989039897918701),
 ('call', 0.998846173286438),
 ('line', 0.9988207221031189),
 ('cash', 0.9987624287605286),
 ('contact', 0.9987478256225586),
 ('mobil', 0.9987103343009949),
 ('urgent', 0.9986355900764465),
 ('draw', 0.9986283183097839)]

In [104]:
model.wv.similar_by_word('free')

[('repli', 0.9996502995491028),
 ('tone', 0.9996228814125061),
 ('mobil', 0.9996172189712524),
 ('txt', 0.9996074438095093),
 ('text', 0.9995826482772827),
 ('call', 0.9995545148849487),
 ('stop', 0.9995022416114807),
 ('week', 0.9994515776634216),
 ('www', 0.9994415640830994),
 ('nokia', 0.9994403123855591)]

In [105]:
model.wv['free'].shape

(100,)

In [106]:
import numpy as np

def avg_word2vec(doc):
    # Filter out words not in the model's vocabulary
    words_in_vocab = [word for word in doc if word in model.wv.index_to_key]
    if not words_in_vocab:
        # Return a zero vector if the document is empty or has no words in the vocabulary
        return np.zeros(model.wv.vector_size)
    return np.mean([model.wv[word] for word in words_in_vocab], axis=0)

In [107]:
!pip install tqdm



In [108]:
from tqdm import tqdm

In [109]:
words[73]

['perform']

In [118]:
x=[]
# Keep track of the original indices of the messages that are not empty after preprocessing
valid_indices = []
for i in tqdm(range(len(words))):
    # Check if the processed message (list of words) is not empty
    if words[i]:
        x.append(avg_word2vec(words[i]))
        valid_indices.append(i) # Store the original index

100%|██████████| 5565/5565 [00:00<00:00, 5729.37it/s]


In [119]:
X_new=np.array(x)

In [120]:
X_new[0]

array([-0.22624941,  0.22413309,  0.08032385,  0.03244784,  0.0573249 ,
       -0.44238797,  0.16802044,  0.56971568, -0.10083989, -0.14108534,
       -0.12990141, -0.32878911, -0.09282934,  0.09094641,  0.15360759,
       -0.2332107 ,  0.04383187, -0.29629865,  0.03650404, -0.41822219,
        0.133421  ,  0.15446581,  0.02646315, -0.14921613, -0.04228611,
        0.06409658, -0.23671395, -0.26451081, -0.16038923,  0.0889018 ,
        0.25466195,  0.09166324,  0.05736777, -0.11498964, -0.16761571,
        0.29145792,  0.0029602 , -0.23770134, -0.1303501 , -0.46442816,
        0.01495813, -0.23489164, -0.07513127,  0.05753924,  0.2369266 ,
       -0.13030843, -0.21614929, -0.13139659,  0.15686439,  0.20504807,
        0.13717133, -0.28419182, -0.07303312, -0.02663722, -0.11074754,
        0.1727249 ,  0.18524379,  0.03809047, -0.26272419,  0.05900725,
        0.03406711,  0.14863889, -0.02486886, -0.04918087, -0.37393343,
        0.147043  ,  0.10197983,  0.17845772, -0.25163314,  0.22

In [121]:
X_new[0].shape

(100,)

In [122]:
# Example: labels = ['spam', 'ham', 'spam', ...]
from sklearn.preprocessing import LabelEncoder

# Convert text labels to numeric: spam → 1, ham → 0
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(messages['label'])

In [123]:
from sklearn.model_selection import train_test_split

# Use the valid_indices to select the corresponding labels from the original y
y_new = y[valid_indices]

x_train, x_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=0)

In [124]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)


In [125]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.8597122302158273
              precision    recall  f1-score   support

         ham       0.86      1.00      0.92       956
        spam       0.00      0.00      0.00       156

    accuracy                           0.86      1112
   macro avg       0.43      0.50      0.46      1112
weighted avg       0.74      0.86      0.79      1112



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [128]:
new_msg = "vote for me "
new_tokens = new_msg.lower().split()  # or use your tokenizer
new_vec = avg_word2vec(new_tokens).reshape(1, -1)

prediction = clf.predict(new_vec)
print("Predicted label:", label_encoder.inverse_transform(prediction))


Predicted label: ['ham']
