In [1]:
import pandas as pd
dataset = pd.read_csv(
    'sms+spam+collection\SMSSpamCollection', 
    sep='\t',
    names=["label", "message"]
    )
dataset

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
dataset['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [2]:
messages = dataset['message']

In [2]:
import string
import nltk

In [3]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [5]:
ss = SnowballStemmer('english')


In [4]:
stop_words_EN = set(stopwords.words('english'))

In [6]:
corpus = []
for i in range(len(messages)):
    messages[i] = messages[i].lower().translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(messages[i])
    words = [ss.stem(word) for word in words if word not in stop_words_EN]
    messages[i] = ' '.join(words)
    corpus.append(messages[i])

In [20]:
messages[100]

'pleas dont text anymor noth els say'

In [26]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkts 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say ear hor u c alreadi say',
 'nah dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chgs send £150 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea £900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc appli repli hl 4 info',
 'urg

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [36]:
y=pd.get_dummies(dataset['label'])
y

Unnamed: 0,ham,spam
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
5567,False,True
5568,True,False
5569,True,False
5570,True,False


In [37]:
#Getting Spam values
y=y.iloc[:,1].values
y

array([False, False,  True, ..., False, False, False])

# BOW Model Implementation

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [31]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [32]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       953
        True       0.95      0.94      0.94       162

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



# TF-IDF Model implementation

In [34]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [39]:
spam_detect_model_tf_idf = MultinomialNB().fit(X_train, y_train)

ValueError: setting an array element with a sequence.

In [40]:
#TF-IDF prediction
y_pred_tf_idf = spam_detect_model_tf_idf.predict(X_test)

In [41]:
print(classification_report(y_pred_tf_idf,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       973
        True       0.88      0.99      0.93       142

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Word2vec Implementation

## Preprocessing W2Vec

In [10]:
%pip install gensim



In [5]:
import gensim   

In [6]:
from gensim.models import Word2Vec, KeyedVectors

In [13]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [7]:
messages = dataset['message']

In [11]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words_EN = set(stopwords.words('english'))

def preprocess_message(message):
    # Convert to lowercase and remove punctuation
    message = message.lower().translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    words = word_tokenize(message)
    
    # Lemmatize and remove stop words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words_EN]
    
    return words

# Lists to store preprocessed messages and labels
preprocessed_messages = []
labels = []

# Loop through the DataFrame to preprocess each message
for idx, row in dataset.iterrows():
    message = row['message']
    label = row['label']
    
    # Preprocess the message
    tokens = preprocess_message(message)
    
    # If there are valid tokens, keep the row, else remove it
    if tokens:
        preprocessed_messages.append(tokens)  # Store as list of tokens
        labels.append(label)
    else:
        print(f"Removed row {idx} due to invalid tokens.")
        print(f"Original message: {message}")
        print(f"Tokens: {tokens}")
        print("---")

# Create a new cleaned DataFrame
cleaned_df = pd.DataFrame({
    'label': labels,
    'tokens': preprocessed_messages
})

# Display the cleaned dataset
print(f"Number of valid rows: {len(cleaned_df)}")
print(cleaned_df.head())

# Optional: Print some statistics
print(f"\nOriginal dataset size: {len(dataset)}")
print(f"Cleaned dataset size: {len(cleaned_df)}")
print(f"Removed {len(dataset) - len(cleaned_df)} rows")

# Create corpus_w2v (list of preprocessed messages)
corpus_w2v = preprocessed_messages

# Display some examples from corpus_w2v
print("\nFirst 5 preprocessed messages in corpus_w2v:")
for i, tokens in enumerate(corpus_w2v[:5], 1):
    print(f"{i}. {tokens}")

# # Optional: Save preprocessed data
# cleaned_df.to_csv('cleaned_dataset.csv', index=False)
# print("\nCleaned dataset saved to 'cleaned_dataset.csv'")

# # If you need to save corpus_w2v for later use
# import pickle
# with open('corpus_w2v.pkl', 'wb') as f:
#     pickle.dump(corpus_w2v, f)
# print("corpus_w2v saved to 'corpus_w2v.pkl'")

Removed row 960 due to invalid tokens.
Original message: Where @
Tokens: []
---
Removed row 2807 due to invalid tokens.
Original message: Can a not?
Tokens: []
---
Removed row 3376 due to invalid tokens.
Original message: :) 
Tokens: []
---
Removed row 4575 due to invalid tokens.
Original message: :( but your not here....
Tokens: []
---
Removed row 4824 due to invalid tokens.
Original message: :-) :-)
Tokens: []
---
Number of valid rows: 5567
  label                                             tokens
0   ham  [go, jurong, point, crazy, available, bugis, n...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, 2, wkly, comp, win, fa, cup, fin...
3   ham      [u, dun, say, early, hor, u, c, already, say]
4   ham  [nah, dont, think, go, usf, life, around, though]

Original dataset size: 5572
Cleaned dataset size: 5567
Removed 5 rows

First 5 preprocessed messages in corpus_w2v:
1. ['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world',

In [17]:
cleaned_df['tokens'].loc[100]

['please', 'dont', 'text', 'anymore', 'nothing', 'else', 'say']

In [18]:
words_in_sentences = cleaned_df['tokens'].tolist()
words_in_sentences

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  '08452810075over18s'],
 ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'],
 ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  '£150',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
 

In [67]:
cleaned_df['tokens'].isnull().sum()

0

## Generating model: declaration

In [68]:
### Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words_in_sentences,window=5,min_count=1)
model.wv.index_to_key

['u',
 'call',
 '2',
 'im',
 'get',
 'ur',
 'go',
 '4',
 'dont',
 'ok',
 'ltgt',
 'free',
 'know',
 'got',
 'come',
 'like',
 'day',
 'ill',
 'good',
 'time',
 'want',
 'text',
 'love',
 'send',
 'need',
 'one',
 'going',
 'ü',
 'today',
 'r',
 'txt',
 'home',
 'lor',
 'see',
 'sorry',
 'stop',
 'still',
 'back',
 'mobile',
 'think',
 'n',
 'reply',
 'take',
 'tell',
 'phone',
 'new',
 'later',
 'well',
 'week',
 'hi',
 'da',
 'please',
 'make',
 'cant',
 'night',
 'say',
 'claim',
 'thing',
 'much',
 'oh',
 'great',
 'hey',
 'dear',
 'pls',
 'give',
 'number',
 'happy',
 'work',
 'na',
 'friend',
 'hope',
 'message',
 'way',
 'wat',
 'thats',
 'msg',
 'prize',
 'right',
 'wan',
 'c',
 'let',
 'already',
 'tomorrow',
 'ask',
 'said',
 'yes',
 'yeah',
 'really',
 'min',
 'amp',
 'e',
 '1',
 'co',
 'babe',
 'life',
 'miss',
 'meet',
 'didnt',
 'last',
 'morning',
 'win',
 'service',
 'would',
 'year',
 'anything',
 'cash',
 'thanks',
 'find',
 'ive',
 'feel',
 'tone',
 'lol',
 'every',
 

In [69]:
model.corpus_count

5567

In [46]:
model.epochs

5

In [70]:
model.wv.similar_by_word('winner')

[('shit', 0.988513171672821),
 ('prize', 0.9882519245147705),
 ('ive', 0.9882087707519531),
 ('stop', 0.9881839156150818),
 ('18', 0.9881641268730164),
 ('service', 0.9881330728530884),
 ('draw', 0.9880269765853882),
 ('lot', 0.9880094528198242),
 ('16', 0.9880028367042542),
 ('he', 0.9879880547523499)]

In [71]:
model.wv['winner'].shape

(100,)

In [21]:
import numpy as np
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)


In [20]:
from tqdm import tqdm

In [72]:
words_in_sentences[73]

['kkwhere', 'youhow', 'performed']

In [73]:
len(words_in_sentences)

5567

In [37]:
type(model.wv.index_to_key)

list

In [74]:
#apply for the entire sentences
X_w2v=[]
for i in tqdm(range(len(words_in_sentences))):
    X_w2v.append(avg_word2vec(words_in_sentences[i]))

100%|██████████| 5567/5567 [00:01<00:00, 5158.92it/s]


In [25]:
type(X_w2v)

list

In [75]:
X_w2v

[array([-0.102405  ,  0.20007972,  0.1172621 , -0.00936747, -0.03596563,
        -0.34107357,  0.04697861,  0.41388494, -0.14857686, -0.11947347,
        -0.10070763, -0.29351994, -0.05134381,  0.09977987,  0.11731727,
        -0.19812982, -0.00404938, -0.31056342,  0.06020798, -0.42392564,
         0.12385891,  0.06806765,  0.14141329, -0.0705805 , -0.04729001,
         0.11617251, -0.1930397 , -0.09660539, -0.20360549, -0.00854713,
         0.25552925,  0.04623343,  0.03187591, -0.15384543, -0.07487012,
         0.237853  ,  0.03889425, -0.17171334, -0.15609479, -0.3929305 ,
        -0.05359719, -0.2065856 , -0.03314725, -0.05784303,  0.08921127,
        -0.11800025, -0.17495646, -0.05577526,  0.08741011,  0.14301889,
         0.0705376 , -0.14816168, -0.05502738, -0.12542945, -0.16601017,
         0.08035033,  0.16124149,  0.04745151, -0.2687184 ,  0.07528266,
         0.08395925,  0.10554524, -0.02656582, -0.035848  , -0.27919084,
         0.15155064,  0.0700734 ,  0.22145683, -0.2

In [76]:
array_w2v = np.array(X_w2v, dtype=np.float32)
array_w2v

array([[-0.102405  ,  0.20007972,  0.1172621 , ..., -0.17663291,
         0.0547845 , -0.058989  ],
       [-0.11343634,  0.21284007,  0.12740032, ..., -0.18564253,
         0.06415263, -0.0631922 ],
       [-0.08587184,  0.17193122,  0.10327996, ..., -0.14683658,
         0.04569665, -0.04953408],
       ...,
       [-0.00353921,  0.01452213,  0.01111312, ..., -0.01674189,
         0.00416349, -0.00363929],
       [-0.12492196,  0.24555239,  0.14746349, ..., -0.21050446,
         0.06789251, -0.07041014],
       [-0.07324173,  0.13606514,  0.08932451, ..., -0.11491849,
         0.04253566, -0.03414932]], dtype=float32)

In [64]:
help(np.array)

Help on built-in function array in module numpy:

array(...)
    array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
          like=None)
    
    Create an array.
    
    Parameters
    ----------
    object : array_like
        An array, any object exposing the array interface, an object whose
        ``__array__`` method returns an array, or any (nested) sequence.
        If object is a scalar, a 0-dimensional array containing object is
        returned.
    dtype : data-type, optional
        The desired data-type for the array. If not given, NumPy will try to use
        a default ``dtype`` that can represent the values (by applying promotion
        rules when necessary.)
    copy : bool, optional
        If true (default), then the object is copied.  Otherwise, a copy will
        only be made if ``__array__`` returns a copy, if obj is a nested
        sequence, or if a copy is needed to satisfy any of the other
        requirements (``dtype``, ``order``, e

In [27]:
array_w2v.shape

(5567,)

## Getting labels

In [28]:
results_from_df = pd.get_dummies(cleaned_df['label'])
results_from_df

Unnamed: 0,ham,spam
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
5562,False,True
5563,True,False
5564,True,False
5565,True,False


In [35]:
spam_values = results_from_df.iloc[:,1]
spam_values

0       False
1       False
2        True
3       False
4       False
        ...  
5562     True
5563    False
5564    False
5565    False
5566    False
Name: spam, Length: 5567, dtype: bool

In [34]:
spam_array= np.array(spam_values)
spam_array

array([False, False,  True, ..., False, False, False])

## Prediction

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(array_w2v, spam_array, test_size = 0.20, random_state = 42)

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Random Forest classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
random_forest_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = random_forest_classifier.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9452423698384201
              precision    recall  f1-score   support

       False       0.94      1.00      0.97       951
        True       0.96      0.65      0.78       163

    accuracy                           0.95      1114
   macro avg       0.95      0.82      0.87      1114
weighted avg       0.95      0.95      0.94      1114

