# Spam-Ham Classification Project

In [70]:
import pandas as pd

In [71]:
df = pd.read_csv('spam.csv',encoding="ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [72]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df.rename(columns={"v1": "label", "v2": "message"}, inplace=True)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [73]:
# The re module in Python stands for Regular Expressions and is used for pattern matching and text processing.
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lalra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer('english')

1. **Removing special characters & numbers** using `re.sub(r"[^a-zA-Z\s]", "", df['message'][i])`.  
2. **Converting text to lowercase** with `.lower()`.  
3. **Splitting text into words** using `.split()`.  
4. **Applying stemming** (`port_stem(word)`) to reduce words to their root form.  
5. **Removing stopwords** using `stopwords.words('english')`.  
6. **Joining words back into a sentence** using `' '.join(review)`.  

In [75]:
corpus = []
for i in range(0, len(df)):
    review = re.sub(r"[^a-zA-Z\s]", "", df['message'][i])
    review = review.lower()
    review = review.split()
    review = [ss.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [76]:
# Our depndent feature is label - spam/ham
Y = pd.get_dummies(df['label']).astype(int)

In [77]:
Y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [78]:
Y = Y.iloc[:,0].values
Y

array([1, 1, 0, ..., 1, 1, 1], shape=(5572,))

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(corpus,Y, test_size=0.20)

In [80]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
# Initialize CountVectorizer with a vocabulary size limit of 100 unique words
# binary=True means each word presence is represented as 0 or 1 (instead of actual word counts)
cv = CountVectorizer(max_features=100, binary=True)

In [81]:
# Fit and transform the 'corpus' into a Bag of Words (BoW) representation
# X_train is Independent features
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [82]:
# 100 words with frquency
cv.vocabulary_

{'im': np.int64(33),
 'last': np.int64(35),
 'night': np.int64(54),
 'today': np.int64(84),
 'day': np.int64(12),
 'love': np.int64(41),
 'msg': np.int64(50),
 'repli': np.int64(65),
 'stop': np.int64(75),
 'would': np.int64(97),
 'didnt': np.int64(14),
 'know': np.int64(34),
 'need': np.int64(52),
 'got': np.int64(25),
 'lor': np.int64(40),
 'one': np.int64(58),
 'time': np.int64(83),
 'get': np.int64(21),
 'back': np.int64(3),
 'miss': np.int64(47),
 'alreadi': np.int64(0),
 'ask': np.int64(1),
 'hi': np.int64(29),
 'babe': np.int64(2),
 'like': np.int64(39),
 'sorri': np.int64(73),
 'realli': np.int64(64),
 'see': np.int64(69),
 'tomorrow': np.int64(85),
 'call': np.int64(4),
 'ill': np.int64(32),
 'text': np.int64(78),
 'think': np.int64(82),
 'work': np.int64(96),
 'pleas': np.int64(61),
 'come': np.int64(9),
 'said': np.int64(67),
 'ok': np.int64(57),
 'home': np.int64(30),
 'sleep': np.int64(72),
 'good': np.int64(24),
 'pls': np.int64(62),
 'send': np.int64(70),
 'da': np.int64

In [83]:
from sklearn.naive_bayes import MultinomialNB

In [84]:
spam_detect_model = MultinomialNB().fit(X_train, Y_train)

In [85]:
spam_detect_model

In [86]:
y_pred = spam_detect_model.predict(X_test)

In [87]:
from sklearn.metrics import accuracy_score, classification_report

In [88]:
accuracy_score(Y_test, y_pred)

0.9542600896860987

In [89]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.74      0.81       150
           1       0.96      0.99      0.97       965

    accuracy                           0.95      1115
   macro avg       0.93      0.86      0.89      1115
weighted avg       0.95      0.95      0.95      1115



## Spam Ham using - TF-IDF Model

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_v = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(corpus,Y, test_size=0.20)

In [99]:
X_train = tf_v.fit_transform(X_train)
X_test = tf_v.transform(X_test)

In [100]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model = MultinomialNB().fit(X_train, Y_train)

In [101]:
tf_v.vocabulary_

{'pick': np.int64(1584),
 'ur': np.int64(2234),
 'fone': np.int64(724),
 'sorri': np.int64(1953),
 'msg': np.int64(1394),
 'yar': np.int64(2451),
 'lor': np.int64(1225),
 'poor': np.int64(1634),
 'thing': np.int64(2107),
 'one': np.int64(1504),
 'night': np.int64(1449),
 'tmr': np.int64(2141),
 'ull': np.int64(2220),
 'brand': np.int64(215),
 'new': np.int64(1436),
 'room': np.int64(1775),
 'sleep': np.int64(1914),
 'yar lor': np.int64(2452),
 'let': np.int64(1176),
 'send': np.int64(1839),
 'free': np.int64(735),
 'messag': np.int64(1319),
 'see': np.int64(1827),
 'send free': np.int64(1842),
 'send messag': np.int64(1844),
 'think': np.int64(2108),
 'chennai': np.int64(343),
 'well': np.int64(2361),
 'settl': np.int64(1864),
 'sure': np.int64(2041),
 'understand': np.int64(2223),
 'wine': np.int64(2392),
 'good': np.int64(830),
 'idea': np.int64(995),
 'yeah': np.int64(2454),
 'wouldnt': np.int64(2428),
 'leav': np.int64(1163),
 'hour': np.int64(973),
 'least': np.int64(1162),
 'how'

In [102]:
y_pred = spam_tfidf_model.predict(X_test)

In [103]:
print(accuracy_score(Y_test, y_pred))

0.9704035874439462


In [104]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.80      0.88       158
           1       0.97      1.00      0.98       957

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



#### Important for interview point of view to remember the steps
1. Preprocessing and Cleaning
2. Train test split
3. BOW and TF-IDF (Perform this after Train test split so that our model will have no information regarding our test data)
4. Trained our models

# Spam Ham classification using Word2vec and RandomwforestClassifier

#### Colab - https://colab.research.google.com/drive/1DMK0Z3MM8D5st0-DdBmVdQFxTj8u-P0e#scrollTo=2EgUKwoih0-q