In [1]:
import pandas as pd

In [23]:
messages = pd.read_csv(r"smsdata\SMSSpamCollection.txt", 
                      sep = "\t", 
                      names= ["label", "message"] )                   


In [24]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Cleaning & Text Preprocessing


In [18]:
import re
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [25]:
ps = PorterStemmer()
corpus = []

for i in range(0, len(message)):
    # We need textual data only
    review = re.sub('[a-zA-Z]', " ", messages['message'][i])
    
    # convert entire data into lower-case
    review = review.lower()
    
    # SPlit our entire data
    review = review.split()
    
    # remove the stop words
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    
    review = " ".join(review)
    corpus.append(review)

In [26]:
corpus

[', .. ... ...',
 '... ...',
 "2 21 2005. 87121 ( ) & ' 08452810075 18'",
 '... ...',
 "' ,",
 "' 3 ' ! ' ? ! , £1.50",
 '. .',
 "' ( )' . *9",
 '!! £900 ! 09061701461. 341. 12 .',
 '11 ? ! 08002986030',
 "' ' , ? ' .",
 '! 100 20,000 > 11 87575. 150 / , 6 , 16+ 4',
 '! 1 £100,000 ! : : 81010 & . . 4403 1 7 18',
 "' . . .",
 '!!',
 ': , >> :// . . ? =',
 "... ' :)",
 '2 ... . .',
 '\x92 . \x92',
 '- / . 87077 87077 : , 4 /ú1.20 36504 45 16+',
 '?',
 '‘ 2',
 'ü ... ...',
 '. 3 . ?',
 '. ?',
 ". ' . . . ' .",
 '.',
 "? ? ? ' ? ?",
 "' & ; ' , ' '",
 '. . ! ?',
 "' , ' '",
 "2 . . 2! ' ! . ?",
 '.',
 '?',
 '£5/ .',
 '... ü ... 2 8',
 ", ' '",
 '',
 '... ...',
 "! ' ? ' . ' !",
 '. . .',
 '? , , ... ... ... ...',
 '07732584351 - - = + . 08000930705',
 '?',
 '! . & ;#& ; ...',
 '.. ..',
 "' .",
 ', ?',
 ", '",
 "' . ' . ' . .",
 '. .',
 '& ;#& ; , & ;#& ;',
 "'",
 '. . , " ". \' \' \' . \' . \' .',
 '. : . ? ?',
 '? @ & ; & ;',
 '! 1 2 . 09061209465 ! , 3, 3, 4 ! 420- 4-5 . 150 . !',
 ", ' 

In [None]:
# numerical data
#BAG of words, TFIDF, Word2vec and Doc2vec

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [49]:
cv = CountVectorizer()

In [31]:
X = cv.fit_transform(corpus).toarray()

In [32]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values

In [35]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

# Split data into train & test

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 0)

In [40]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Build ML Model

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
spem_detect = MultinomialNB().fit(X_train,y_train )

In [43]:
spem_detect

MultinomialNB()

In [44]:
y_pred = spem_detect.predict(X_test)

In [45]:
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

# Evluation Metrics

In [59]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [55]:
accuracy = accuracy_score(y_test, y_pred)

In [58]:
accuracy

0.9192825112107623

In [56]:
cm = confusion_matrix(y_test, y_pred)

In [61]:
cm

array([[954,   1],
       [ 89,  71]], dtype=int64)

In [60]:
cr  = classification_report(y_test, y_pred)

In [62]:
cr

'              precision    recall  f1-score   support\n\n           0       0.91      1.00      0.95       955\n           1       0.99      0.44      0.61       160\n\n    accuracy                           0.92      1115\n   macro avg       0.95      0.72      0.78      1115\nweighted avg       0.92      0.92      0.91      1115\n'

# Cross Validation

In [51]:
from sklearn.model_selection import cross_val_score

In [65]:
cv_Scores = cross_val_score(spem_detect, X_train, y_train, cv= 10, scoring = "accuracy")

In [66]:
cv_Scores

array([0.9058296 , 0.91479821, 0.91704036, 0.91704036, 0.93721973,
       0.91255605, 0.92152466, 0.92134831, 0.90786517, 0.92359551])