In [1]:
# Importing the libraries

import numpy as np
import pandas as pd

In [2]:
# Loading the spam messages dataset

df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [3]:
# Viewing first few rows of data

df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Checking for null values

df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# Value counts for label

df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
# Spliting the dataset into vectors

X = df['message']
y = df['label']

In [34]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [35]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [36]:
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [40]:
# Splitting the dataset into test and train data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [41]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3733,)
(3733,)
(1839,)
(1839,)


In [42]:
# Count Vectorizer for message data

from sklearn.feature_extraction.text import CountVectorizer

In [43]:
count_vect = CountVectorizer()

In [44]:
# FIT VECTORIZER the data (build a vocab, count the number of words)

# count_vect.fit(X_train)

# TRANSFORM the original text message --> vector

# X_train_counts = count.vect.transform(X_train)

# Or can directly fit_transform

X_train_counts = count_vect.fit_transform(X_train)

In [45]:
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [46]:
print(X_train.shape)
print(X_train_counts.shape)

(3733,)
(3733, 7082)


In [47]:
# Using tfidf Transformer

from sklearn.feature_extraction.text import TfidfTransformer

In [48]:
tfidf_transformer = TfidfTransformer()

In [49]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [50]:
X_train_tfidf

<3733x7082 sparse matrix of type '<class 'numpy.float64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [51]:
print(X_train.shape)
print(X_train_tfidf.shape)

(3733,)
(3733, 7082)


In [52]:
# Now using TfidfVectorizer
# It combines both CountVectorizer and TfidfVectorizer steps

from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
tfidf_vect = TfidfVectorizer()

In [54]:
X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [55]:
X_train_tfidf

<3733x7082 sparse matrix of type '<class 'numpy.float64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [56]:
print(X_train.shape)
print(X_train_tfidf.shape)
print(y_train.shape)

(3733,)
(3733, 7082)
(3733,)


In [57]:
# Support Vector Machine (SVM)
# Support Vector Classifier

from sklearn.svm import LinearSVC

In [58]:
classifier = LinearSVC()

In [59]:
classifier.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [60]:
# Using pipeline

from sklearn.pipeline import Pipeline

In [61]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [63]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [66]:
predictions = text_clf.predict(X_test)

In [67]:
from sklearn.metrics import confusion_matrix, classification_report

In [69]:
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [70]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [71]:
from sklearn import metrics

In [72]:
metrics.accuracy_score(y_test, predictions)

0.989668297988037

In [74]:
text_clf.predict(['Hii!! How are you?'])

array(['ham'], dtype=object)

In [75]:
text_clf.predict(['Congratulations..!! You have won free passes to enter the contest.!'])

array(['spam'], dtype=object)