In [1]:
# SMS spam classification with TF-IDF features and a linear SVM; based on the Udemy lecture with modifications

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the tab-separated SMS spam collection into a DataFrame
sms = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')

In [4]:
# Peek at the first rows: each message carries a label plus length and punct columns
sms.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# Count the missing values in each column
sms.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Class balance: number of messages per label
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the ham/spam label
X, y = sms['message'], sms['label']

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [8]:
# Hold out 33% of the messages for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [10]:
# Show the first ten raw messages
X.head(10)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: message, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer built with default settings
count_vect = CountVectorizer()

In [12]:
# Fit only: this call fits the vectorizer on the training messages; transforming is a separate step
count_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
# Transform the training messages with the fitted vectorizer; the result is displayed, not stored
count_vect.transform(X_train)

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [13]:
# Fit and transform in a single call, keeping the resulting count matrix
X_train_counts = count_vect.fit_transform(X_train)

In [15]:
# Inspect the count matrix (displayed as a sparse-matrix summary)
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
# TfidfTransformer is used on the existing count matrix
tfidf_transformer = TfidfTransformer()

# Fit on the training counts and keep the transformed matrix
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [19]:
# Inspect the TF-IDF matrix produced from the counts
X_train_tfidf

<3733x7082 sparse matrix of type '<class 'numpy.float64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer works directly on raw text (scikit-learn documents it as
# equivalent to CountVectorizer followed by TfidfTransformer)
vectorizer = TfidfVectorizer()

In [21]:
# Rebinds X_train_tfidf (previously set from tfidf_transformer) to the
# TfidfVectorizer result computed from the raw training text
X_train_tfidf = vectorizer.fit_transform(X_train)

In [30]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier on the TF-IDF features.
# The cell ends with the fit call so the fitted estimator is displayed.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can go straight to fit/predict
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [33]:
# Fit the whole pipeline on the raw training messages and their labels
text_clf.fit(X_train, y_train)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [34]:
# Predict labels for the held-out messages; the pipeline receives the raw text directly
predictions = text_clf.predict(X_test)

In [35]:
# Display the held-out messages that were just classified
X_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
2973    Sary just need Tim in the bollox &it hurt him ...
2991    Love isn't a decision, it's a feeling. If we c...
2942    My supervisor find 4 me one lor i thk his stud...
230                    Dear good morning now only i am up
1181                           I'm in chennai velachery:)
1912    Lol grr my mom is taking forever with my presc...
1992    No other Valentines huh? The proof is on your ...
5435                    I'm wif him now buying tix lar...
4805    Er, hello, things didn‘t quite go to plan – is...
401     FREE RINGTONE text FIRST to 87131 for a poly o...
1859                     Sir, i am waiting for your call.
1344    Crazy ar he's married. Ü like gd looking guys ...
2952          

In [36]:
from sklearn.metrics import confusion_matrix, classification_report

# Rows are true labels and columns are predicted labels, ordered ham then spam
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [37]:
# Per-class precision, recall and F1 on the held-out messages
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [38]:
from sklearn import metrics

# Fraction of held-out messages whose predicted label matches the true label
metrics.accuracy_score(y_test, predictions)

0.989668297988037

In [42]:
# Try the pipeline on a hand-written message, passed as a one-element list
text_clf.predict(['Text this number to get free money!'])

array(['spam'], dtype=object)

In [46]:
# The model still falls short: this message is classified as ham (see the output below)
text_clf.predict(['Click here for your reward! THIS IS NOT REAL HAHA'])

array(['ham'], dtype=object)