In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the SMS spam collection; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
# Count missing values per column to confirm the dataset has no gaps.
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [4]:
# Class balance check: the dataset is imbalanced, with far more ham than spam messages.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# **Part 1 - Setting up Train Test Split**

In [5]:
# Features: the raw message text only (the length and punct columns are not used here).
X = df['message']
# Target: the label saying whether each message is ham or spam.
y = df['label']

# Hold out 25% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=117,
)

In [6]:
# Preview the training messages (the index shows the shuffled original row labels).
X_train

1417              No..few hours before.went to hair cut .
2757    Have a good trip. Watch out for . Remember whe...
3863                     Customer place, i wil cal u sir.
4551    Hey! do u fancy meetin me at 4 at cha  hav a ...
5233     Hey what how about your project. Started aha da.
                              ...                        
275                       No objection. My bf not coming.
3671                        Ok thanx... Take care then...
2385                               Sorry, I'll call later
1136                      K do I need a login or anything
3566    We know TAJ MAHAL as symbol of love. But the o...
Name: message, Length: 4179, dtype: object

In [7]:
# Preview the training labels; they share the same index as X_train.
y_train

1417    ham
2757    ham
3863    ham
4551    ham
5233    ham
       ... 
275     ham
3671    ham
2385    ham
1136    ham
3566    ham
Name: label, Length: 4179, dtype: object

# **Part 2 - Creating a Pipeline to Fit and Transform**

In [10]:
# The next cells illustrate what TfidfVectorizer does to the training data.
# Before vectorizing, X_train is a 1-D Series of raw message strings.
X_train.shape

(4179,)

In [13]:
# TfidfVectorizer first builds a count vector over the vocabulary of each message,
# then re-weights those counts with a TF-IDF transform.
# This standalone vectorizer is for illustration only; the pipeline below creates its own.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
# Resulting shape is (number of messages, vocabulary size): one column per distinct token.
X_train_tfidf.shape

(4179, 7462)

In [15]:
# The pipeline chains two steps so raw text can be passed straight to fit/predict:
#   Step 1 ('tfidf'): TfidfVectorizer count-vectorizes each message and applies the
#                     TF-IDF feature-extraction transform.
#   Step 2 ('clf'):   LinearSVC fits a best-fit hyperplane that separates the classes.
# Each tuple in the list is one named step, executed in order.
# LinearSVC's default dual solver relies on a random number generator, so the seed is
# fixed (same value as the train/test split) to keep Restart & Run All reproducible.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=117)),
])

In [16]:
# Fit the whole pipeline on the training data in one call: the vectorizer and the
# classifier do not need to be instantiated and fitted separately.
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

# **Part 3 - Predicting and Results**

In [17]:
# Predict a label for every held-out message.
predictions = text_clf.predict(X_test)

# Store the confusion matrix under its own name instead of overwriting `df`, so the
# raw dataset stays available and earlier cells can be re-run safely.
# The label order is passed explicitly so the matrix rows/columns are guaranteed to
# line up with the index/column names. Rows are the true labels, columns the predictions.
class_labels = ['ham', 'spam']
confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
).rename_axis(index='actual', columns='predicted')
confusion_df

Unnamed: 0,ham,spam
ham,1210,4
spam,17,162


In [18]:
# Per-class precision, recall and F1 on the held-out set (print keeps the table layout).
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1214
        spam       0.98      0.91      0.94       179

    accuracy                           0.98      1393
   macro avg       0.98      0.95      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [19]:
# Overall fraction of held-out messages that were classified correctly.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.9849246231155779


In [20]:
# Spot check: an everyday greeting is classified as ham.
text_clf.predict(['Hi how are you doing today?'])

array(['ham'], dtype=object)

In [16]:
# Spot check: a casual conversational message is classified as ham.
text_clf.predict(['Halo is my favourite video game series, what about you!?'])

array(['ham'], dtype=object)

In [18]:
# Spot check: a personal social-media plug is still classified as ham.
text_clf.predict(['Come follow my puppy, @jindodooboo on instagram!'])

array(['ham'], dtype=object)

In [22]:
# Spot check: a contest/winner style message is classified as spam.
text_clf.predict(['Congratulations, you are a winner. Text 117 to get entry to a contest'])

array(['spam'], dtype=object)

In [20]:
# Spot check: an all-caps free-prize offer is classified as spam.
text_clf.predict(['LAST CHANCE TO CLAIM YOUR FREE PRIZE, A COPY OF HALO 3'])

array(['spam'], dtype=object)