In [1]:
# Download the data
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

--2018-09-20 08:56:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/zip]
Saving to: ‘smsspamcollection.zip’


2018-09-20 08:56:54 (79.5 KB/s) - ‘smsspamcollection.zip’ saved [203415/203415]



In [2]:
# Decompress the data
!unzip smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [6]:
# Read the data into a dataframe
import pandas as pd
df = pd.read_csv("SMSSpamCollection", sep="\t", header=None)
df.shape

(5572, 2)

In [17]:
# Inspect the raw data
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Assign meaningful column names to the dataframe
df.columns = ["label", "text"]
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Extract the training data
X = df["text"]
y = df["label"].apply(lambda x: 1 if x == "spam" else 0)

In [11]:
# Split data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=100)

In [19]:
# Train a naive Bayes model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
clf = Pipeline([                 # Creating a pipeline
    ('vec', CountVectorizer()),  # The count vectorizer using default params
    ('nb', MultinomialNB())      # The multinomial NB using default params
])
clf.fit(X_train, y_train)        # Use the training data to fit the model

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [14]:
# Check the performance of the model
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
y_pred = clf.predict(X_test)
print("Accuracy : {:.4f}".format(accuracy_score(y_test, y_pred)))
print("AUC      : {:.4f}".format(roc_auc_score(y_test, y_pred)))
print("Precision: {:.4f}".format(precision_score(y_test, y_pred)))
print("Recall   : {:.4f}".format(recall_score(y_test, y_pred)))

Accuracy : 0.9839
AUC      : 0.9548
Precision: 0.9624
Recall   : 0.9152


In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1448
          1       0.96      0.92      0.94       224

avg / total       0.98      0.98      0.98      1672

