In [1]:
conda install beautifulsoup4

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 4.11.0

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.10.0      |     pyha770c72_0          77 KB  conda-forge
    certifi-2021.10.8          |   py39hf3d152e_1         145 KB  conda-forge
    soupsieve-2.3.1            |     pyhd8ed1ab_0          33 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         255 KB

The following NEW packages will be INSTALLED:

  beautifulsoup4     conda-forge/noarch::beautifulsoup4-4.10.0-pyha770c72_0
  soupsieve          conda-forge/noarch::soupsieve-2.3.1-p

In [1]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup as sp

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so columns are
# addressed by position: 0 = label (ham/spam), 1 = message text.
# The delimiter is written as the "\t" escape so it stays visible and cannot be
# silently converted to spaces by an editor.
# NOTE(review): with default quote handling, a message containing an unbalanced
# double quote may be merged with the following lines -- confirm that the row
# count matches the raw line count (consider quoting=csv.QUOTE_NONE if not).
data = pd.read_csv('SMSSpamCollection', sep="\t", header=None)

## Part 1: Data Cleaning

In [3]:
# Preview the raw frame: column 0 holds the label, column 1 the message text.
data

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Summarise both columns: row count, number of distinct values, most frequent
# value and how often it occurs.
data.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


We will keep duplicate messages. As the summary above shows, the most frequent message, 'Sorry, I'll call later', appears 30 times. Duplicates like this may be informative to our model, since they can come from non-spam automated replies.

#### Change to Lowercase

In [11]:
# Column 1 of the headerless frame holds the message text (column 0 is the label).
text = data[1]
# Lower-case every message so later steps treat "Free" and "free" alike.
text_lowercase = text.str.lower()

#### Remove Punctuation

In [12]:
from string import punctuation

def remove_punctuation(document):
    """Return ``document`` with every ``string.punctuation`` character removed.

    Characters are dropped outright (no space is inserted in their place), so
    the text on either side of a removed character is joined together.
    """
    kept = []
    for symbol in document:
        if symbol in punctuation:
            continue
        kept.append(symbol)
    return ''.join(kept)

In [14]:
# Strip punctuation from every lower-cased message, then show the first
# message as a spot check.
text_no_punct = text_lowercase.map(remove_punctuation)
text_no_punct[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

#### Remove Digits 

In [15]:
def remove_digit(document):
    """Return ``document`` without the characters for which ``str.isdigit()`` is true."""
    return ''.join(filter(lambda symbol: not symbol.isdigit(), document))

In [16]:
# Drop digit characters from every punctuation-free message.
text_no_digit = text_no_punct.map(remove_digit)

#### Tokenization

In [17]:
import nltk

# Fetch the Punkt tokenizer data (presumably what word_tokenize relies on below);
# the call returns True once the package is available.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
from nltk.tokenize import word_tokenize

# Turn every cleaned message into a list of word tokens and preview the result.
text_tokenized = text_no_digit.map(word_tokenize)
text_tokenized.head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, a, wkly, comp, to, win, fa, ...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: 1, dtype: object

In [21]:
# Make sure the stop-word corpus is available locally before reading it.
nltk.download("stopwords")
from nltk.corpus import stopwords

# Stored as a set so membership checks during filtering are cheap.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /opt/conda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def remove_stopwords(document):
    """Return the tokens of ``document`` that are not in ``stop_words``, in their original order."""
    kept = []
    for token in document:
        if token not in stop_words:
            kept.append(token)
    return kept

In [23]:
# Filter the stop words out of every token list.
text_no_stop = text_tokenized.map(remove_stopwords)

In [24]:
# Sanity check: stop-word removal should leave one entry per message.
len(text_no_stop)

5572

#### Stemming

In [25]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def stemmer(document):
    """Return a list with ``porter.stem`` applied to each token of ``document``, order preserved."""
    return list(map(porter.stem, document))

In [26]:
# Stem every remaining token of every message.
text_stemmed = text_no_stop.map(stemmer)

#### Detokenization

In [27]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Join each list of stemmed tokens back into a single string; these strings are
# the input to the CountVectorizer cells that follow.
text_detokenized = text_stemmed.map(TreebankWordDetokenizer().detokenize)

#### Document-term Matrix

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Baseline vectorizer with default settings (no min_df filtering).
countvec = CountVectorizer()

# Sparse document-term matrix: one row per message.
sparse_dtm = countvec.fit_transform(text_detokenized)

In [29]:
# Keep only terms that occur in at least 0.5% of the messages
# (a float min_df is a proportion of documents).

countvec2 = CountVectorizer(min_df=0.005)
sparse_dtm2 = countvec2.fit_transform(text_detokenized)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(countvec2, "get_feature_names_out"):
    feature_names2 = countvec2.get_feature_names_out()
else:
    feature_names2 = countvec2.get_feature_names()

# Dense frame indexed like the raw data so rows stay aligned with the labels.
dtm2 = pd.DataFrame(sparse_dtm2.toarray(), columns=feature_names2, index=data.index)
# Total occurrences of each retained term, most frequent first.
dtm2.sum().sort_values(ascending=False)

call         657
im           467
go           454
get          448
ur           390
            ... 
enough        29
await         28
detail        28
afternoon     28
tv            28
Length: 336, dtype: int64

In [34]:
# Now, let's try keeping terms that occur in at least 0.25% of the messages

countvec3 = CountVectorizer(min_df=0.0025)
sparse_dtm3 = countvec3.fit_transform(text_detokenized)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(countvec3, "get_feature_names_out"):
    feature_names3 = countvec3.get_feature_names_out()
else:
    feature_names3 = countvec3.get_feature_names()

# Dense frame indexed like the raw data so rows stay aligned with the labels.
dtm3 = pd.DataFrame(sparse_dtm3.toarray(), columns=feature_names3, index=data.index)
# Total occurrences of each retained term, most frequent first.
dtm3.sum().sort_values(ascending=False)

call      657
im        467
go        454
get       448
ur        390
         ... 
cours      14
scream     14
comput     14
whole      14
jay        14
Length: 610, dtype: int64

Compared to Homework 5, this corpus needs much smaller values of `min_df` to yield a comparable number of terms in the DTM (336 terms at 0.5%, 610 terms at 0.25%).

In [40]:
# Column 0 is the target: the ham/spam label of each message.
data[0]

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object

#### Train-Test Split

In [41]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# partition reproducible across runs.
# NOTE(review): the dtm2 vocabulary was fitted on all messages before this split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dtm2,
    data[0],
    test_size=0.3,
    random_state=42,
)

In [46]:
# Features and labels must have the same number of rows in each partition ...
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
# ... and must still be paired row-for-row: equal lengths alone would not
# reveal a feature matrix and label vector that had drifted out of alignment.
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)
# Together the two partitions account for every message exactly once.
assert len(X_train) + len(X_test) == len(dtm2)

# Model Building

## Random Forest Classifier

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# 500-tree forest; each split considers 8 candidate features and every leaf
# keeps at least 5 training samples. The seed makes the fit reproducible.
rf = RandomForestClassifier(
    n_estimators=500,
    max_features=8,
    min_samples_leaf=5,
    random_state=88,
)
rf.fit(X_train, y_train)

RandomForestClassifier(max_features=8, min_samples_leaf=5, n_estimators=500,
                       random_state=88)

In [48]:
# Score the held-out messages with the fitted forest.
y_pred = rf.predict(X_test)

# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
print("\nAccuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix: 
 [[1447    1]
 [  57  167]]

Accuracy: 0.965311004784689
