In [1]:
conda install beautifulsoup4

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 4.11.0

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup as sp

In [3]:
data = pd.read_csv('SMSSpamCollection', sep="	", header=None)

In [4]:
data

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
data.describe()


Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
#change to lowercase
text = data[1]
text_lowercase = text.str.lower()

In [7]:
#remove punctuation
from string import punctuation

def remove_punctuation(document):
    no_punct = ''.join([character for character in document if character not in punctuation])
    return no_punct

In [8]:
text_no_punct  = text_lowercase.apply(remove_punctuation)
text_no_punct[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [9]:
#remove digits
def remove_digit(document): 
    
    no_digit = ''.join([character for character in document if not character.isdigit()])
              
    return no_digit

In [10]:
text_no_digit = text_no_punct.apply(remove_digit)

In [11]:
#tokenization
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from nltk.tokenize import word_tokenize

text_tokenized = text_no_digit.apply(word_tokenize)
text_tokenized.head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, a, wkly, comp, to, win, fa, ...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: 1, dtype: object

In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /opt/conda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def remove_stopwords(document):
    
    words = [word for word in document if not word in stop_words]
    
    return words

In [15]:
text_no_stop = text_tokenized.apply(remove_stopwords)
len(text_no_stop)


5572

In [16]:
#stemming
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def stemmer(document):
    
    stemmed_document = [porter.stem(word) for word in document]
    
    return stemmed_document
text_stemmed = text_no_stop.apply(stemmer)

In [17]:
#detokenization
from nltk.tokenize.treebank import TreebankWordDetokenizer

text_detokenized = text_stemmed.apply(TreebankWordDetokenizer().detokenize)

In [18]:
#document-term matrix
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer()

sparse_dtm = countvec.fit_transform(text_detokenized)
# 0.5% of the posts or more 

countvec2 = CountVectorizer(min_df=0.005)
sparse_dtm2 = countvec2.fit_transform(text_detokenized)

dtm2 = pd.DataFrame(sparse_dtm2.toarray(), columns=countvec2.get_feature_names(), index=data.index)
dtm2.sum().sort_values(ascending=False) 

call         657
im           467
go           454
get          448
ur           390
            ... 
enough        29
await         28
detail        28
afternoon     28
tv            28
Length: 336, dtype: int64

In [19]:
# Now, let's try with 0.25% of the posts or more

countvec3 = CountVectorizer(min_df=0.0025)
sparse_dtm3 = countvec3.fit_transform(text_detokenized)

dtm3 = pd.DataFrame(sparse_dtm3.toarray(), columns=countvec3.get_feature_names(), index=data.index)
dtm3.sum().sort_values(ascending=False)

call      657
im        467
go        454
get       448
ur        390
         ... 
cours      14
scream     14
comput     14
whole      14
jay        14
Length: 610, dtype: int64

Compared to Homework 5, the quantity of terms in the DTM is comparable under much smaller values of min_df.

In [20]:
data[0]


0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object

In [21]:
#Train-Test Split
# Let's take a 70 - 30 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dtm2, data[0], test_size=0.3, random_state=42)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [28]:
def accuracy(a, b, c, d):
    return (a + d)/(a+b+c+d)

In [29]:
######
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



dtc_test_b = DecisionTreeClassifier(min_samples_leaf=5, 
                              ccp_alpha=0.0005,
                              #class_weight = {0: 1, 1: 20},
                              random_state = 88)
dtc_test_b = dtc_test_b.fit(X_train, y_train)
y_pred_dtc_test_b = dtc_test_b.predict(X_test)
cm_clatree_b = confusion_matrix(y_test, y_pred_dtc_test_b)

accuracy(cm_clatree_b.item((0,0)), cm_clatree_b.item((0,1)), cm_clatree_b.item((1, 0)), cm_clatree_b.item((1, 1)))

0.9527511961722488

In [32]:
# import time
# def bootstrap(test_data, test_label, train_label, model, metrics_list, sample=500, random_state=66):
#     tic = time.time()
#     n_sample = sample
#     n_metrics = len(metrics_list)
#     output_array=np.zeros([n_sample, n_metrics])
#     output_array[:]=np.nan
#     print(output_array.shape)
#     for bs_iter in range(n_sample):
#         bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
#         bs_data = test_data.loc[bs_index]
#         bs_label = test_label.loc[bs_index]
#         bs_predicted = model.predict(bs_data)
#         for metrics_iter in range(n_metrics):
#             metrics = metrics_list[metrics_iter]
#             output_array[bs_iter, metrics_iter]=metrics(bs_predicted,bs_label)

#     output_df = pd.DataFrame(output_array)
#     return output_df

In [33]:
# cart_bs = bootstrap(X_test,y_test,y_train,dtc_test_b,
#                                  metrics_list=[accuracy_score],
#                                  sample = 5000)

(5000, 1)
