# Bag of Words - SMS Spam Collection

Dataset: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

In [None]:
import pandas as pd

In [None]:
# Load the SMS Spam Collection as a two-column frame. The file is
# tab-separated and the column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
# NOTE(review): the raw UCI file contains unbalanced double quotes; if the row
# count is lower than expected, check whether quoting=csv.QUOTE_NONE is needed.
messages0 = pd.read_csv(
    'data/SMSSpamCollection.csv',
    sep='\t',
    names=['label', 'message'],
)

In [None]:
# Work on a copy so the raw frame (messages0) stays untouched.
messages = messages0.copy()
# Bare expression: the notebook renders the DataFrame as the cell output.
messages

---

## Text Preprocessing

In [None]:
import re
import nltk

In [None]:
from nltk.corpus import stopwords
# English stop-word list used to filter tokens during preprocessing.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')); otherwise this raises LookupError.
stopwords_en = stopwords.words('english')

### PorterStemmer

In [None]:
from nltk.stem import PorterStemmer
# Single stemmer instance, reused for every token in the stemming loop.
porter_stemmer = PorterStemmer()

In [None]:
# Build the stemmed corpus: one cleaned, space-joined string per SMS message.

# Compile the "anything that is not an ASCII letter" pattern once, outside
# the loop.
non_letter_pattern = re.compile(r'[^a-zA-Z]')

# Set membership is O(1); testing against the raw stop-word list would rescan
# the whole list for every token.
stopword_set = set(stopwords_en)

corpus_stemmed = []

# Iterate over the message values directly rather than looking each one up by
# label, so the loop does not depend on the frame having a default 0..n-1 index.
for message in messages['message']:

    # Replace every non-letter with a space, lower-case the text, and split
    # it into a list of words
    tokens = non_letter_pattern.sub(' ', message).lower().split()

    # Porter-stem each word that is not a stop word
    stems = [porter_stemmer.stem(token) for token in tokens if token not in stopword_set]

    # Store the message as a single space-separated string
    corpus_stemmed.append(' '.join(stems))

In [None]:
# Bare expression: the notebook renders the stemmed corpus as the cell output.
corpus_stemmed

### WordNet Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, reused for every token in the lemmatization loop.
# NOTE(review): lemmatizing requires the NLTK 'wordnet' corpus to be available
# locally (e.g. via nltk.download('wordnet')).
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Start the lemmatization pass from a fresh copy of the raw data.
# NOTE(review): no visible cell modifies `messages` before this point, so this
# reset looks redundant — confirm before removing.
messages = messages0.copy()
# Bare expression: the notebook renders the DataFrame as the cell output.
messages

In [None]:
# Build the lemmatized corpus: one cleaned, space-joined string per SMS message.

# Compile the "anything that is not an ASCII letter" pattern once, outside
# the loop.
non_letter_pattern = re.compile(r'[^a-zA-Z]')

# Set membership is O(1); testing against the raw stop-word list would rescan
# the whole list for every token.
stopword_set = set(stopwords_en)

corpus_lemmatized = []

# Iterate over the message values directly rather than looking each one up by
# label, so the loop does not depend on the frame having a default 0..n-1 index.
for message in messages['message']:

    # Replace every non-letter with a space, lower-case the text, and split
    # it into a list of words
    tokens = non_letter_pattern.sub(' ', message).lower().split()

    # Lemmatize each word that is not a stop word.
    # NOTE(review): lemmatize() is called without a part-of-speech tag, so
    # NLTK uses its default (noun); verbs and adjectives may pass through
    # unchanged — confirm this is intended.
    lemmas = [wordnet_lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set]

    # Store the message as a single space-separated string
    corpus_lemmatized.append(' '.join(lemmas))

In [None]:
# Bare expression: the notebook renders the lemmatized corpus as the cell output.
corpus_lemmatized

---

## Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#### Create a Bag of Words limited to the N most frequent words in the dataset (`max_features=N`)

In [None]:
# count_vectorizer = CountVectorizer(max_features=2500)

#### Create a Binary Bag of Words

In [None]:
# binary=True records only whether a word occurs in a message (1/0) instead of
# how many times; max_features=100 keeps only the 100 most frequent words.
count_vectorizer = CountVectorizer(max_features=100, binary=True)

In [None]:
# Learn the vocabulary from the lemmatized corpus and encode every message.
# fit_transform returns a sparse matrix; .toarray() converts it to a dense
# NumPy array with one row per message and one column per vocabulary word.
X = count_vectorizer.fit_transform(corpus_lemmatized).toarray()

In [None]:
# (number of messages, vocabulary size); the vocabulary size is capped by max_features.
X.shape

In [None]:
# Bare expression: the notebook renders the document-term array as the cell output.
X

---