In [1]:
import pandas as pd
import numpy as np

## Load in the Data

In [2]:
data_path = "~/downloads/sms+spam+collection/SMSSpamCollection"
df = pd.read_csv(data_path, sep="\t", header=None, names=["label", "text"])
print(df.label.value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


Dataset is imbalanced due to the proportion of positive to negative labels, which will determine the choice of metrics in the future.

## Create a Holdout Set

Most tutorials use test/train splits for easier data processing, but in production a holdout set of unseen cases helps evaluate your model more accurately.

In [3]:
from sklearn.model_selection import train_test_split

# Splitting data into a training validation set and a holdout set
df_train_val, df_holdout = train_test_split(df, 
                                            test_size=0.1,  # splits 10% of the data into the holdout set
                                            stratify=df['label'],   # stratify ensures that the correct class distribution is preserved
                                            random_state=42)

In [4]:
df_train_val.to_csv('./data/raw/spam_train_val.csv', index=False)
df_holdout.to_csv('./data/raw/spam_holdout.csv', index=False)

## Preprocess the Text

Text data is noisy in terms of "win" vs "winning" and other nuances that are not important for the meaning of the text, especially in the context of spam detection. 

In [5]:
from ntlk.corpus import stopwords   # Words like "the", "a", "but", "and" w/ no meaning 
from nltk.stem import PorterStemmer # Stemming = removing the "ing" so the meaning is still preserved
from nltk.tokenize import word_tokenize
import string, nltk

nltk.download('punkt_tab')  # Punctuation
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))    # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    return ''.join(tokens)

df_train_val['clean_text'] = df_train_val['text'].apply(preprocess_text)
df_train_val['label_num'] = df_train_val['label'].map({'ham': 0, 'spam': 1})

ModuleNotFoundError: No module named 'ntlk'