In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [24]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the labelled e-mails; the code below reads the 'text' and 'spam' columns.
data = pd.read_csv('/content/drive/My Drive/emails.csv')
# Split the data into train and temp sets (80% train, 20% temp).
# random_state is fixed so the same rows land in each split on every run.
# NOTE(review): the split is not stratified on 'spam' — confirm that is intended.
X_train, X_temp, y_train, y_temp = train_test_split(data['text'], data['spam'], test_size=0.2, random_state=42)

# Split the temp set in half: one half becomes the test set, the other the validation set.
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Re-join each split's text with its label into one frame per split.
# Nothing is written to disk here; the CSV export happens in a later cell.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
# `messages` is another name for the training split (same object, not a copy).
messages = train_data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Let's see the count of (distinct) messages for each label.

In [25]:
# Per-label summary of the training texts: count, unique, top and freq.
# A `unique` value below `count` means some messages repeat within that label.
messages.groupby('spam').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,3504,3482,"Subject: re : eprm 2001 houston layla , my a...",2
1,1078,1078,Subject: secretly record all internet activity...,1


## Step 2: Data preprocessing

In this section we'll massage the raw messages (sequences of characters) into vectors (sequences of numbers).

The mapping is not 1-to-1; we'll use the [bag-of-words](http://en.wikipedia.org/wiki/Bag-of-words_model) approach, where each unique word in a text will be represented by one number.

As a first step, let's write a function that will split a message into its individual words:

In [28]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
_STOP_WORDS_CACHE: dict = {}


def _english_stop_words() -> frozenset:
    """Return NLTK's English stop words, reading the corpus only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text: str) -> str:
    """Clean one raw message into a space-joined string of kept tokens.

    Steps, in order: strip surrounding whitespace and lowercase the text;
    delete every character that is not a letter a-z/A-Z, a digit or
    whitespace; tokenize with ``word_tokenize``; drop English stop words;
    join the remaining tokens with single spaces.

    Args:
        text: The raw message.

    Returns:
        The cleaned message as one string (empty if nothing survives).
    """
    text = text.strip().lower()

    # The character class previously read "A_Z" (a typo for "A-Z"), which
    # whitelisted "_" and let underscores survive the punctuation cleanup.
    text = re.sub(r"[^a-zA-Z\d\s]", "", text)

    # Looked up lazily so defining this function never requires the corpus;
    # cached so the stop-word file is not re-read for every message.
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    return " ".join(kept_tokens)

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean a labelled message frame without modifying the caller's frame.

    Rows containing any missing value are dropped, the ``text`` column is run
    through ``preprocess_text``, and rows whose cleaned text repeats an
    earlier row are removed (the first occurrence is kept).

    Args:
        data: Frame with a ``text`` column of raw message strings.

    Returns:
        A new frame holding the surviving rows under their original index
        labels, with ``text`` replaced by its cleaned form.
    """
    complete_rows = data.dropna()
    # `assign` builds a new frame, so no column is set on a derived frame and
    # no `inplace=True` is needed (both can raise SettingWithCopyWarning).
    cleaned = complete_rows.assign(text=complete_rows["text"].apply(preprocess_text))
    return cleaned.drop_duplicates("text")
# preprocess_text relies on NLTK's stop-word list and on word_tokenize, whose
# data ships in the "stopwords" and "punkt" packages. Fetch them here so this
# cell also runs on a fresh kernel, where the download cell further down has
# not been executed yet.
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead of
# "punkt" — confirm against the installed version.
import nltk
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)

# Each split is cleaned independently and rebound to its existing name,
# because later cells refer to the cleaned frames under these names.
messages = preprocess_data(messages)
val_data = preprocess_data(val_data)
test_data = preprocess_data(test_data)

In [30]:
import nltk
# Data used by preprocess_text: the stop-word lists read via stopwords.words
# and the "punkt" tokenizer data — presumably what word_tokenize loads.
# NOTE(review): this cell sits below the cell that calls preprocess_data, so on
# a top-to-bottom run the data may be missing when it is first needed — confirm
# the intended cell order.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
import nltk
# NOTE(review): "punkt" is already downloaded by an earlier cell; this repeat
# only reports that the package is up to date — confirm whether it can go.
nltk.download('punkt')
def split_into_tokens(message):
    """Split a message into its words, leaving out a leading "Subject".

    Args:
        message: The message text.

    Returns:
        The words of ``message`` as given by ``TextBlob(message).words``.
        When the first word is "subject" (in any casing) it is omitted.
    """
    words = TextBlob(message).words
    # Strip the header word only when it is really there: slicing off the
    # first token unconditionally would discard a real word from any message
    # that does not begin with "Subject".
    if words and words[0].lower() == "subject":
        return words[1:]
    return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
import nltk
# NOTE(review): no visible cell calls a part-of-speech tagger directly —
# confirm whether the TextBlob calls in this notebook actually need this data.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Next, we normalize the words into their base forms ([lemmas](http://en.wikipedia.org/wiki/Lemmatisation)), dropping the leading word "Subject", with:

In [33]:
import nltk
# WordNet data — presumably what the `lemma` lookups in split_into_lemmas
# rely on; verify against the TextBlob documentation.
nltk.download('wordnet')
def split_into_lemmas(message):
    """Lowercase a message and return the lemma of each of its words.

    Args:
        message: The message text.

    Returns:
        A list with one lemma (``word.lemma``) per word of the lowercased
        message, in order. A leading "subject" word is left out.
    """
    words = TextBlob(message.lower()).words
    # Drop the header word only when present; an unconditional [1:] would
    # throw away the first real word of a message without a "Subject" prefix.
    if words and words[0] == "subject":
        words = words[1:]
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]

# Spot-check the lemmatizer on the first rows of each split.
# Only the last expression is rendered as this cell's output.
messages["text"].head().apply(split_into_lemmas)
val_data["text"].head().apply(split_into_lemmas)
test_data["text"].head().apply(split_into_lemmas)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


3529    [conference, steve, slide, ok, negotiation, ri...
1044    [fbi, color, 003399, font, size, 14, px, font,...
5552    [looking, fat, tail, time, series, ngi, socal,...
1315    [failure, notice, hi, qmail, send, program, ba...
5070    [tony, hamilton, tony, already, done, desleigh...
Name: text, dtype: object

In [34]:
# Write each cleaned split to Drive, leaving the index out of the CSV.
for frame, destination in (
    (messages, '/content/drive/My Drive/train_data.csv'),
    (val_data, '/content/drive/My Drive/val_data.csv'),
    (test_data, '/content/drive/My Drive/test_data.csv'),
):
    frame.to_csv(destination, index=False)