# Data preprocessing

In this notebook, we preprocess the sample datasets that we will use as case studies and train simple baseline classifiers (logistic regression and a linear SVM) on them. The datasets are the following:

* [Enron email dataset](https://www.kaggle.com/datasets/wcukierski/enron-email-dataset)
* [Fraudulent emails dataset](https://www.kaggle.com/datasets/rtatman/fraudulent-email-corpus)
* [IMDB movie reviews](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

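You will need to download and extract them into the `data` directory one level above this notebook, so that the code below finds `../data/enron-email-dataset/emails.csv`, `../data/fraudulent-email-corpus/fradulent_emails.txt` and `../data/aclImdb/train`.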

In [None]:
import os
import time
import random
import numpy as np
import pandas as pd
import email
import re
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

## Enron and fraudulent emails datasets

In [None]:
# Locations of the two raw email corpora, relative to this notebook.
enron_filepath = "../data/enron-email-dataset/emails.csv"
# We will preserve the typo in the filename as that is how it appears on Kaggle.
fraud_filepath = "../data/fraudulent-email-corpus/fradulent_emails.txt"

In [None]:
# Load the Enron corpus from CSV and display its (rows, columns) shape.
emails = pd.read_csv(enron_filepath)
emails.shape

In [None]:
# Preview the first rows of the Enron table.
emails.head()

In [None]:
# Show the raw `message` field of the first row.
print(emails.loc[0, "message"])

In [None]:
def extract_messages(df):
    """Extract the body text of every raw email in ``df["message"]``.

    Each entry is parsed with ``email.message_from_string``. For a
    single-part message the payload is returned unchanged. For a multipart
    message ``get_payload()`` returns a list of ``Message`` objects rather
    than text, so the string payloads of its leaf parts are joined with
    newlines instead.

    Args:
        df: Mapping-like object (for example a DataFrame) whose
            ``"message"`` entry is an iterable of raw email strings.

    Returns:
        list: One body per input message, in input order.
    """
    messages = []
    for raw_message in df["message"]:
        parsed = email.message_from_string(raw_message)
        if parsed.is_multipart():
            # walk() yields the container parts too; keep only the leaves,
            # whose payload is the actual text.
            leaf_payloads = (
                part.get_payload()
                for part in parsed.walk()
                if not part.is_multipart()
            )
            message_body = "\n".join(
                payload for payload in leaf_payloads if isinstance(payload, str)
            )
        else:
            message_body = parsed.get_payload()
        messages.append(message_body)
    return messages

In [None]:
# Parse every Enron message and keep only its body.
bodies = extract_messages(emails)

In [None]:
# Wrap the extracted bodies in a DataFrame and preview the first rows.
bodies_df = pd.DataFrame(bodies)
bodies_df.head()

In [None]:
# Read the whole fraud corpus as one string. The file is decoded as latin1.
with open(fraud_filepath, "r", encoding="latin1") as infile:
    data = infile.read()
# Messages are assumed to be delimited by lines starting with "From r".
# Split only where "From r" begins a line: a plain str.split("From r") would
# also cut a message in two wherever its text contains "From r" mid-line.
fraud_emails = re.split(r"^From r", data, flags=re.MULTILINE)
len(fraud_emails)

In [None]:
# Parse each raw chunk with the same helper used for the Enron data; the
# helper expects a "message" column, so wrap the chunks in a DataFrame first.
fraud_bodies = extract_messages(
    pd.DataFrame(fraud_emails, columns=["message"], dtype=str)
)
# Skip the first entry: it comes from whatever text precedes the first
# "From r" separator (presumably nothing useful -- verify against the file).
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])
fraud_bodies_df.head()

In [None]:
# Show the first extracted fraud email body (row label 0, column label 0).
print(fraud_bodies_df.loc[0, 0])

In [None]:
# Seed the global random generators so that Restart & Run All reproduces the
# same samples, shuffles and accuracy scores. The cells below draw from
# NumPy's global generator (`np.random.permutation`, `np.random.choice`, and
# pandas `.sample` when no `random_state` is passed).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

Nsamp = 1000  # sample size per email class (the IMDB section draws 2 * Nsamp reviews)
maxtokens = 50  # maximum number of tokens kept per document
maxtokenlen = 20  # maximum number of characters kept per token

In [None]:
def tokenize(row, max_tokens=None):
    """Split a document into at most ``max_tokens`` space-separated tokens.

    Args:
        row: Document text. Values other than ``None`` and ``""`` are
            converted with ``str`` before splitting.
        max_tokens: Maximum number of tokens to keep. Defaults to the
            notebook-level ``maxtokens`` setting.

    Returns:
        list[str]: The leading tokens of the document, or an empty list when
        ``row`` is ``None`` or the empty string.
    """
    if row in (None, ""):
        # Always hand back a list so callers see a single return type
        # (the empty case previously produced the string "").
        return []
    if max_tokens is None:
        max_tokens = maxtokens
    # Splits on single spaces only: newlines and tabs stay inside tokens.
    return str(row).split(" ")[:max_tokens]

In [None]:
def reg_expressions(row, max_token_len=None):
    """Normalize tokens: lower-case, strip symbols and digits, truncate.

    Every token is lower-cased, stripped of all non-word characters and
    digits (underscores are kept), and cut to at most ``max_token_len``
    characters.

    Args:
        row: Iterable of string tokens.
        max_token_len: Maximum number of characters kept per token. Defaults
            to the notebook-level ``maxtokenlen`` setting.

    Returns:
        list[str]: The normalized tokens. Tokens made up solely of stripped
        characters become ``""``. If ``row`` is not iterable or contains a
        non-string token, processing stops there and a single ``""`` is
        appended to whatever was normalized so far.
    """
    # Resolved outside the try block so a missing setting is reported
    # instead of being mistaken for malformed input.
    if max_token_len is None:
        max_token_len = maxtokenlen
    tokens = []
    try:
        for token in row:
            cleaned = re.sub(r"[\W\d]", "", token.lower())
            tokens.append(cleaned[:max_token_len])
    except (AttributeError, TypeError):
        # Best effort for malformed input (non-iterable row, non-string
        # token); any other error is a real bug and is allowed to surface.
        tokens.append("")
    return tokens

In [None]:
# Make sure the NLTK "stopwords" resource is available before it is loaded below.
nltk.download("stopwords")

In [None]:
# English stop-word list, consulted by `stop_word_removal`.
english_stopwords = stopwords.words("english")

In [None]:
def stop_word_removal(row, stop_words=None):
    """Drop stop words and empty tokens from a token sequence.

    Matching is case-insensitive: this step runs before tokens are
    lower-cased, so a case-sensitive comparison against a lower-case stop
    list would let capitalized stop words such as "The" through.

    Args:
        row: Iterable of tokens (strings).
        stop_words: Collection of lower-case stop words. Defaults to the
            notebook-level ``english_stopwords`` list.

    Returns:
        list: The remaining tokens in their original order. A list (rather
        than a one-shot ``filter`` iterator) so the result can be inspected
        and iterated more than once.
    """
    if stop_words is None:
        stop_words = english_stopwords
    stop_word_set = set(stop_words)
    kept = []
    for token in row:
        if not token:
            # Empty / falsy tokens are discarded.
            continue
        key = token.lower() if isinstance(token, str) else token
        if key not in stop_word_set:
            kept.append(token)
    return kept

In [None]:
# Sample first, then preprocess: only the Nsamp sampled bodies of each corpus
# are tokenized and cleaned, instead of every message in both corpora.
EnronEmails = bodies_df.iloc[:, 0].sample(Nsamp)
EnronEmails = EnronEmails.apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
SpamEmails = fraud_bodies_df.iloc[:, 0].sample(Nsamp)
SpamEmails = SpamEmails.apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
# Spam rows come first, followed by the Enron rows.
raw_data = pd.concat((SpamEmails, EnronEmails), axis=0).values

In [None]:
# Sanity check: shape of the combined sample and its first five entries.
print(raw_data.shape)
print(raw_data[:5])

In [None]:
# Class names, for reference.
Categories = ["spam", "notspam"]
# Labels aligned with `raw_data`: 1 for the first Nsamp rows (spam),
# 0 for the following Nsamp rows (Enron).
header = [1] * Nsamp + [0] * Nsamp

In [None]:
def assemble_bag(data):
    """Build a bag-of-words count table for a collection of token lists.

    Only tokens that occur at least twice across the whole collection become
    columns; columns are ordered by when each token's second occurrence is
    encountered.

    Args:
        data: Sized, re-iterable collection of token sequences (one per
            document). Tokens must be hashable, e.g. strings.

    Returns:
        pandas.DataFrame: Integer counts with one row per document (index
        ``0..len(data)-1``) and one column per retained token.
    """
    seen = set()
    used = set()
    used_tokens = []  # keeps first-qualification order for the columns
    for item in data:
        for token in item:
            if token in seen:
                if token not in used:
                    used.add(token)
                    used_tokens.append(token)
            else:
                seen.add(token)

    column_of = {token: j for j, token in enumerate(used_tokens)}
    # Count into a plain array: incrementing through df.iloc[i][token] is
    # chained assignment, which can write to a temporary copy and leave the
    # frame untouched.
    counts = np.zeros((len(data), len(used_tokens)), dtype=np.int64)
    for i, item in enumerate(data):
        for token in item:
            j = column_of.get(token)
            if j is not None:
                counts[i, j] += 1
    return pd.DataFrame(counts, index=np.arange(len(data)), columns=used_tokens)

In [None]:
# Bag-of-words features for the combined spam / Enron sample.
EnronSpamBag = assemble_bag(raw_data)
# Column labels of the bag, i.e. the tokens used as features.
predictors = list(EnronSpamBag.columns)
EnronSpamBag

In [None]:
def unison_shuffle_data(data, header):
    """Shuffle samples and labels with one shared random permutation.

    Args:
        data: NumPy array indexed by sample along its first axis.
        header: Sequence of labels, one per sample.

    Returns:
        tuple: ``(data, header)`` reordered by the same permutation; the
        labels are returned as a NumPy array.
    """
    order = np.random.permutation(len(header))
    shuffled_labels = np.asarray(header)[order]
    return data[order], shuffled_labels

In [None]:
# Rebuild the unshuffled labels here (Nsamp spam rows followed by Nsamp Enron
# rows, the row order of the bag) instead of reading the `header` variable
# that this cell overwrites. Re-running the cell would otherwise pair
# already-shuffled labels with the unshuffled feature rows.
email_labels = [1] * Nsamp + [0] * Nsamp
data, header = unison_shuffle_data(EnronSpamBag.values, email_labels)
# First 70% of the shuffled rows for training, the remainder for testing.
idx = int(0.7 * data.shape[0])
train_x, test_x = data[:idx], data[idx:]
train_y, test_y = header[:idx], header[idx:]

In [None]:
def fit(train_x, train_y):
    """Train a logistic regression classifier with default settings.

    Args:
        train_x: Training feature matrix, one row per sample.
        train_y: Training labels, one per row of ``train_x``.

    Returns:
        LogisticRegression: The fitted model.

    Raises:
        Exception: Whatever ``LogisticRegression.fit`` raises. Errors are not
            swallowed: doing so would return an unfitted model that only
            fails later, at prediction time, with the real cause lost.
    """
    model = LogisticRegression()
    model.fit(train_x, train_y)
    return model

In [None]:
# Logistic regression baseline on the email training split.
model = fit(train_x, train_y)

In [None]:
# Accuracy of the logistic regression baseline on the held-out email split.
predicted_labels = model.predict(test_x)
acc_score = accuracy_score(test_y, predicted_labels)
acc_score

In [None]:
# Linear SVM baseline on the email split, with wall-clock training time.
# NOTE(review): `gamma` is documented as a coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel="linear" -- confirm.
clf = SVC(C=1, gamma="auto", kernel="linear", probability=False)
start_time = time.time()
clf.fit(train_x, train_y)
end_time = time.time()
print(f"Training complete in {end_time - start_time}s")
# Overwrites the logistic regression predictions and score from the cell above.
predicted_labels = clf.predict(test_x)
acc_score = accuracy_score(test_y, predicted_labels)
acc_score

## IMDB movie reviews dataset

In [None]:
def load_data(path):
    """Load, preprocess and shuffle the reviews stored under ``path``.

    Reads every file in ``path/neg`` (label 0) and ``path/pos`` (label 1),
    runs each text through ``tokenize``, ``stop_word_removal`` and
    ``reg_expressions``, and shuffles reviews and labels together.

    Args:
        path: Directory containing the ``neg`` and ``pos`` sub-folders.

    Returns:
        tuple: ``(data, sentiments)`` where ``data`` is a 1-D object array
        holding one token list per review and ``sentiments`` is the array of
        matching 0/1 labels, both in the same shuffled order.
    """
    data, sentiments = [], []
    for folder, sentiment in (("neg", 0), ("pos", 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            # Explicit encoding so the result does not depend on the platform
            # default (the review files are assumed to be UTF-8).
            with open(os.path.join(folder, name), "r", encoding="utf-8") as reader:
                text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    # Token lists differ in length. np.array(data) raises on such ragged input
    # in recent NumPy versions (and would build a 2-D array if all lengths
    # happened to match), so fill a 1-D object array element by element.
    data_np = np.empty(len(data), dtype=object)
    for i, tokens in enumerate(data):
        data_np[i] = tokens
    data, sentiments = unison_shuffle_data(data_np, sentiments)
    return data, sentiments

In [None]:
# Load the IMDB training split; note that this rebinds `raw_data`, which
# previously held the email sample.
train_path = os.path.join("..", "data", "aclImdb", "train")
raw_data, raw_header = load_data(train_path)

In [None]:
# Sanity check: number of loaded reviews and number of labels should agree.
print(raw_data.shape)
print(len(raw_header))

In [None]:
# Draw 2 * Nsamp distinct review positions and take the matching
# reviews and labels.
n_reviews = len(raw_header)
random_indices = np.random.choice(n_reviews, size=(Nsamp * 2,), replace=False)
data_train = raw_data[random_indices]
header = raw_header[random_indices]

In [None]:
# Check that the class distribution of the subsample is roughly balanced:
# print the distinct labels and how often each occurs (nothing is enforced).
unique_elements, counts_elements = np.unique(header, return_counts=True)
print(unique_elements)
print(counts_elements)

In [None]:
# Bag-of-words features for the sampled reviews; preview the first rows.
MixedBagOfReviews = assemble_bag(data_train)
MixedBagOfReviews.head()

In [None]:
# 70/30 train/test split of the review features and their labels.
data = MixedBagOfReviews.values
idx = int(0.7 * data.shape[0])
train_x, test_x = data[:idx], data[idx:]
train_y, test_y = header[:idx], header[idx:]
# Quick look at the training split.
print(len(train_x))
print(train_x)
print(train_y[:5])
print(len(train_y))

In [None]:
# Logistic regression baseline on the review training split.
model = fit(train_x, train_y)

In [None]:
# Accuracy of the logistic regression baseline on the held-out reviews.
predicted_labels = model.predict(test_x)
acc_score = accuracy_score(test_y, predicted_labels)
acc_score

In [None]:
# Linear SVM baseline on the review split, with wall-clock training time.
# NOTE(review): `gamma` is documented as a coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel="linear" -- confirm.
clf = SVC(C=1, gamma="auto", kernel="linear", probability=False)
start_time = time.time()
clf.fit(train_x, train_y)
end_time = time.time()
print(f"Training complete in {end_time - start_time}s")
# Overwrites the logistic regression predictions and score from the cell above.
predicted_labels = clf.predict(test_x)
acc_score = accuracy_score(test_y, predicted_labels)
acc_score