In [84]:
# Importing dependencies

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Source: https://github.com/imdeepmind/TextPreprocessingScript/blob/master/preprocess.py
class Preprocess:
    """Text-cleaning helpers built on ``re`` and NLTK.

    The methods are independent of each other and keep no instance state:
    tokenizing, stop-word removal, contraction expansion, character
    stripping, POS tagging and lemmatization.
    """

    # Ordered (pattern, replacement) pairs for whole contractions.
    # Applied by ``normalize`` after lower-casing, so patterns are lower-case.
    _CONTRACTIONS = (
        (r"i'm", "i am"),
        (r"aren't", "are not"),
        (r"couldn't", "could not"),
        (r"didn't", "did not"),
        (r"doesn't", "does not"),
        (r"don't", "do not"),
        (r"hadn't", "had not"),
        (r"hasn't", "has not"),
        (r"haven't", "have not"),
        (r"isn't", "is not"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"mightn't", "might not"),
        (r"mustn't", "must not"),
        (r"needn't", "need not"),
        (r"shouldn't", "should not"),
        (r"wasn't", "was not"),
        (r"weren't", "were not"),
        (r"wouldn't", "would not"),
    )

    # Generic apostrophe suffixes, applied after the whole-word rules above
    # so that e.g. "it's" becomes "it is".
    _SUFFIXES = (
        (r"\'s", " is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
    )

    def sentence_tokenizer(self, doc):
        """Split ``doc`` into sentences using NLTK's ``sent_tokenize``."""
        return sent_tokenize(doc)

    def word_tokenizer(self, sentence):
        """Split ``sentence`` into word tokens using NLTK's ``word_tokenize``."""
        return word_tokenize(sentence)

    def remove_stop_words(self, tokenized_sentences):
        """Return the tokens that are not in NLTK's English stop-word list.

        Args:
            tokenized_sentences: iterable of string tokens.

        Returns:
            list of the tokens, in order, that are not stop words. The
            comparison is exact, so case matters.
        """
        stop_words = set(stopwords.words('english'))
        return [t for t in tokenized_sentences if t not in stop_words]

    def normalize(self, doc):
        """Lower-case ``doc`` and expand common English contractions.

        Whole contractions ("couldn't" -> "could not") are rewritten first,
        then the generic suffixes ('s, 'll, 've, 're, 'd). Only the ASCII
        apostrophe is recognised.

        Args:
            doc: the text to normalize (str).

        Returns:
            The lower-cased text with contractions expanded.
        """
        doc = doc.lower()

        for pattern, replacement in self._CONTRACTIONS:
            doc = re.sub(pattern, replacement, doc)

        for pattern, replacement in self._SUFFIXES:
            doc = re.sub(pattern, replacement, doc)

        return doc

    def remove_unchars(self, doc):
        """Strip URLs, @handles, non-alphanumeric characters and bare numbers.

        Three substitutions run in order:
          1. URL-like spans (http(s)://, www., or domain/ prefixes) are removed.
          2. "@" followed by alphanumerics, every character other than
             0-9, A-Z, a-z, space or tab (this includes newlines), and any
             remaining scheme://... span are removed.
          3. Stand-alone digit runs and the whitespace after them are removed.

        Args:
            doc: the text to clean (str).

        Returns:
            The cleaned text.
        """
        doc = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', doc, flags=re.MULTILINE)
        doc = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', '', doc)
        doc = re.sub(r'\b[0-9]+\b\s*', '', doc)

        return doc

    def get_pos(self, sentence):
        """Tag each word and map its tag to a WordNet POS constant.

        Every word is tagged on its own via ``nltk.pos_tag([word])``.

        Args:
            sentence: iterable of word strings.

        Returns:
            list with one tuple per input word: ``(word, wordnet_pos)`` when
            the tag starts with J, V, N or R (adjective, verb, noun, adverb),
            otherwise ``('', '')`` - the word itself is not returned in that
            case.
        """
        pos = []
        for word in sentence:
            w, p = nltk.pos_tag([word])[0]
            if p.startswith('J'):
                pos.append((w, wordnet.ADJ))
            elif p.startswith('V'):
                pos.append((w, wordnet.VERB))
            elif p.startswith('N'):
                pos.append((w, wordnet.NOUN))
            elif p.startswith('R'):
                pos.append((w, wordnet.ADV))
            else:
                pos.append(('', ''))

        return pos

    def lemmatizer(self, words):
        """Lemmatize each word with ``WordNetLemmatizer``.

        The POS from ``get_pos`` is passed to the lemmatizer when one was
        found; otherwise the lemmatizer's default POS is used.

        Args:
            words: iterable of word strings.

        Returns:
            list of lemmatized words, in input order.
        """
        lemmatizer = WordNetLemmatizer()
        lemmatized_sentence = []

        for word in words:
            _, p = self.get_pos([word])[0]
            if p != '':
                w = lemmatizer.lemmatize(word, pos=p)
            else:
                w = lemmatizer.lemmatize(word)
            lemmatized_sentence.append(w)

        return lemmatized_sentence

In [3]:
# Load data/processed_data1.csv into the `data` DataFrame.
data = pd.read_csv('data/processed_data1.csv')

In [4]:
# Preview the first rows of the loaded frame, before any cleaning.
data.head()

Unnamed: 0,label,subject,email_to,email_from,message
0,1,"Generic Cialis, branded quality@",the00@speedy.uwaterloo.ca,"""Tomas Jacobs"" <RickyAmes@aol.com>",Content-Type: text/html;\nContent-Transfer-Enc...
1,0,Typo in /debian/README,debian-mirrors@lists.debian.org,Yan Morin <yan.morin@savoirfairelinux.com>,"Hi, i've just updated from the gulus and I che..."
2,1,authentic viagra,<the00@plg.uwaterloo.ca>,"""Sheila Crenshaw"" <7stocknews@tractionmarketin...","Content-Type: text/plain;\n\tcharset=""iso-8859..."
3,1,Nice talking with ya,opt4@speedy.uwaterloo.ca,"""Stormy Dempsey"" <vqucsmdfgvsg@ruraltek.com>","\nHey Billy, \n\nit was really fun going out t..."
4,1,or trembling; stomach cramps; trouble in sleep...,ktwarwic@speedy.uwaterloo.ca,"""Christi T. Jernigan"" <dcube@totalink.net>",Content-Type: multipart/alternative;\n ...


In [82]:
# Module-level Preprocess instance shared by clean_subject and clean_messages.
preprocess = Preprocess()

def remove_html(message):
    """Return the visible text of ``message`` with HTML markup removed.

    The contents of <script>, <style>, <head>, <meta> and <noscript> tags
    are dropped entirely; the remaining text fragments are stripped of
    surrounding whitespace and joined with single spaces.

    Args:
        message: raw message body (str), HTML or plain text.

    Returns:
        The extracted text as a single string.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results can differ between machines and a
    # warning is emitted.
    soup = BeautifulSoup(message, 'html.parser')
    for tag in soup(['script', 'style', 'head', 'meta', 'noscript']):
        tag.decompose()
    return ' '.join(soup.stripped_strings)

def clean_subject(subject):
    """Normalize a subject line and strip unwanted characters.

    Args:
        subject: the raw subject value; anything that is not a ``str``
            (e.g. a missing value) is rejected.

    Returns:
        The cleaned subject, or ``None`` when ``subject`` is not a string.
    """
    if not isinstance(subject, str):
        return None

    normalized = preprocess.normalize(subject)
    return preprocess.remove_unchars(normalized)

def get_emails(text):
    """Extract the first email-like token from ``text``.

    Args:
        text: a header value such as '"Name" <user@host>'; anything that is
            not a ``str`` is rejected.

    Returns:
        The first substring matching ``[\\w.-]+@[\\w.-]+``, or ``None`` when
        there is no match or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    found = re.search(r'[\w\.-]+@[\w\.-]+', text)
    if found is None:
        return None
    return found.group(0)

def clean_messages(message):
    """Strip HTML from a message body, then normalize and clean its text.

    Args:
        message: the raw message body; anything that is not a ``str``
            (e.g. a missing value) is rejected.

    Returns:
        The cleaned message text, or ``None`` when ``message`` is not a
        string.
    """
    if not isinstance(message, str):
        return None

    # Order matters: markup is removed first so tag names and attributes
    # never reach the character-stripping step.
    text = remove_html(message)
    text = preprocess.normalize(text)
    return preprocess.remove_unchars(text)

In [86]:
# Set up tqdm's pandas integration; `progress_apply` is used on `data` below.
tqdm.pandas()

def process_df(df):
    """Clean the text fields of one row.

    Intended to be called per row (``axis=1``), so despite its name ``df``
    is a single row, indexed by column label.

    Args:
        df: one row with "subject", "email_to", "email_from" and "message"
            entries.

    Returns:
        A copy of the row with those four entries cleaned. An entry becomes
        ``None`` when its value was not a string (or, for the email fields,
        contained no address).
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object passed to them by ``apply``.
    row = df.copy()

    row["subject"] = clean_subject(row["subject"])
    row["email_to"] = get_emails(row["email_to"])
    row["email_from"] = get_emails(row["email_from"])
    row["message"] = clean_messages(row["message"])

    return row

# Run process_df once per row (axis=1) with a progress bar, and rebind
# `data` to the cleaned result.
data = data.progress_apply(process_df, axis=1)




" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
100%|██████████| 75419/75419 [06:29<00:00, 193.79it/s]


In [87]:
# Preview the first rows after cleaning.
data.head()

Unnamed: 0,label,subject,email_to,email_from,message
0,1,generic cialis branded quality,the00@speedy.uwaterloo.ca,RickyAmes@aol.com,contenttype texthtmlcontenttransferencoding 7b...
1,0,typo in debianreadme,debian-mirrors@lists.debian.org,yan.morin@savoirfairelinux.com,hi i have just updated from the gulus and i ch...
2,1,authentic viagra,the00@plg.uwaterloo.ca,7stocknews@tractionmarketing.com,contenttype textplain\tcharsetiso88591contentt...
3,1,nice talking with ya,opt4@speedy.uwaterloo.ca,vqucsmdfgvsg@ruraltek.com,hey billy it was really fun going out the othe...
4,1,or trembling stomach cramps trouble in sleepin...,ktwarwic@speedy.uwaterloo.ca,dcube@totalink.net,contenttype multipartalternative bounda...


In [88]:
# (rows, columns) before rows with missing values are dropped.
data.shape

(75419, 5)

In [90]:
# Drop every row that has a missing value in any column (the cleaning
# helpers return None for non-string input). Rebinding instead of
# `inplace=True` avoids mutating the existing frame object.
data = data.dropna()

In [91]:
# (rows, columns) after rows with missing values were dropped.
data.shape

(73897, 5)

In [92]:
# Write the cleaned frame to data/processed_data2.csv.
# NOTE(review): to_csv also writes the row index as an unnamed first column
# by default - confirm downstream readers expect that (otherwise pass
# index=False).
data.to_csv("data/processed_data2.csv")