In [12]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

import string
import numpy as np
import pandas as pd


# import data

In [18]:
df = pd.read_csv('../raw_data/nlp/emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


# text pre-cleaning

In [22]:
def clean(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    
    tokenized = word_tokenize(lowercased) # Tokenize
    
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    
    stop_words = set(stopwords.words('english')) # Make stopword list
    
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    
    return [" ".join(without_stopwords)]

In [23]:
df['clean_text'] = df['text'].apply(clean) 

In [24]:
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,[subject naturally irresistible corporate iden...
1,Subject: the stock trading gunslinger fanny i...,1,[subject stock trading gunslinger fanny merril...
2,Subject: unbelievable new homes made easy im ...,1,[subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,[subject color printing special request additi...
4,"Subject: do not have money , get software cds ...",1,[subject money get software cds software compa...


# text preprocessing

In [25]:
# lemmatize
lemmatizer = WordNetLemmatizer()

def lemmatizer_text(line):
    lemmatize_line = ''.join(lemmatizer.lemmatize(w) for w in line)
    return lemmatize_line

df['clean_text'] = df['clean_text'].apply(lemmatizer_text)

In [26]:
df[['clean_text']]

Unnamed: 0,clean_text
0,subject naturally irresistible corporate ident...
1,subject stock trading gunslinger fanny merrill...
2,subject unbelievable new homes made easy im wa...
3,subject color printing special request additio...
4,subject money get software cds software compat...
...,...
5723,subject research development charges gpg forwa...
5724,subject receipts visit jim thanks invitation v...
5725,subject enron case study update wow day super ...
5726,subject interest david please call shirley cre...


In [27]:
# vectorize
vectorizer = CountVectorizer()

In [28]:
X = vectorizer.fit_transform(df.clean_text)
y = df.spam

# build model

In [29]:
# multinomialNB
model = MultinomialNB()
model.fit(X,y)
model.score(X,y)

0.9965083798882681