In [15]:
import os
import re
import numpy as np
import pandas as pd
import tarfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
downloads = os.path.join(os.environ['HOME'] + "/Downloads")
enron_dir = os.path.join(downloads, 'Enron emails')
enron_files = ['enron1.tar.gz', 'enron2.tar.gz', 'enron3.tar.gz',
               'enron4.tar.gz', 'enron5.tar.gz', 'enron6.tar.gz']

def extract_emails(fname):
    rows = []
    tfile = tarfile.open(fname, encoding="latin-1", errors='ignore')
    for member in tfile.getmembers():
        if 'ham' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'ham'})
        if 'spam' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'spam'})
    tfile.close()
    return pd.DataFrame(rows)
data = pd.DataFrame({'message': [], 'class': []})
for file in enron_files:
    unzipped_file = extract_emails(os.path.join(enron_dir, file))
    data = data.append(unzipped_file)

In [3]:
data['message'] = data['message'].apply(lambda x: x.decode('latin-1'))
data['class'] = data['class'].map({'spam':1, 'ham':0})

In [4]:
data.head()

Unnamed: 0,class,message
0,0,Subject: christmas tree farm pictures\r\n
1,0,"Subject: vastar resources , inc .\r\ngary , pr..."
2,0,Subject: calpine daily gas nomination\r\n- cal...
3,0,Subject: re : issue\r\nfyi - see note below - ...
4,0,Subject: meter 7268 nov allocation\r\nfyi .\r\...


In [5]:
def cleanSentences(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().replace("<br />", " ")
    text = text.split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

data['message'] = data['message'].apply(cleanSentences)

In [6]:
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

bow_transformer = CountVectorizer(analyzer=text_process).fit(data['message'])
print(len(bow_transformer.vocabulary_))

132267


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['class'], test_size=0.2, random_state=42)

In [17]:
messages_bow = bow_transformer.transform(X_train)
tfidf_transformer=TfidfTransformer().fit(messages_bow)
messages_tfidf=tfidf_transformer.transform(messages_bow)

messages_bow_test = bow_transformer.transform(X_test)
messages_tfidf_test=tfidf_transformer.transform(messages_bow_test)


spam_detect_model = MultinomialNB().fit(messages_tfidf,y_train)

y_pred = spam_detect_model.predict(messages_tfidf_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('Accuracy', accuracy_score(y_test.values, y_pred, normalize=True))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3276
           1       0.99      0.98      0.98      3468

   micro avg       0.98      0.98      0.98      6744
   macro avg       0.98      0.98      0.98      6744
weighted avg       0.98      0.98      0.98      6744

[[3230   46]
 [  73 3395]]
('Accuracy', 0.9823546856465006)


In [9]:

spam_detect_model2 = SVC( gamma = 1,kernel = 'sigmoid').fit(messages_tfidf,y_train)


y_pred = spam_detect_model2.predict(messages_tfidf_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('Accuracy', accuracy_score(y_test.values, y_pred, normalize=True))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3276
           1       0.99      1.00      0.99      3468

   micro avg       0.99      0.99      0.99      6744
   macro avg       0.99      0.99      0.99      6744
weighted avg       0.99      0.99      0.99      6744

[[3225   51]
 [  17 3451]]
('Accuracy', 0.9899169632265717)


In [16]:

spam_detect_model3 = LogisticRegression().fit(messages_tfidf,y_train)

y_pred = spam_detect_model3.predict(messages_tfidf_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('Accuracy', accuracy_score(y_test.values, y_pred, normalize=True))

 



              precision    recall  f1-score   support

           0       1.00      0.97      0.99      3276
           1       0.98      1.00      0.99      3468

   micro avg       0.99      0.99      0.99      6744
   macro avg       0.99      0.99      0.99      6744
weighted avg       0.99      0.99      0.99      6744

[[3194   82]
 [  14 3454]]
('Accuracy', 0.9857651245551602)


In [13]:
spam_detect_model4 = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(messages_tfidf,y_train)

y_pred = spam_detect_model4.predict(messages_tfidf_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('Accuracy', accuracy_score(y_test.values, y_pred, normalize=True))


              precision    recall  f1-score   support

           0       1.00      0.71      0.83      3276
           1       0.78      1.00      0.88      3468

   micro avg       0.86      0.86      0.86      6744
   macro avg       0.89      0.85      0.85      6744
weighted avg       0.89      0.86      0.85      6744

[[2329  947]
 [  11 3457]]
('Accuracy', 0.8579478054567022)


In [14]:
spam_detect_model5 = DecisionTreeClassifier(criterion = 'entropy').fit(messages_tfidf,y_train)

y_pred = spam_detect_model5.predict(messages_tfidf_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('Accuracy', accuracy_score(y_test.values, y_pred, normalize=True))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      3276
           1       0.95      0.97      0.96      3468

   micro avg       0.96      0.96      0.96      6744
   macro avg       0.96      0.96      0.96      6744
weighted avg       0.96      0.96      0.96      6744

[[3106  170]
 [ 105 3363]]
('Accuracy', 0.9592230130486358)


In [None]:
def find_classifier_grid(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    gamma=[1,0.1,0.001,0.0001]
    kernel=['linear','rbf','poly','sigmoid']
    acc_l = []
    for i in range(len(gamma)):
        for k in range(len(kernel)):
            clf = SVC(kernel=kernel[k],gamma = gamma[i]).fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc_l.append([accuracy_score(y_test, y_pred),gamma[i],kernel[k]])
    return("Accuracy, gamma, kernel:",max(acc_l))

find_classifier_grid(messages_tfidf, y_train.values)

  ## Explanation of the algorithm selection
   For binary text classification I used 5 algorithms - Naive Bayes, SVM, Logistic Regresion, Random Fores and Decision Tree classifier. As can be seen from the tables above the best result was shown by the following algorithms:  Naive Bayes, SVM, Logistic Regresion. Prediction accuracy is almost equal. 
   If we need an algorithm with better accuracy despite the time of prediction - chose SVM
If time of prediction is also important - chose Logistic Regression