In [1]:
import string
import email
import nltk

nltk.download('stopwords')  # ikuya

punctuations = list(string.punctuation)
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
## Utility functions to browse a list of words
# ikuya
def browse(L, name='(list)', num_samples=5):
    n = num_samples if len(L)>=num_samples else len(L)
    print('%s: len=%d, samples: %s' % (name, len(L), ' '.join(L[:n])))

In [3]:
browse(punctuations, 'punctuations')
browse(list(stopwords), 'stopwords')

punctuations: len=32, samples: ! " # $ %
stopwords: len=179, samples: my until of in hasn


In [4]:
## functions to load and parse emails

# Combine the different parts of the email into a flat list of strings
def flatten_to_string(parts):
    ret = []
    if type(parts) == str:
        ret.append(parts)
    elif type(parts) == list:
        for part in parts:
            ret += flatten_to_string(part)
    elif parts.get_content_type == 'text/plain':
        ret += parts.get_payload()
    return ret

# Extract subject and body text from a single email file
def extract_email_text(path):
    # Load a single email from an input file
    with open(path, errors='ignore') as f:
        msg = email.message_from_file(f)
    if not msg:
        return ""
    
    # Read the email subject
    subject = msg['Subject']
    if not subject:
        subject = ""
    
    # Read the email body
    body = ' '.join(m for m in flatten_to_string(msg.get_payload())
                    if type(m) == str)
    if not body:
        body = ""
    
    return subject + ' ' + body

# Process a single email file into stemmed tokens
PUNCTS = "".join(punctuations)
def load(path):
    email_text = extract_email_text(path)
    if not email_text:
        return []
    
    # Tokenize the message
    tokens = nltk.word_tokenize(email_text)
    
    # Remove punctuation from tokens
    tokens = [i.strip(PUNCTS) for i in tokens if i not in punctuations]
    
    # Remove stopwords and stem tokens
    if len(tokens) > 2:
        return [stemmer.stem(w) for w in tokens if w not in stopwords]
    return []


In [5]:
## Load dataset

import os

DATA_DIR = 'datasets/trec07p/data/'
LABELS_FILE = 'datasets/trec07p/full/index'
TRAINING_SET_RATIO = 0.7

nltk.download('punkt')  # ikuya

labels = {}

# Read the labels
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower()=='ham' else 0

# Split corpus into training and test sets
#filelist = os.listdir(DATA_DIR)
#num_train = int(len(filelist) * TRAINING_SET_RATIO)
#X_train = filelist[:num_train]
#X_test  = filelist[num_train:]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def read_email_files():
    X = []
    y = []
    for i in range(len(labels)):
        filename = 'inmail.' + str(i+1)
        email_str = extract_email_text(os.path.join(DATA_DIR, filename))
        X.append(email_str)
        y.append(labels[filename])
    return X, y

from sklearn.model_selection import train_test_split

X, y = read_email_files()

X_train, X_test, y_train, y_test, idx_train, idx_test = \
    train_test_split(X, y, range(len(y)),
                     train_size=TRAINING_SET_RATIO, random_state=2)




In [8]:
print("#(X_train, X_test, y_train, y_test, idx_train, idx_test) =",
      (len(X_train), len(X_test), len(y_train), len(y_test), len(idx_train), len(idx_test)))

#(X_train, X_test, y_train, y_test, idx_train, idx_test) = (52793, 22626, 52793, 22626, 52793, 22626)


In [14]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

Wall time: 11.4 s


In [15]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

Wall time: 11.9 s


TF-IDF

単語 $t_i$ の文書 $d_j$ における単語の重要度 $tfidf_{i,j}= tf_{i,j} \cdot idf_i$

$tf_{i,j}$ は単語 $t_i$ の出現頻度 (term frequency) で $tf_{i, j} = n_{i,j} / \Sigma_k{ n_{k,j} }$、ただし $n_{i,j}$ は文書 $d_j$ における単語 $t_i$ の出現回数。

$idf_i$ は単語 $t_i$ の逆文書頻度 (inverse document frequency) で、全文書に登場する語では 0 に、1/e (≒ 37%) の文書に登場する語では 1 に、という値をとる単調増加関数。多くの文書に登場する語のスコアを下げる役割をもつ。 $idf_i = log{ \frac{|D|}{|\{d:d∋t_i\}|} }$

In [16]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize the classifier and make label predictions
mnb = MultinomialNB()
mnb.fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# Print results
print('Accuracy {:.3f}'.format(accuracy_score(y_test, y_pred)))


Accuracy 0.973
Wall time: 59.8 ms


- CountVectorizer => Accuracy 0.955, Wall time: 77.8 ms
- TfidfVectorizer => Accuracy 0.973, Wall time: 59.8 ms