Dataset preparation

In [1]:
import os
import email_read_util

In [2]:
# Directory that holds the individual email files ('inmail.<n>').
DATA_DIR = 'datasets/trec07p/data/'
# Index file with one label and one email path per line.
LABELS_FILE = 'datasets/trec07p/full/index'
# Fraction of the samples passed to train_test_split as train_size.
TRAINING_SET_RATIO = 0.7

In [3]:
labels = {}
# Read the labels. Every non-empty line of the index is expected to hold two
# whitespace-separated fields: the label and the path of the email file.
# The dict maps the last path component (e.g. 'inmail.1') to 1 when the label
# is 'ham' (case-insensitive) and to 0 for any other label.
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (such as a trailing newline at the end of the file)
            # has no fields; unpacking its split() would raise ValueError.
            continue
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [4]:
def read_email_files():
    """Load the text and label of every email listed in ``labels``.

    Files are visited in numeric order as 'inmail.1' ... 'inmail.N', where N
    is ``len(labels)``; a missing key in ``labels`` raises KeyError.

    Returns:
        A tuple ``(X, y)`` of two parallel lists: the values returned by
        ``email_read_util.extract_email_text`` for each file, and the
        matching entries of ``labels``.
    """
    texts, targets = [], []
    for number in range(1, len(labels) + 1):
        name = 'inmail.' + str(number)
        path = os.path.join(DATA_DIR, name)
        texts.append(email_read_util.extract_email_text(path))
        targets.append(labels[name])
    return texts, targets

In [5]:
from sklearn.model_selection import train_test_split

X, y = read_email_files()

# Split texts, labels and their original positions in one call so that all
# three stay aligned; idx_train / idx_test hold the positions in X of the
# samples that ended up in each subset.
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = train_test_split(
    X, y, range(len(y)),
    train_size=TRAINING_SET_RATIO,
    random_state=2,
)

Feature Extraction, Training, Testing, and Reporting

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [7]:
# Grid search over vectorizer and classifier settings, one search per model.

# The steps listed here are only placeholders: each parameter grid below
# replaces both 'vect' and 'clf' with its own candidates.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words="english")),
    ('clf', MultinomialNB()),
])

# Search space for the Naive Bayes model.
param_grid_nb = dict(
    # Try both vectorizer types, each with English stop-word removal.
    vect=[
        CountVectorizer(stop_words="english"),
        TfidfVectorizer(stop_words="english"),
    ],
    # Maximum number of features the vectorizer keeps.
    vect__max_features=[1000, 5000],
    # Range of n-gram sizes to extract.
    vect__ngram_range=[(1, 1), (1, 2)],
    # Classifier: MultinomialNB with several values of the smoothing
    # parameter alpha.
    clf=[MultinomialNB()],
    clf__alpha=[0.01, 0.1, 1],
)

# Search space for the logistic-regression model.
param_grid_lr = dict(
    # Vectorizer candidates and settings are the same as for the NB model.
    vect=[
        CountVectorizer(stop_words="english"),
        TfidfVectorizer(stop_words="english"),
    ],
    vect__max_features=[1000, 5000],
    vect__ngram_range=[(1, 1), (1, 2)],
    # Classifier: LogisticRegression with at most 1000 iterations and
    # several values of C (inverse regularization strength).
    clf=[LogisticRegression(max_iter=1000)],
    clf__C=[0.1, 1, 10],
)

# One GridSearchCV object per model; cv is the number of cross-validation
# folds.
grid_search_nb = GridSearchCV(
    estimator=pipeline, param_grid=param_grid_nb, cv=3, n_jobs=-1, verbose=1)
grid_search_lr = GridSearchCV(
    estimator=pipeline, param_grid=param_grid_lr, cv=3, n_jobs=-1, verbose=1)

# Train: NB first, then LR.
for search in (grid_search_nb, grid_search_lr):
    search.fit(X_train, y_train)

# Best parameters and cross-validation score found for each model.
print("Best parameters for NB:", grid_search_nb.best_params_)
print("Best cross-validation score for NB: {:.2f}".format(grid_search_nb.best_score_))
print("Best parameters for LR:", grid_search_lr.best_params_)
print("Best cross-validation score for LR: {:.2f}".format(grid_search_lr.best_score_))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters for NB: {'clf': MultinomialNB(), 'clf__alpha': 0.01, 'vect': TfidfVectorizer(stop_words='english'), 'vect__max_features': 5000, 'vect__ngram_range': (1, 1)}
Best cross-validation score for NB: 0.96
Best parameters for LR: {'clf': LogisticRegression(max_iter=1000), 'clf__C': 10, 'vect': TfidfVectorizer(stop_words='english'), 'vect__max_features': 5000, 'vect__ngram_range': (1, 1)}
Best cross-validation score for LR: 0.98


'\nprint("Best parameters for SVM:", grid_search_svm.best_params_)\nprint("Best cross-validation score for SVM: {:.2f}".format(grid_search_svm.best_score_))\n'

In [9]:
# Test: predict on the held-out set with the logistic-regression search.
y_pred = grid_search_lr.predict(X_test)

# Report: labels are 0 and 1, so the names are given in that order
# (0 -> 'Spam', 1 -> 'Ham').
report = classification_report(y_test, y_pred, target_names=['Spam', 'Ham'])
accuracy = accuracy_score(y_test, y_pred)

print(report)
print('Classification accuracy {:.2%}'.format(accuracy))

              precision    recall  f1-score   support

        Spam       0.98      0.99      0.99     15035
         Ham       0.98      0.96      0.97      7591

    accuracy                           0.98     22626
   macro avg       0.98      0.98      0.98     22626
weighted avg       0.98      0.98      0.98     22626

Classification accuracy 98.25%
