## Download Spam Mail Dataset


In [None]:
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz --output enron1.tar.gz
! tar -xf enron1.tar.gz enron1 # x: extract files from the archive / f: name of the archive file to read

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0  1139k      0  0:00:01  0:00:01 --:--:-- 1139k


In [None]:
! ls enron1/ham/*.txt | wc -l # count the non-spam (ham) email files

3672


In [None]:
! ls enron1/spam/*.txt | wc -l # count the spam email files

1500


In [None]:
! cat enron1/ham/0007.1999-12-14.farmer.ham.txt # show one example of a non-spam (ham) email

Subject: mcmullen gas for 11 / 99
jackie ,
since the inlet to 3 river plant is shut in on 10 / 19 / 99 ( the last day of
flow ) :
at what meter is the mcmullen gas being diverted to ?
at what meter is hpl buying the residue gas ? ( this is the gas from teco ,
vastar , vintage , tejones , and swift )
i still see active deals at meter 3405 in path manager for teco , vastar ,
vintage , tejones , and swift
i also see gas scheduled in pops at meter 3404 and 3405 .
please advice . we need to resolve this as soon as possible so settlement
can send out payments .
thanks

In [None]:
! cat enron1/spam/0006.2003-12-18.GP.spam.txt # show one example of a spam email

Subject: dobmeos with hgh my energy level has gone up ! stukm
introducing
doctor - formulated
hgh
human growth hormone - also called hgh
is referred to in medical science as the master hormone . it is very plentiful
when we are young , but near the age of twenty - one our bodies begin to produce
less of it . by the time we are forty nearly everyone is deficient in hgh ,
and at eighty our production has normally diminished at least 90 - 95 % .
advantages of hgh :
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
read
more

## Download Packages

In [None]:
! pip3 install nltk
# Download only the NLTK data this notebook actually uses instead of the whole
# 'all' collection, which takes a long time to fetch:
#   names   -> name list used to filter out person names
#   wordnet -> dictionary behind WordNetLemmatizer
#   omw-1.4 -> companion data that some NLTK releases need to load WordNet
! python -m nltk.downloader names wordnet omw-1.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | 

## Load Dataset

In [None]:
import glob, os

# Containers for the corpus.
#   emails: raw text of each email
#   labels: class of the email stored at the same index
#     - spam: 1
#     - ham: 0
emails = []
labels = []

In [None]:
# Load the corpus: spam emails are labelled 1, ham emails are labelled 0.
def read_email_dir(dir_path, encoding='ISO-8859-1'):
    """Return the text of every ``*.txt`` file directly inside ``dir_path``.

    Parameters
    ----------
    dir_path : str
        Directory that holds one email per ``.txt`` file.
    encoding : str
        Text encoding used to decode the files.  [!important] check encoding format

    Returns
    -------
    list of str
        File contents, ordered by sorted file path. Empty when nothing matches.
    """
    texts = []
    # glob.glob() returns matches in arbitrary, filesystem-dependent order.
    # Sorting keeps the row order stable, so the seeded train/test split
    # picks the same emails on every machine and every run.
    for fname in sorted(glob.glob(os.path.join(dir_path, '*.txt'))):
        with open(fname, 'r', encoding=encoding) as f:
            texts.append(f.read())
    return texts


spam_emails = read_email_dir('enron1/spam')
ham_emails = read_email_dir('enron1/ham')

# Rebuild both lists from scratch (instead of appending to existing ones) so
# that re-running this cell does not duplicate the corpus.
emails = spam_emails + ham_emails
labels = [1] * len(spam_emails) + [0] * len(ham_emails)

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# of emails = 5172
# of labels = 5172


## Data Preprocessing
  - remove numbers and punctuation
  - remove named entities (person names)
  - remove stop words (applied later by `CountVectorizer(stop_words='english')`)
  - lemmatization

In [None]:
# Filter used to drop numbers and punctuation: only purely alphabetic tokens pass.
def letters_only(word):
    """Return True when ``word`` is non-empty and consists only of letters."""
    is_alphabetic = word.isalpha()
    return is_alphabetic

# remove name entity
from nltk.corpus import names
all_names = set(names.words())
# clean_text() lowercases every token before the membership test, while the
# NLTK name list is stored capitalised (e.g. "Jackie"), so the comparison has
# to be made against a lowercased copy or no name is ever filtered out.
# NOTE(review): this presumably also drops ordinary words that double as first
# names; confirm that trade-off is acceptable for the model.
all_names_lower = {name.lower() for name in all_names}

# lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# put all together to clean texts
def clean_text(doc):
    """Normalise one email into a space-separated string of lemmatised words.

    A token is kept only if, after lowercasing, it is purely alphabetic
    (``letters_only``), is not in the lowercased name list, and is longer than
    two characters. Kept tokens are lemmatised with ``lemmatizer``.

    Parameters
    ----------
    doc : str
        Raw email text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    cleaned_doc = []
    # split() with no argument breaks on any whitespace run. Splitting on ' '
    # only would leave newlines inside tokens ("of\nflow"), which then fail
    # isalpha() and silently discard the words around every line break.
    for word in doc.split():
        word = word.lower() # ABD -> abd
        if letters_only(word) and word not in all_names_lower and len(word) > 2:
            cleaned_doc.append(lemmatizer.lemmatize(word))

    return ' '.join(cleaned_doc)

cleaned_emails = [clean_text(doc) for doc in emails]

In [None]:
# Sanity check: display the first email after cleaning.
cleaned_emails[0]

'via gra pro will get you hard you already saw there new better via the market called via gra pro and significally beter and ha better influence never imagined enter now expreience more alien dimension should complex the city itself they rebuild stuff after while hank warren beatty annette bening and dustin hoffman all turned out feinstein for pal carole bayer sager new york daily news site used this site online class assignment very helpful can wait start teaching and have more time browse kung mababaw lang tingin pagkakaibigan natin sasabihin friend siya hindi kita friend pero hindi just blog hopping wish you very lovely happy valentine day have fun take care xoxo celly note nbsp link are the webpage where these track are available nbsp nbsp audio file are linked directly are hosted r feed just passed along news critical update for window when clicked the link took this page professor milgram work established the small world idea ha been tested few time and there are sign similar int

## Data Preparation
* Split the data into train and test sets, then convert each email into bag-of-words term counts.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Hold out 33% of the cleaned emails for testing; the fixed seed makes the
# shuffle repeatable for a given input order.
X_train, X_test, Y_train, Y_test = train_test_split(
    cleaned_emails,
    labels,
    test_size=0.33,
    random_state=1213,
)

# Bag-of-words counts limited to 500 terms, with English stop words removed.
# The vocabulary is learned from the training emails only and then reused,
# unchanged, to encode the test emails.
cv = CountVectorizer(max_features=500, stop_words='english')
term_docs_train = cv.fit_transform(X_train)
term_docs_test = cv.transform(X_test)

## Import Model and Train

* references: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training term counts
# (fit() returns the estimator itself), then label the held-out emails.
gnb = MultinomialNB().fit(term_docs_train, Y_train)
y_pred = gnb.predict(term_docs_test)

## Simple Evaluation

In [None]:
from sklearn.metrics import accuracy_score
# Share of test emails whose predicted label equals the true label.
acc = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f"Accuracy of the model is: {acc:.2f}")

Accuracy of the model is: 0.90


## 실습과제

* data preprocessing부터 model training까지 직접 코드를 작성해보세요.
* 이 과정에서, 다양한 parameter를 튜닝하며 실행시켜보세요. (e.g. ratio of train/test dataset, CountVectorizer의 max_features, etc.)

In [None]:
# Your code here
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics

def Naive_Bayes(test_size, max_features, random_state=1213, verbose=True):
    """Train and evaluate a multinomial Naive Bayes spam classifier.

    The corpus is taken from the notebook-level ``cleaned_emails`` and
    ``labels`` lists. It is split into train/test parts, encoded as term
    counts with ``CountVectorizer`` (English stop words removed), and used to
    fit and score a ``MultinomialNB`` model.

    Parameters
    ----------
    test_size : float
        Passed to ``train_test_split`` as the test-set size.
    max_features : int
        Maximum vocabulary size passed to ``CountVectorizer``.
    random_state : int, default 1213
        Seed passed to ``train_test_split``.
    verbose : bool, default True
        When True, print the accuracy and the per-class classification report.

    Returns
    -------
    float
        Accuracy of the fitted model on the test split.
    """
    # Imported here so the function does not rely on the import statements of
    # the earlier training/evaluation cells having been executed.
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.naive_bayes import MultinomialNB

    X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails,
                                                        labels,
                                                        test_size=test_size,
                                                        random_state=random_state)

    cv = CountVectorizer(stop_words='english', max_features=max_features)
    term_docs_train = cv.fit_transform(X_train)  # vocabulary learned from the training set only
    term_docs_test = cv.transform(X_test)        # test set encoded with that same vocabulary

    model = MultinomialNB()
    model.fit(term_docs_train, Y_train)
    y_pred = model.predict(term_docs_test)

    acc = accuracy_score(Y_test, y_pred)
    if verbose:
        print("Accuracy of the model is: {:.2f}".format(acc))
        print(classification_report(Y_test, y_pred))

    return acc

In [None]:
import numpy as np

test_size = [0.20, 0.25, 0.30, 0.35, 0.40, 0.50]
max_features = [300, 500, 800, 1000, 2000, 3000]

# One (test ratio, max_features, accuracy) row per combination, evaluated in
# the same order as a nested loop over test_size then max_features. Collecting
# the rows in a list and converting once avoids the dummy first row and the
# repeated array concatenation.
records = [
    (size, features, Naive_Bayes(size, features))
    for size in test_size
    for features in max_features
]
results = np.array(records)

# argmax returns the first row with the highest accuracy.
# NOTE(review): the winner is chosen on test-set accuracy, and the test sets
# differ in size between rows, so treat the reported score as optimistic.
best = results[results[:, 2].argmax(), :]

# max_features is stored as a float inside the array; print it as the integer it is.
print(f'Best test ratio : {best[0]}\nBest max_features : {int(best[1])}\nwith accuracy : {best[2]}')


Accuracy of the model is: 0.89
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       751
           1       0.78      0.80      0.79       284

    accuracy                           0.89      1035
   macro avg       0.85      0.86      0.86      1035
weighted avg       0.89      0.89      0.89      1035

Accuracy of the model is: 0.90
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       751
           1       0.80      0.84      0.82       284

    accuracy                           0.90      1035
   macro avg       0.87      0.88      0.87      1035
weighted avg       0.90      0.90      0.90      1035

Accuracy of the model is: 0.91
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       751
           1       0.81      0.87      0.84       284

    accuracy                           0.91      1035
   macro avg       0.88      0.90  