In [1]:
import email
import re
import os
import stat
import pandas as pd

def read_email(filename):
    '''
    Read Email content.
    :param filename: Path to the file
    :return: Subject and content
    '''
    with open(filename, encoding='latin-1') as f:
        mail = email.message_from_file(f)
        payload = mail.get_payload()
        if type(payload) == type(list()):
            payload = payload[0]
        if type(payload) != type(''):
            payload = str(payload)
            
        subject = mail.get('subject')
        subject_str = str(subject)
        return subject_str + payload

def get_file_id(filename):
    '''
    Get file id.
    :param filename: Path to the file
    :return: The id of the file
    '''
    return int(re.findall(r'\d+', filename)[0])

def clean_html(raw_text):
    return raw_text


def get_label(labels, index):
    '''
    Get emails tag.
    :param labels: Id and prediction
    :return: 1 for ham，0 for spam
    '''
    return labels.Prediction[labels.Id == index].iloc[0]
    
def calc_tf_idf(tf, idf, text, ignore=3):
    '''
    Calculate term frequency and inverse document frequency.
    :param tf: Term frequency
    :param idf: Inverse document frequency
    :return: words count
    '''
    words = re.findall('\w+', text)
    count = 0
    word_set = set()
    for word in words:
        # Filter invalid words
        if len(word) < ignore or len(word) > 20:
            continue
        word = word.lower()
        
        # Calculate inverse document frequency
        if not (word in word_set):
            idf[word] = idf.get(word, 0) + 1
            word_set.add(word)
            
        # Calculate words count
        tf[word] = tf.get(word, 0) + 1
        
        # Calculate total words count
        count = count + 1
    
    return count

def train_model():
    '''
    Read emails and tags.
    :return: All the information we need.
    '''
    pathname = 'TR'
    labels = pd.read_csv('spam-mail.tr.label')
    
    ham_tf = dict()
    spam_tf = dict()
    word_idf = dict()
    ham_word_count = 0
    spam_word_count = 0
    file_count = 0
    spam_file_count = 0
    ham_file_count = 0
    
    # Iterate over emails
    for file in os.listdir(pathname):
        fpath = os.path.join(pathname, file)
        info = os.stat(fpath)
        if stat.S_ISREG(info.st_mode) and file.endswith('.eml'):
            '''
            1. Read content
            2. According to the tags, calculate words count and so on.
            '''
            text = clean_html(read_email(fpath))
            index = get_file_id(file)
            file_count = file_count + 1
            if get_label(labels, index) == 1:
                ham_file_count = ham_file_count + 1
                ham_word_count = ham_word_count + calc_tf_idf(ham_tf, word_idf, text)
            else:
                spam_file_count = spam_file_count + 1
                spam_word_count = spam_word_count + calc_tf_idf(spam_tf, word_idf, text)

    info = {}
    info['ham_word_count'] = ham_word_count
    info['spam_word_count'] = spam_word_count
    info['file_count'] = file_count
    info['ham_file_count'] = ham_file_count
    info['spam_file_count'] = spam_file_count
    print('train email info : ', info)

    # Transpose
    word_df = pd.DataFrame([ham_tf, spam_tf, word_idf]).T
    word_df.columns = ['ham_tf', 'spam_tf', 'word_idf']
    return (word_df, info)

In [2]:
email_df, email_info = train_model()

train email info :  {'ham_word_count': 448008, 'spam_word_count': 351808, 'file_count': 2500, 'ham_file_count': 1721, 'spam_file_count': 779}


In [4]:
# 拷贝数据，可重复运行这段代码
word_df = email_df.copy()
word_df.fillna(1, inplace=True)    

# P(Y=S) : 垃圾邮件的概率
p_y_s = email_info['spam_file_count'] /  email_info['file_count']

# P(Y=H) : 正常邮件的概率
p_y_h = 1 - p_y_s

# P(W|Y=H) : 正常邮件时，出现单词 W 的概率
word_df['ham_tf'] = word_df['ham_tf'] / email_info['ham_word_count']

# P(W|Y=S) : 垃圾邮件时，出现单词 W 的概率
word_df['spam_tf'] = word_df['spam_tf'] / email_info['spam_word_count']

# 根据公式计算 P(Y=S|W)
word_df['spam_sp'] = (word_df['spam_tf'] * p_y_s) / (word_df['ham_tf'] * p_y_h + word_df['spam_tf'] * p_y_s)

# 根据公式计算 P(Y=H|W)
# word_df['spam_hp'] = (word_df['ham_tf'] * p_y_h) / (word_df['ham_tf'] * p_y_h + word_df['spam_tf'] * p_y_s)

# 选择 P(Y=S|W) >= 0.9 的单词作为识别关键词，节省计算
word_df = word_df.loc[(word_df['spam_sp'] >= 0.9)]

# 从大到小排序
word_df = word_df.sort_values(by=['spam_sp'], ascending=[False])

print(word_df)


                 ham_tf   spam_tf  word_idf   spam_sp
hibody         0.000002  0.001168     214.0  0.995797
111n           0.000002  0.001083       8.0  0.995467
0px            0.000004  0.001768     204.0  0.994453
111r           0.000002  0.000881       7.0  0.994435
11px           0.000002  0.000770     107.0  0.993639
...                 ...       ...       ...       ...
dd3960         0.000002  0.000045       2.0  0.902178
118i           0.000002  0.000045       7.0  0.902178
tabdesmond61a  0.000002  0.000045       2.0  0.902178
heat           0.000013  0.000270      23.0  0.901250
farmer         0.000007  0.000134      13.0  0.900304

[531 rows x 4 columns]


In [5]:
def is_spam_email(filename, word_df, info, ignore=3):
    '''
    Test if a given email is spam
    :param filename: Path to email
    :param word_df: word dataframe
    :param info: word info
    '''
    text = clean_html(read_email(filename))
    words = re.findall('[A-Za-z]+', text)
    word_set = set()
    p_s_w = info['spam_file_count'] /  info['file_count']
    p_h_w = 1 - p_s_w
    
    for word in words:
        # Ignore invalid words
        if len(word) < ignore or len(word) > 20:
            continue
            
        word = word.lower()

        # Test if it is spam
        if (word in word_df.index) and not (word in word_set):
            word_set.add(word)
            p_s_w = 1000 * p_s_w * (word_df.loc[word].spam_tf)
            p_h_w = 1000 * p_h_w * (word_df.loc[word].ham_tf)

    # Cannot verify, ham
    if len(word_set) == 0:
        return (False, 0)

    # print('file %s p_s_w : %f, p_h_w %f, word count %d' % (filename, p_s_w, p_h_w, len(word_set)))
    result = p_s_w / (p_s_w + p_h_w)
    if result > 0.9:
        return (True, result)
    return (False, result)

In [14]:
is_spam_email('TR/TRAIN_168.eml', word_df, email_info)

(True, 0.905149443323536)

In [63]:
import email

def read_email2(filename):
    '''
    Read Email content.
    :param filename: Path to the file
    :return: Subject and content
    '''
    with open(filename, encoding='latin-1') as f:
        mail = email.message_from_file(f)
        payload = ''
        for part in mail.walk():
            # Only process email parts in plain text and html
            if part.get_content_type() == 'text/plain' or part.get_content_type() == 'text/html':
                payload += str(part.get_payload())

        subject = mail.get('subject')
        sender = mail.get('from')
        subject_str = str(subject)
        return (subject_str, sender, payload)

for i in range(1, 2500):
    read_email2('TR/TRAIN_{}.eml'.format(i))

```dsl
# This is a comment
if mail.content has `some-word` then
    LS = 0.1
    LN = 0.2

if mail.from eq/== `some-user` then
    LS = 0.0
    LN = 1.0

if mail.subject has `some-word` then
    LS = 0.1
    LN = 0.3

# Extension
if mail.subject match `some-pattern` then
    LS = 0.1
    LN = 0.3
```

dsl -> dataframe

In [64]:
import mail

class EMail(object):
    def __init__(self, subject, sender, receiver, content):
        '''
        :param subject: Subject of the email.
        :param sender: Sender of the email.
        :param receiver: Receiver of the email.
        :param content: Content of the email.
        :returns: Mail object.
        '''
        self.subject = subject
        self.sender = sender
        self.receiver = receiver
        self.content = content

    def from_file(filename):
        '''
        :param filename: Path to the email file.
        :returns: Mail object.
        '''
        with open(filenam, encodng='latin-1') as f:
            mail = email.message_from_file(f)
            content = ''
            for part in mail.walk():
                # Only process email parts in plain text and html
                if part.get_content_type() == 'text/plain' or part.get_content_type() == 'text/html':
                    content += str(part.get_payload())
            subject = mail.get('subject')
            sender = mail.get('from')
            receiver = mail.get('to')
            subject = str(subject)
            return EMail(subject, sender, receiver, content)


ModuleNotFoundError: No module named 'mail'