In [110]:
import mailbox
import re
import pprint as pp
from bs4 import BeautifulSoup
import pandas as pd
import csv
from textblob import TextBlob
import textblob
import time
import numpy as np
from collections import defaultdict            
from tqdm import tqdm
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.classifiers import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import email
import base64

In [150]:
from abc import ABCMeta, abstractmethod


class FeatureFinder:
    __metaclass__ = ABCMeta

    @abstractmethod
    def getFeatureTitle(self):
        pass

    @abstractmethod
    def getFeature(self, message):
        pass


class HTMLFormFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Html Form"

    def getFeature(self, message):
        import re
        super(HTMLFormFinder, self).getFeature(message)
        payload = getpayload(message).lower()
        return re.compile(r'<\s?\/?\s?form\s?>', re.IGNORECASE).search(payload) != None


class IFrameFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Html iFrame"

    def getFeature(self, message):
        import re
        super(IFrameFinder, self).getFeature(message)
        payload = getpayload(message).lower()
        return re.compile(r'<\s?\/?\s?iframe\s?>', re.IGNORECASE).search(payload) != None


class FlashFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Flash content"

    def getFeature(self, message):
        import re
        super(FlashFinder, self).getFeature(message)
        payload = getpayload(message).lower()

        swflinks = re.compile(FLASH_LINKED_CONTENT, re.IGNORECASE).findall(payload)
        flashObject = re.compile(r'embed\s*src\s*=\s*\".*\.swf\"', re.IGNORECASE).search(payload);
        return (swflinks != None and len(swflinks) > 0) or \
               (flashObject != None)


class AttachmentFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Attachments"

    def getFeature(self, message):
        return getAttachmentCount(message)


class HTMLContentFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "HTML content"

    def getFeature(self, message):
        return ishtml(message)


class URLsFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "URLs"

    def getFeature(self, message):
        return len(geturls_payload(message))


class ExternalResourcesFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "External Resources"

    def getFeature(self, message):
        return len(getexternalresources(message))


class JavascriptFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Javascript"

    def getFeature(self, message):
        return len(getjavascriptusage(message))


class CssFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Css"

    def getFeature(self, message):
        return len(getcssusage(message))


class IPsInURLs(FeatureFinder):
    def getFeatureTitle(self):
        return "IPs in URLs"

    def getFeature(self, message):
        return len(getIPHrefs(message)) > 0

# isaac
class DotsInURLs(FeatureFinder):
    def getFeatureTitle(self):
        return "Dots in URLs"

    def getFeature(self, message):
        result_dot_url = []
        for url in geturls_payload(message):
            domain = re.search('(?<=//)[^/]+', url)
#             print(url, " = ", domain.group(0))
            if domain != None:
                url_dot_count = domain.group(0).count(".")
#                 print("URL DOTS: ", url_dot_count, url)
                result_dot_url.append(url_dot_count)
#         print(result_dot_url)
#         if len(result_dot_url) == 0:
#             print("NO DOTS FOUND")
        result = np.average(result_dot_url)
        if np.isnan(result):
            result = 0.0
        return result

class LinksMatchUrls(FeatureFinder):
    def getFeatureTitle(self):
        return "LinksMatchURLs"

    def getFeature(self, message):
        payload = getpayload_dict(message)
        print("LINKS MATCH URLS")
        result = []
        for part in payload:
            soup = BeautifulSoup(part["payload"], "html.parser")
            print("SOUP: ", soup)
            if soup.find():
                print("FOUND SOUP")
                all_a_tags = soup.find_all('a')
                for a_tag in all_a_tags:
                    print(a_tag)
                    print(a_tag.text, " COMPARED ", a_tag.get('href'))
                    result.append(a_tag.text == a_tag.get('href'))
            
        
        return result
    

    
# class HTTPinURLs(FeatureFinder):
    

    
class AtInURLs(FeatureFinder):
    def getFeatureTitle(self):
        return "@ in URLs"

    def getFeature(self, message):
        emailPattern = re.compile(EMAILREGEX, re.IGNORECASE)
        for url in geturls_payload(message):
            if (url.lower().startswith("mailto:") or (
                    emailPattern.search(url) != None and emailPattern.search(url).group() != None)):
                continue
            atvalue = url.find("@")
            athexvalue = url.find("%40")

            if (atvalue != -1 and athexvalue != -1):
                atvalue = min(athexvalue, atvalue)
            else:
                atvalue = max(atvalue, athexvalue)

            paramindex = url.find("?")

            if paramindex != -1:  # url has parameters, an email can be a parameter
                if (atvalue != -1) and (paramindex > atvalue):
                    return True
            else:
                # There are no parameters in the url. if there is an @, then return true
                if (atvalue != -1):
                    return True
        return False
    
class EncodingFinder(FeatureFinder):
    def getFeatureTitle(self):
        return "Encoding"

    def getFeature(self, message):
        return str(message.get('content-transfer-encoding')).lower().strip()
    
class TextAnalysis(FeatureFinder):
    def getFeatureTitle(self):
        return "TextAnalysis"
    
    def getFeature(self, message):
        return get_text_analysis(message)


sus_words = ["update", "confirm", "account", "online", "please", "ebay", "information", "banking", "bank", "security", 
             "http", "com", "paypal", "access", "customer", "service", "click", "link", "dear", "e-mail", "www", "below",
            "thank", "co", "message", "nationwide", "user", "member", "address", "mail", "records", "verify", "fraud", 
            "personal", "protect", "internet", "ensure"]
class SuspiciousWords(FeatureFinder):
    def getFeatureTitle(self):
        return "SuspiciousWords"
    
    def getFeature(self, message):
        return get_suspicious_words(message, sus_words)

In [145]:
def getpayload(msg):
    return __getpayload_rec__(msg, payloadresult="")


def __getpayload_rec__(msg, payloadresult):
    payload = msg.get_payload()
#     print(payload)
    if str(msg.get('content-transfer-encoding')).lower() == "base64":
        
        payload = msg.get_payload(decode=True)
#         print("entered here:", payload)
#         print(type(payload))

    if payload and msg.is_multipart():
        for subMsg in payload:
            payloadresult += __getpayload_rec__(subMsg, payloadresult)
    else:
        if type(payload) == bytes:
#             print("AM BYTES")
            payload_asbytes = str(payload)
#             try:

#             except (UnicodeDecodeError, AttributeError):
#                 print("Found Error!")
#                 pass
        
            return msg.get_content_type() + "\t" + payload_asbytes + "\n"
        else:
            return msg.get_content_type() + "\t" + payload + "\n"
    return payloadresult

def getpayload_dict(msg):
    return __getpayload_dict_rec__(msg, [])


def __getpayload_dict_rec__(msg, payloadresult):
    payload = msg.get_payload()
    if msg.is_multipart():
        for subMsg in payload:
            __getpayload_dict_rec__(subMsg, payloadresult)
    else:
        payloadresult.append({"mimeType": msg.get_content_type(), "payload": payload})
    return payloadresult


def getAttachmentCount(msg):
    return __getAttachmentCountrec__(msg, count=0)


def __getAttachmentCountrec__(msg, count):
    payload = msg.get_payload()
    if msg.is_multipart():
        for subMsg in payload:
            count += __getAttachmentCountrec__(subMsg, count)
    else:
        if __hasAttachment__(msg):
            return 1
    return count


def __hasAttachment__(message):
    contentDisp = message.get("Content-Disposition")
#     print(message)
    return contentDisp is not None and contentDisp.lower().find("attachment") != -1


def getContentTypes(msg):
    return __getContentTypes_rec__(msg, [])


def __getContentTypes_rec__(msg, contenttypes):
    payload = msg.get_payload()
    if msg.is_multipart():
        for subMsg in payload:
            __getContentTypes_rec__(subMsg, contenttypes)
    else:
        contenttypes.append(msg.get_content_type())

    return contenttypes

def geturls_payload(message):
    """
    Returns the urls present in the message payload.
    Could be optimized by only looking into text payloads instead of all the payload

    :param message: message
    :return: url list
    """
    return geturls_string(getpayload(message))



def geturls_string(string):
    """
    Returns the urls present in the message payload.
    Could be optimized by only looking into text payloads instead of all the payload

    :param message: message
    :return: url list
    """
    result = []

    cleanPayload = re.sub(r'\s+', ' ', string)  # removes innecesary spaces
    linkregex = re.compile(HREFREGEX, re.IGNORECASE)
    links = linkregex.findall(cleanPayload)

    for link in links:
        if isurl(link):
            result.append(link)


    urlregex = re.compile(URLREGEX_NOT_ALONE, re.IGNORECASE)
    links = urlregex.findall(cleanPayload)
    for link in links:
        if link not in result:
            result.append(link)
    return links
    

def isurl(link):
    return re.compile(URLREGEX, re.IGNORECASE).search(link) is not None


def getIPHrefs(message):
    urls = geturls_payload(message)
    iphref = re.compile(IPREGEX, re.IGNORECASE)
    result = []
    for url in urls:
        if iphref.search(url) and iphref.search(url).group(1) is not None:
            result.append(iphref.search(url).group(1))
    return result


def getexternalresources(message):
    """
    :param message: message
    :return: url list-
    """
    result = []

    for script in getjavascriptusage(message):
        if "src" in str(script) and "src" in script and isurl(script["src"]):
            result.append(script["src"])
    for css in getcssusage(message):
        if "href" in str(css) and isurl(css["href"]):
            result.append(css["href"])

    return result


def getjavascriptusage(message):
    """
    :param message: message
    :return: url list-
    """
    result = []
    payload = getpayload_dict(message)
    for part in payload:
        if part["mimeType"].lower() == "text/html":
            htmlcontent = part["payload"]
            soup = BeautifulSoup(htmlcontent, "html.parser")
#             print(soup)
            scripts = soup.find_all("script")
#             print(scripts)
            for script in scripts:
                result.append(script)
    return result


def getcssusage(message):
    """
    :param message: message
    :return: url list-
    """
    result = []
    payload = getpayload_dict(message)
    
#     print("len payload: ", len(payload))
    for part in payload:
        if part["mimeType"].lower() == "text/html":
#             print("ENTERED HERE")
            htmlcontent = part["payload"]
#             print("htmlcontent", htmlcontent)
#             print("html len: ", len(htmlcontent))
            soup = BeautifulSoup(htmlcontent, "html.parser")
#             print("soup", soup)
#             print("soup len: ", len(soup))
            csslinks = soup.findAll("link")
#             print(csslinks)
            for css in csslinks:
                result.append(css)
    return result


def extract_registered_domain(url):
    return tldextract.extract(url).registered_domain

def get_whois_data(url):
    domain = extract_registered_domain;
    return pythonwhois.get_whois(domain)


def ishtml(message):
    result = ("text/html" in getContentTypes(message))
    payload = getpayload_dict(message)
#     print(getContentTypes(message))
#     print(result)
    for part in payload:
#         print("beautifulSoup: ", BeautifulSoup(part["payload"], "html.parser").find())
        if result or BeautifulSoup(part["payload"], "html.parser").find():
            return True
    return result


# def filter_out_html(tag):
#     return tag is not None and not tag.name == 'style'

def get_text_analysis(some_message):
    payload = getpayload_dict(some_message)

    j = 0
    sentences = []
    for part in payload:

        if part['mimeType'] == "text/html":
#             print("PARSING HTML ------ DO NOTHING")
            # THERE IS A CONTINUE HERE
            continue
            htmlcontent = BeautifulSoup(part['payload'], "html.parser")

            html_sentences_old = ' '.join(htmlcontent.findAll(string=True))

            blob = TextBlob(html_sentences_old)
#             print(blob.sentences)

            for k in range(len(blob.sentences)):
#                 print('\t', k, blob.sentences[k])
                sentences.append(blob.sentences[k])

        elif part['mimeType'] == "text/plain":
#             print("PARSING PLAINTEXT")
            blob = TextBlob(part['payload'])
#             print(blob)

            try:
#                 print("CHECKING IF PLAINTEXT BOMB")
                regex = re.compile(r' ')
#                 found_whitespace = part['payload'].find(' ')
                found_whitespace = regex.search(part['payload'])
#                 print("regex_res: ", found_whitespace)
                length_part = len(part['payload'])
                if found_whitespace == None and length_part > 5:
#                     print("NO WHITESPACE PARSING AS MESSAGE")
                    decoded = email.message_from_bytes(base64.b64decode(part['payload']))
                    attached_message = getpayload_dict(decoded)
#                     print("DECODED with len {0}: {1}".format(len(attached_message), attached_message[:5]))
                    assert(len(attached_message) > 0)
                    assert(attached_message[0]['payload'] != None)
                    if len(attached_message) == 1:
                        # decode if there is one message, ignore if requires more recursion
                        decoded_blob = TextBlob(attached_message[0]['payload'])
#                         print("DECODED BLOB: ", decoded_blob)
                        for s in decoded_blob.sentences:
                            sentences.append(s)
#                         print("CONTINUING SINCE FOUND BLOB")
                    continue
                    
            except Exception as e:
                print(e)
                
                
            try:
#                 print("TRYING EMBEDED")
                part_is_html = bool(BeautifulSoup(part['payload'], "html.parser").find())
#                 print("is html: ", part_is_html)
#                 part_is_html = ishtml(part)
#                 print("is html: ", part_is_html)
                
                if not part_is_html:
                    for k in range(len(blob.sentences)):
            #                 print('\t', k, blob.sentences[k])
                            sentences.append(blob.sentences[k])
                

            except Exception as e:
                print(e)
            
#         else:
#             print("UNPARSED TEXT OF MIMTYPE: ", part['mimeType'])
        j+=1

    return sentences



In [116]:
URLREGEX = r"^(https?|ftp)://[^\s/$.?#].[^\s]*$"
URLREGEX_NOT_ALONE = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
FLASH_LINKED_CONTENT = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F])+).*\.swf"
HREFREGEX = '<a\s*href=[\'|"](.*?)[\'"].*?\s*>'
IPREGEX =  r"\b((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\b"
EMAILREGEX = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"


#  Sample

In [None]:
random_state = 42
np.random.seed(random_state)



def get_smallest_sample_size(filenames):
    sizes = []

    for filename in filenames:
        mbox = mailbox.mbox("./resources/" + filename + ".mbox")
#         a_sample = random.sample(range(len(mbox)), 414)
#         print(a_sample[:5])
        sizes.append(len(mbox))
        mbox.close()

    for item in zip(filenames, sizes):
        print(item)
        
    return min(sizes)

def getSampleMatrix(filenames, smallest_size):
    sample_matrix = []
    for filename in filenames:
        mbox = mailbox.mbox("./resources/" + filename + ".mbox")
        a_sample = np.random.choice(range(len(mbox)), smallest_size, replace=False)
        print(a_sample[:5])
        sample_matrix.append(a_sample)
        mbox.close()

    
    return sample_matrix
    

# filenames = ["emails-phishing", "phishing0","ass-spam0", "ass-spam2", "emails-tamu-ham"]

# start = time.perf_counter()
# smallest_size = get_smallest_sample_size(filenames)
# sample_matrix = getSampleMatrix(filenames, smallest_size)
# end = time.perf_counter()

# print("Sampling from ", filenames, "  took", end-start, "seconds")
# for arr in sample_matrix:
#     print(arr[:5])

    

# NLP 

In [None]:


random_state = 42

def trainNLP(filenames, stop = None, phishy = [], idxParse=None, sample_size = None):
    #     filename = "emails-enron"
    myfinders = [TextAnalysis()]
#     idxPhishy = 0
    
    parseAll = idxParse == None

    data=[]
    email_index = []
    
    train = [] 
    

    
#     smallest_size = get_smallest_sample_size(filenames)
#     sample_size = min(sample_size, smallest_size)
#     print("Gathering sentence data from emails of sample_size: ", sample_size)
#     sample_matrix = getSampleMatrix(filenames, sample_size)
    
    for idxFilename in tqdm(range(len(filenames))):
        filename=filenames[idxFilename]
        curr_phishy = phishy[idxFilename]
        i = 1
        mbox = mailbox.mbox("./resources/" + filename + ".mbox")
        print("Executing on", filename)
        
        for mboxIndex in tqdm(range(len(mbox))):
            msg=mbox[mboxIndex]
            print(i, end=" ")
            
            totalsize = 0
            payload = getpayload_dict(msg)
            for part in payload:
                totalsize += len(re.sub(r'\s+','',part["payload"]))

            if totalsize < 1:
                print(i, " empty email - ", str(phishy), " - ", payload)
                i += 1
                continue
                
            tmpDict = {}
            if parseAll or (idxParse and i in idxParse):
                try:
                    for finder in myfinders:
                        tmpDict[finder.getFeatureTitle()] = finder.getFeature(msg)

                    tmpDict["Phishy"] = curr_phishy
    #                 tmpDict["id"] = i


        #             pp.pprint(tmpDict)
                    data.append(tmpDict)
                except Exception as e: 
                    print(str(e))
                    print("failed adding item")
            
            i+=1
            if stop and i >= stop:
                break
        is_phishy = phishy[idxFilename]
        train.extend(makeTrain(data, is_phishy))
        
#     pp.pprint(train)
#     cl = NaiveBayesClassifier(train)


    
    return train


# from textblob.en.np_extractors import ConllExtractor
# from textblob.tokenizers import SentenceTokenizer
# extractor = textblob.np_extractors.ConllExtractor()
# tb = Blobber(np_extractor = extractor)
        
def makeTrain(data, phishy):
    if phishy == True:
        label = "pos"
    else:
        label = "neg"
        
    train = []
    for d in data:
        sentences = d["TextAnalysis"] 
#         print(type(sentences))
#         pp.pprint(sentences)
        
        for s in sentences:
            # val= s

#             try:
#             print("Detected language: ", s.detect_language())
#             print("SUCC TRANSLATED:")
            val = ' '.join(s.noun_phrases)
#             print(label, val)
            train.append((val, label))
#             except KeyboardInterrupt:
#                 exit(1)
#             except:
#                 print("NON-TRANSLATED: ", s)


            
            
#     pp.pprint(train)
    return train


        
start_nlp = time.perf_counter()
# trainSet = trainNLP(["emails-phishing", "phishing0", "ass-spam0", "ass-spam2", "emails-tamu-ham"], 
#                     phishy = [True, True, True, True, False, False, False], idxParse=None, sample_size=25)
# trainSet = trainNLP(["small-emails-tamu-ham"], phishy = [False], idxParse=None, sample_size=sys.maxsize)
trainSet = trainNLP(["small-emails-phishing", "small-phishing0", "small-ass-spam0", "small-ass-spam2", "small-emails-tamu-ham-0"],
                    phishy = [True, True, True, True, False], idxParse=None, sample_size=sys.maxsize)
 
#                     phishy = [True, True, True, True, False, False, False]
end_nlp = time.perf_counter()
print("Extracting sentences from emails", end_nlp-start_nlp, "sec")

# print("trainSet: ", len(trainSet))

Xy_train, Xy_test = train_test_split(trainSet, test_size=0.33, random_state=random_state)



In [None]:
def make_small_sample(filename, sample_size):
    mbox = mailbox.mbox("./resources/" + filename + ".mbox")
    # if not enough samples to satisfy sample_size
    if len(mbox) < sample_size:
        sample_size = len(mbox)
    small = np.random.choice(range(len(mbox)), sample_size, replace=False)
    
    small_mbox = mailbox.mbox("./resources/small-" + filename + ".mbox")
    small_mbox.clear()
    for i in small:
        msg = mbox[i] 
        small_mbox.add(msg)
    
    mbox.close()
    small_mbox.close()

filenames = ["emails-phishing", "phishing0", "ass-spam0", "ass-spam2", "emails-tamu-ham-0"]
size = 200
for filename in filenames:
    

    start_time_small_sample = time.perf_counter()
    make_small_sample(filename, size)
    end_time_small_sample = time.perf_counter()

    print("Made small sample of", filename, "in", end_time_small_sample - start_time_small_sample, "seconds")

In [None]:
import pandas as pd
# work on adding SMS spam to classifier?
import codecs
def getFromSmsSpamCollection():
    path = "./resources/smsspamcollection/"
    file = "SMSSpamCollection"


    doc = codecs.open(path+file,'rU','UTF-8') 
    colnames=["phishy", "msg"]
    df = pd.read_csv(doc, sep='\t',names=colnames,  header=None,)
    df = df.replace({"spam": "pos", "ham": "neg"})
    df = df.rename(columns={"ham": "phishy"})
    display(df.head())
    display(df.shape)
    
    return df
    
dfSMS = getFromSmsSpamCollection()


from textblob.en.np_extractors import FastNPExtractor
from textblob.tokenizers import SentenceTokenizer
extractor = FastNPExtractor()
tb = Blobber( np_extractor=extractor)
for idx, row in dfSMS.iterrows():
    print(idx, '-' * 78)
    print(row['phishy'], row['msg'])

    blob = extractor.extract(row['msg'])
    print(blob)
    
    

In [None]:
print(len(Xy_train))
print(len(Xy_test))
# display(Xy_train.head())
# pp.pprint(Xy_train)
n_pos = 0
n_neg = 0 
sentencesNLP = defaultdict(list)
for i in range(len(Xy_train)):
    if Xy_train[i][1] == "pos":
        n_pos +=1 

        for l in Xy_train[i][0].split(' '):
            sentencesNLP["pos"].append(l)
    elif Xy_train[i][1] == "neg":
        n_neg +=1
        for l in Xy_train[i][0].split(' '):
            sentencesNLP["neg"].append(l)
    else:
        print("idk")
print("N pos: ", n_pos, " in Xy_train")
print("N neg: ", n_neg, " in Xy_train")

In [None]:
pp.pprint(sentencesNLP)
# print(sentencesNLP['neg'])
# pd.DataFrame(sentencesNLP['neg'])

In [None]:
start = time.perf_counter()
nbcl = NaiveBayesClassifier(Xy_train)
end = time.perf_counter()
print("Training NaiveBayesClassifier with", len(Xy_train), " training points took", end-start, "seconds")


In [None]:
nbcl_acc = nbcl.accuracy(Xy_test)
print("Accuracy of NaiveBayesClassifier: ", round(nbcl_acc, 10))

In [None]:
nbcl.informative_features(5)

In [None]:
y_true = [x[1] for x in Xy_test]

In [None]:
from  sklearn.metrics import classification_report

# X_test = [x[0] for x in Xy_test]
# print(X_test)

y_pred = [None] * len(Xy_test)
i = 0
for d in tqdm(Xy_test):
    
    y_pred[i] = nbcl.classify(d[0])
    i+=1
    
# pp.pprint(y_true)
# print(X)
# print(y_pred)
print(classification_report(y_true, y_pred))

# dtcl_acc = dtcl.accuracy(X_test)

# print("Accuracy of DecisionTreeClassifier: ", round(nbcl, 10))
    

In [None]:

start = time.perf_counter()
dtcl = DecisionTreeClassifier(Xy_train)
end = time.perf_counter()
print("Training DecisionTreeClassifier with", len(Xy_train), " training points took", end-start, "seconds")


In [None]:
from  sklearn.metrics import classification_report

# X_test = [x[0] for x in Xy_test]
# print(X_test)

y_pred = [None] * len(Xy_test)
i = 0
for d in tqdm(Xy_test):
    y_pred[i] = dtcl.classify(d[0])
    i += 1
    
# pp.pprint(y_true)
# print(X)
# print(y_pred)
print(classification_report(y_true, y_pred))

# dtcl_acc = dtcl.accuracy(X_test)

# print("Accuracy of DecisionTreeClassifier: ", round(dtcl_acc, 10))

In [None]:
dtcl.pprint()

In [None]:
from joblib import dump, load
model_filename = "DTTextClassifier.dat"
dump(dtcl, model_filename)

model_filename = "NBTextClassifier.dat"
dump(nbcl, model_filename)


# Gather Data

In [151]:
def get_suspicious_words(message, words):
    sentences = get_text_analysis(message)
    
    result = {}
    for w in words:
        result[w] = 0
#     print('*' * 80, "\n", text_analysis)
    for s in sentences:
        for w in words:
            n_sus_words = str(s).lower().count(w.lower())
            result[w] += int(n_sus_words)
#             print(n_sus_words, ":", s)
#             if n_sus_words > 0:
#                 print (w," FOUND")
#                 print(n_sus_words, ":", s)
            
    
    return result
    
    

class HTTPSinURLs(FeatureFinder):
    def getFeatureTitle(self):
        return "HTTPSinURLs"

    def getFeature(self, message):
        hasHTTPS = re.compile(r'https', re.IGNORECASE)
        payload = getpayload_dict(message)
        result = 0.0
        total = 0.0
        for part in payload:
            soup = BeautifulSoup(part["payload"], "html.parser")
            if soup.find():
                all_a_tags = soup.find_all('a')
#                 print("ALL A TAGS")
#                 print(all_a_tags)
                for a_tag in all_a_tags:
                    looking_for_https = hasHTTPS.search(a_tag.get('href'))
#                     print("LOOKING FOR HTTPS", looking_for_https)
                    if looking_for_https != None:
#                         print("FOUND HTTPS")
                        result += 1.0
#                     else:
#                         print("NO HTTPS")
                        
#                     print(looking_for_https, " HTTPS ? : ", a_tag)
                    total +=1.0
#                     result.append(a_tag.get('href'))
        
        avg_https = 0.0
        if result > 0:
            avg_https = result / total
        return avg_https
    
class HTTPinURLs(FeatureFinder):
    def getFeatureTitle(self):
        return "HTTP_nosecure_inURLs"

    def getFeature(self, message):
        hasHTTPS = re.compile(r'http[^s]', re.IGNORECASE)
        payload = getpayload_dict(message)
        result = 0.0
        total = 0.0
        for part in payload:
            soup = BeautifulSoup(part["payload"], "html.parser")
            if soup.find():
                all_a_tags = soup.find_all('a')
#                 print("ALL A TAGS")
#                 print(all_a_tags)
                for a_tag in all_a_tags:
                    looking_for_https = hasHTTPS.search(a_tag.get('href'))
#                     print("LOOKING FOR HTTP NO SECURE", looking_for_https)
                    if looking_for_https != None:
#                         print("FOUND HTTP NO SCURE")
                        result += 1.0
#                     else:
#                         print("NO HTTP NO SECURE")
                        
#                     print(looking_for_https, " HTTP NO SECURE : ", a_tag)
                    total +=1.0
#                     result.append(a_tag.get('href'))
        
        avg_https = 0.0
        if result > 0:
            avg_https = result / total
        return avg_https

In [152]:

def parseFile(filename, stop = None, phishy = True, idxParse=None):
#     filename = "emails-enron"
    mbox = mailbox.mbox("./resources/" + filename + ".mbox")
    print('len mbox: ', len(mbox))
    i = 1
    s = set()
    d = defaultdict(list)

#     finders = [HTMLFormFinder(), AttachmentFinder(), FlashFinder(),
#                IFrameFinder(), HTMLContentFinder(), URLsFinder(),
#                ExternalResourcesFinder(), JavascriptFinder(),
#                CssFinder(), IPsInURLs(), AtInURLs(), EncodingFinder()]
    finders = [DotsInURLs(), HTTPSinURLs(), HTTPinURLs(), HTMLFormFinder(), AttachmentFinder(), FlashFinder(),
               IFrameFinder(), HTMLContentFinder(), URLsFinder(),
               ExternalResourcesFinder(), JavascriptFinder(),
               CssFinder(), IPsInURLs(), AtInURLs(), EncodingFinder()]
    
    # DEAD LinksMatchUrls(), 
    
    wordFinders = [SuspiciousWords()]
    
    myfinders = [TextAnalysis()]

    parseAll = idxParse == None

    data=[]
    email_index = []
    
    failures = 0
    for msg in tqdm(mbox):
    #     print("msg type: ", type(msg))
    #     print(i)
    #     payload = getpayload_dict(msg)
#         print(i, end=" ")
        tmpDict = {}

    #     print(payload)

    #         print(item.keys())
    #         print(item["mimeType"])

    #     pp.pprint(payload)

    #     attachmentCount = getAttachmentCount(msg)
    #     contentTypes = getContentTypes(msg)
    #     urls = geturls_payload(msg)
    #     iphRefs = getIPHrefs(msg)
    #     externalResources = getexternalresources(msg)

        totalsize = 0
        payload = getpayload_dict(msg)
        for part in payload:
            totalsize += len(re.sub(r'\s+','',part["payload"]))

        if totalsize < 1:
            print(i, " empty email - ", str(phishy), " - ", payload)
            i += 1
            continue

    #     print(i, " : ", msg['content-transfer-encoding'])
    #     for item in payload:
    #         s.add(item["mimeType"])
    #         d[item["mimeType"]].append(i)
        
        if parseAll or (idxParse and i in idxParse):
            try:
                for finder in finders:
                    tmpDict[finder.getFeatureTitle()] = finder.getFeature(msg)

                    
                for finder in wordFinders:
#                     print(i)
                    wordDict = finder.getFeature(msg)
                tmpDict.update(wordDict)

                tmpDict["Phishy"] = phishy
    #             tmpDict["id"] = i


    #             for finder in myfinders:
    #                 tmpDict[finder.getFeatureTitle()] = finder.getFeature(msg)

#                 pp.pprint(tmpDict)
                data.append(tmpDict)
#                 email_fields = {}

#                 email_fields["id"] = i
#                 email_fields["message"] = getpayload(msg)


        #         try:
        #             print(msg)
        #             email_fields["raw"] = str(msg)

        #         except UnicodeEncodeError as er:
        #             print("type: ", type(msg))
        #             email_fields["raw"] = msg.as_bytes()
        #             print(er)
            #         print(msg.get_body_encoding())
        #             print(msg.get_charsets())
        #         except: 
        #             print("CRITICAL ERROR")

#                 email_index.append(email_fields)
            except Exception as e:
#                 print(e)
#                 print("failed at loading email")
                failures+=1


        #fixme
        i += 1
        if stop and i >= stop:
            break

    print("failures: ", failures)
#     pp.pprint(data)
    df = pd.DataFrame(data)

    df.to_csv("./datasets/" + filename + ".csv", quoting=csv.QUOTE_ALL)


#     emails = pd.DataFrame(email_index)
#     emails.to_csv("./datasets/raw/" + filename + "-raw.txt", header=None, index=None, sep=' ', mode='w')

    # print(d)
    for k,v in d.items():
        print("{0:30} : {1}".format(k, len(v)))
    print(s)
    display(df)
    

# parseFile("emails-enron", phishy=False, idxParse = None)


# parseFile("emails-phishing", phishy=True, idxParse = None)
# parseFile("phishing0", phishy=True, idxParse = None)
# parseFile("ass-spam0", phishy=True, idxParse=None)
parseFile("ass-spam2", phishy=True, idxParse=None)

# parseFile("ass-hard-ham", phishy=False, idxParse=None)
# parseFile("ass-easy-ham", phishy=False, idxParse=None)
parseFile("small-emails-tamu-ham-0", phishy=False, idxParse=None)






  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)




  0%|          | 6/1275 [00:00<00:22, 57.50it/s][A[A[A[A

len mbox:  1275






  1%|          | 12/1275 [00:00<00:22, 56.59it/s][A[A[A[A



  1%|          | 15/1275 [00:00<00:30, 41.91it/s][A[A[A[A



  2%|▏         | 23/1275 [00:00<00:28, 44.24it/s][A[A[A[A



  2%|▏         | 29/1275 [00:00<00:26, 47.83it/s][A[A[A[A



  3%|▎         | 34/1275 [00:00<00:29, 42.65it/s][A[A[A[A



  3%|▎         | 42/1275 [00:00<00:25, 48.81it/s][A[A[A[A



  4%|▎         | 47/1275 [00:00<00:27, 44.20it/s][A[A[A[A



  5%|▍         | 60/1275 [00:01<00:22, 54.03it/s][A[A[A[A



" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup




  6%|▌         | 77/1275 [00:01<00:19, 60.43it/s][A[A[A[A



  7%|▋         | 86/1275 [00:01<00:18, 62.88it/s][A[A[A[A

Incorrect padding






  8%|▊         | 96/1275 [00:01<00:17, 69.11it/s][A[A[A[A



  8%|▊         | 108/1275 [00:01<00:14, 78.59it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






  9%|▉         | 117/1275 [00:01<00:16, 68.57it/s][A[A[A[A



 10%|▉         | 125/1275 [00:01<00:16, 69.99it/s][A[A[A[A



 10%|█         | 133/1275 [00:02<00:19, 59.43it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 11%|█         | 140/1275 [00:02<00:24, 46.34it/s][A[A[A[A



 12%|█▏        | 151/1275 [00:02<00:20, 55.55it/s][A[A[A[A



 12%|█▏        | 158/1275 [00:02<00:21, 51.27it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 13%|█▎        | 165/1275 [00:02<00:22, 48.28it/s][A[A[A[A



 14%|█▍        | 176/1275 [00:02<00:19, 57.46it/s][A[A[A[A



 14%|█▍        | 184/1275 [00:03<00:20, 53.35it/s][A[A[A[A



 15%|█▍        | 191/1275 [00:03<00:21, 50.33it/s][A[A[A[A



 15%|█▌        | 197/1275 [00:03<00:27, 39.23it/s][A[A[A[A



 16%|█▌        | 202/1275 [00:03<00:28, 37.81it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 16%|█▋        | 208/1275 [00:03<00:25, 41.32it/s][A[A[A[A



 17%|█▋        | 214/1275 [00:03<00:24, 43.53it/s][A[A[A[A



 17%|█▋        | 223/1275 [00:03<00:20, 50.16it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 18%|█▊        | 231/1275 [00:04<00:19, 54.18it/s][A[A[A[A



 19%|█▊        | 237/1275 [00:04<00:21, 47.66it/s][A[A[A[A



 19%|█▉        | 247/1275 [00:04<00:19, 54.08it/s][A[A[A[A



 20%|██        | 255/1275 [00:04<00:17, 59.42it/s][A[A[A[A



 21%|██        | 262/1275 [00:04<00:17, 57.11it/s][A[A[A[A



 21%|██        | 269/1275 [00:04<00:18, 55.14it/s][A[A[A[A



 22%|██▏       | 281/1275 [00:04<00:15, 63.85it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 23%|██▎       | 289/1275 [00:04<00:15, 65.26it/s][A[A[A[A



 24%|██▎       | 300/1275 [00:05<00:13, 72.61it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 25%|██▍       | 313/1275 [00:05<00:11, 82.52it/s][A[A[A[A



 25%|██▌       | 323/1275 [00:05<00:11, 82.24it/s][A[A[A[A



 26%|██▌       | 333/1275 [00:05<00:12, 75.70it/s][A[A[A[A



 27%|██▋       | 342/1275 [00:05<00:13, 71.41it/s][A[A[A[A



 27%|██▋       | 350/1275 [00:05<00:12, 72.80it/s][A[A[A[A



 28%|██▊       | 358/1275 [00:05<00:15, 60.97it/s][A[A[A[A



 29%|██▊       | 365/1275 [00:06<00:15, 59.79it/s][A[A[A[A



 29%|██▉       | 372/1275 [00:06<00:17, 50.30it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 30%|██▉       | 378/1275 [00:06<00:19, 47.08it/s][A[A[A[A



 30%|███       | 384/1275 [00:06<00:20, 43.96it/s][A[A[A[A



 31%|███       | 389/1275 [00:06<00:20, 42.61it/s][A[A[A[A



 31%|███       | 394/1275 [00:06<00:24, 36.29it/s][A[A[A[A



 31%|███▏      | 400/1275 [00:06<00:21, 39.92it/s][A[A[A[A



 32%|███▏      | 408/1275 [00:07<00:18, 46.96it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 32%|███▏      | 414/1275 [00:07<00:22, 38.04it/s][A[A[A[A



 33%|███▎      | 421/1275 [00:07<00:19, 43.66it/s][A[A[A[A



 33%|███▎      | 427/1275 [00:07<00:22, 37.97it/s][A[A[A[A



 34%|███▍      | 433/1275 [00:07<00:21, 39.57it/s][A[A[A[A



 35%|███▍      | 442/1275 [00:07<00:17, 46.65it/s][A[A[A[A



 35%|███▌      | 448/1275 [00:07<00:17, 47.15it/s][A[A[A[A



 36%|███▌      | 456/1275 [00:08<00:15, 51.84it/s][A[A[A[A



 36%|███▋      | 464/1275 [00:08<00:14, 54.30it/s][A[A[A[A



 37%|███▋      | 470/1275 [00:08<00:14, 53.92it/s][A[A[A[A



 37%|███▋      | 477/1275 [00:08<00:15, 52.83it/s][A[A[A[A



 38%|███▊      | 483/1275 [00:08<00:15, 52.70it/s][A[A[A[A



 39%|███▊      | 494/1275 [00:08<00:12, 61.65it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 39%|███▉      | 501/1275 [00:08<00:13, 57.64it/s][A[A[A[A



 40%|███▉      | 508/1275 [00:08<00:13, 57.09it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 40%|████      | 515/1275 [00:09<00:14, 51.77it/s][A[A[A[A



 41%|████      | 521/1275 [00:09<00:15, 50.22it/s][A[A[A[A



 41%|████▏     | 527/1275 [00:09<00:14, 50.55it/s][A[A[A[A



 42%|████▏     | 533/1275 [00:09<00:16, 45.95it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 42%|████▏     | 538/1275 [00:09<00:16, 43.89it/s][A[A[A[A



 43%|████▎     | 545/1275 [00:09<00:14, 49.00it/s][A[A[A[A



 43%|████▎     | 551/1275 [00:09<00:14, 51.33it/s][A[A[A[A



 44%|████▍     | 559/1275 [00:09<00:12, 56.51it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 44%|████▍     | 565/1275 [00:10<00:13, 53.04it/s][A[A[A[A



 45%|████▍     | 573/1275 [00:10<00:12, 54.66it/s][A[A[A[A



 45%|████▌     | 580/1275 [00:10<00:12, 57.28it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 46%|████▌     | 586/1275 [00:10<00:12, 54.76it/s][A[A[A[A



 47%|████▋     | 595/1275 [00:10<00:10, 61.89it/s][A[A[A[A



 47%|████▋     | 602/1275 [00:10<00:10, 63.90it/s][A[A[A[A



 48%|████▊     | 609/1275 [00:10<00:13, 48.57it/s][A[A[A[A



 48%|████▊     | 616/1275 [00:11<00:12, 52.55it/s][A[A[A[A



 49%|████▉     | 622/1275 [00:11<00:12, 53.23it/s][A[A[A[A



 50%|████▉     | 633/1275 [00:11<00:10, 62.37it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 50%|█████     | 641/1275 [00:11<00:10, 59.30it/s][A[A[A[A



 51%|█████     | 648/1275 [00:11<00:11, 52.80it/s][A[A[A[A



 52%|█████▏    | 657/1275 [00:11<00:10, 57.13it/s][A[A[A[A



 52%|█████▏    | 664/1275 [00:11<00:11, 54.49it/s][A[A[A[A



 53%|█████▎    | 670/1275 [00:11<00:11, 53.23it/s][A[A[A[A



 53%|█████▎    | 677/1275 [00:12<00:10, 56.38it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 54%|█████▎    | 683/1275 [00:12<00:10, 56.86it/s][A[A[A[A



 54%|█████▍    | 689/1275 [00:12<00:11, 50.99it/s][A[A[A[A



 55%|█████▍    | 695/1275 [00:12<00:11, 50.09it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 55%|█████▍    | 701/1275 [00:12<00:11, 51.94it/s][A[A[A[A



 56%|█████▌    | 710/1275 [00:12<00:09, 58.71it/s][A[A[A[A



 56%|█████▌    | 717/1275 [00:12<00:10, 52.90it/s][A[A[A[A



 57%|█████▋    | 727/1275 [00:13<00:14, 38.96it/s][A[A[A[A



 58%|█████▊    | 736/1275 [00:13<00:11, 46.91it/s][A[A[A[A



 59%|█████▊    | 749/1275 [00:13<00:09, 57.43it/s][A[A[A[A



 59%|█████▉    | 757/1275 [00:13<00:09, 57.39it/s][A[A[A[A



 60%|██████    | 765/1275 [00:13<00:09, 56.46it/s][A[A[A[A



 61%|██████    | 772/1275 [00:13<00:08, 57.28it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 61%|██████    | 779/1275 [00:13<00:08, 57.55it/s][A[A[A[A



 62%|██████▏   | 786/1275 [00:14<00:08, 60.54it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 62%|██████▏   | 796/1275 [00:14<00:07, 65.13it/s][A[A[A[A



 63%|██████▎   | 805/1275 [00:14<00:06, 70.87it/s][A[A[A[A



 64%|██████▍   | 813/1275 [00:14<00:06, 72.97it/s][A[A[A[A



 64%|██████▍   | 821/1275 [00:14<00:06, 74.75it/s][A[A[A[A



 65%|██████▌   | 832/1275 [00:14<00:05, 81.69it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 66%|██████▌   | 841/1275 [00:14<00:05, 78.94it/s][A[A[A[A



 67%|██████▋   | 852/1275 [00:14<00:04, 85.02it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 68%|██████▊   | 861/1275 [00:14<00:05, 70.23it/s][A[A[A[A



 68%|██████▊   | 872/1275 [00:15<00:05, 77.41it/s][A[A[A[A



 69%|██████▉   | 882/1275 [00:15<00:04, 80.13it/s][A[A[A[A



 70%|███████   | 897/1275 [00:15<00:04, 93.15it/s][A[A[A[A



 71%|███████   | 908/1275 [00:15<00:04, 82.41it/s][A[A[A[A



 72%|███████▏  | 918/1275 [00:15<00:04, 79.90it/s][A[A[A[A



 73%|███████▎  | 931/1275 [00:15<00:04, 85.79it/s][A[A[A[A



 74%|███████▍  | 941/1275 [00:15<00:03, 88.41it/s][A[A[A[A



 75%|███████▍  | 951/1275 [00:16<00:04, 72.60it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 75%|███████▌  | 960/1275 [00:16<00:04, 65.61it/s][A[A[A[A



 76%|███████▌  | 968/1275 [00:16<00:04, 64.37it/s][A[A[A[A



 77%|███████▋  | 976/1275 [00:16<00:04, 66.56it/s][A[A[A[A



 77%|███████▋  | 984/1275 [00:16<00:04, 67.05it/s][A[A[A[A



 78%|███████▊  | 991/1275 [00:16<00:05, 53.16it/s][A[A[A[A



 78%|███████▊  | 997/1275 [00:16<00:05, 53.78it/s][A[A[A[A



 79%|███████▉  | 1011/1275 [00:16<00:04, 65.86it/s][A[A[A[A



 80%|████████  | 1020/1275 [00:17<00:04, 63.09it/s][A[A[A[A



 81%|████████  | 1033/1275 [00:17<00:03, 73.25it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 82%|████████▏ | 1042/1275 [00:17<00:03, 66.64it/s][A[A[A[A



 82%|████████▏ | 1050/1275 [00:17<00:03, 67.98it/s][A[A[A[A



 83%|████████▎ | 1061/1275 [00:17<00:02, 76.54it/s][A[A[A[A



 84%|████████▍ | 1070/1275 [00:17<00:03, 61.43it/s][A[A[A[A



 85%|████████▍ | 1078/1275 [00:17<00:03, 58.54it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 85%|████████▌ | 1085/1275 [00:18<00:03, 58.43it/s][A[A[A[A



 86%|████████▌ | 1092/1275 [00:18<00:02, 61.46it/s][A[A[A[A



 86%|████████▌ | 1099/1275 [00:18<00:02, 62.71it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 87%|████████▋ | 1107/1275 [00:18<00:03, 55.86it/s][A[A[A[A



 88%|████████▊ | 1117/1275 [00:18<00:02, 63.44it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 88%|████████▊ | 1124/1275 [00:18<00:03, 48.97it/s][A[A[A[A



 89%|████████▉ | 1135/1275 [00:18<00:02, 56.64it/s][A[A[A[A



 90%|████████▉ | 1143/1275 [00:19<00:02, 61.23it/s][A[A[A[A



 90%|█████████ | 1151/1275 [00:19<00:01, 64.76it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 91%|█████████ | 1162/1275 [00:19<00:01, 72.39it/s][A[A[A[A



 92%|█████████▏| 1173/1275 [00:19<00:01, 77.32it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 93%|█████████▎| 1182/1275 [00:19<00:01, 58.87it/s][A[A[A[A



 93%|█████████▎| 1189/1275 [00:19<00:01, 54.24it/s][A[A[A[A



 94%|█████████▍| 1196/1275 [00:19<00:01, 54.17it/s][A[A[A[A



 94%|█████████▍| 1203/1275 [00:20<00:01, 52.59it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 95%|█████████▌| 1213/1275 [00:20<00:01, 53.59it/s][A[A[A[A



 96%|█████████▌| 1219/1275 [00:20<00:01, 48.46it/s][A[A[A[A



 96%|█████████▌| 1227/1275 [00:20<00:00, 54.49it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 97%|█████████▋| 1235/1275 [00:20<00:00, 57.54it/s][A[A[A[A



 98%|█████████▊| 1245/1275 [00:20<00:00, 65.07it/s][A[A[A[A



 99%|█████████▊| 1257/1275 [00:20<00:00, 74.23it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 99%|█████████▉| 1266/1275 [00:20<00:00, 73.78it/s][A[A[A[A



100%|██████████| 1275/1275 [00:20<00:00, 60.72it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email
failures:  48
set()


Unnamed: 0,@ in URLs,Attachments,Css,Dots in URLs,Encoding,External Resources,Flash content,HTML content,HTTPSinURLs,HTTP_nosecure_inURLs,...,please,protect,records,security,service,thank,update,user,verify,www
0,False,0,0,2.333333,quoted-printable,0,False,True,0.0,1.000000,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,2.000000,8bit,0,False,False,0.0,0.000000,...,1,0,0,4,0,0,0,0,0,1
2,False,0,0,0.000000,quoted-printable,0,False,True,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,1.500000,8bit,0,False,False,0.0,0.000000,...,6,0,0,2,4,2,0,0,0,0
4,False,0,0,2.750000,8bit,0,False,True,0.0,1.000000,...,0,0,0,0,0,0,0,0,0,0
5,False,0,0,1.000000,none,0,False,True,0.0,1.000000,...,0,0,0,0,0,0,0,0,0,0
6,False,0,0,1.666667,none,0,False,False,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
7,False,0,0,2.000000,8bit,0,False,False,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,4
8,False,0,0,2.000000,7bit,0,False,True,0.0,1.000000,...,0,0,0,0,0,0,0,0,0,0
9,False,0,0,2.000000,none,0,False,False,0.0,0.000000,...,2,0,0,0,0,1,0,0,0,4






  0%|          | 0/200 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 3/200 [00:00<00:07, 27.39it/s][A[A[A[A

len mbox:  200






  4%|▍         | 8/200 [00:00<00:06, 30.22it/s][A[A[A[A



  5%|▌         | 10/200 [00:00<00:19,  9.65it/s][A[A[A[A



  6%|▋         | 13/200 [00:00<00:15, 12.01it/s][A[A[A[A



  9%|▉         | 18/200 [00:01<00:11, 15.26it/s]

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email


[A[A[A[A



 10%|█         | 21/200 [00:01<00:10, 17.35it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 12%|█▏        | 24/200 [00:05<01:21,  2.16it/s][A[A[A[A



 13%|█▎        | 26/200 [00:05<01:00,  2.90it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 14%|█▍        | 29/200 [00:06<00:56,  3.04it/s][A[A[A[A



 16%|█▌        | 31/200 [00:06<00:50,  3.32it/s][A[A[A[A



 17%|█▋        | 34/200 [00:06<00:37,  4.48it/s][A[A[A[A



 19%|█▉        | 38/200 [00:07<00:26,  6.00it/s][A[A[A[A



 22%|██▏       | 43/200 [00:07<00:19,  8.10it/s][A[A[A[A



 23%|██▎       | 46/200 [00:07<00:15, 10.01it/s][A[A[A[A



 25%|██▌       | 50/200 [00:07<00:12, 12.26it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 26%|██▋       | 53/200 [00:07<00:11, 12.71it/s][A[A[A[A



 30%|███       | 61/200 [00:07<00:08, 16.87it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 32%|███▎      | 65/200 [00:08<00:08, 15.07it/s][A[A[A[A



 34%|███▍      | 68/200 [00:08<00:08, 14.70it/s][A[A[A[A

Incorrect padding






 36%|███▌      | 71/200 [00:08<00:08, 14.45it/s][A[A[A[A



 37%|███▋      | 74/200 [00:08<00:08, 14.31it/s][A[A[A[A



 39%|███▉      | 78/200 [00:08<00:07, 16.88it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 42%|████▎     | 85/200 [00:09<00:05, 20.08it/s][A[A[A[A



 44%|████▍     | 88/200 [00:09<00:07, 15.26it/s][A[A[A[A



 46%|████▌     | 92/200 [00:09<00:05, 18.38it/s][A[A[A[A



 48%|████▊     | 95/200 [00:09<00:07, 14.02it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 49%|████▉     | 98/200 [00:10<00:10,  9.40it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 50%|█████     | 101/200 [00:10<00:10,  9.77it/s][A[A[A[A



 54%|█████▎    | 107/200 [00:10<00:07, 13.01it/s][A[A[A[A



 55%|█████▌    | 110/200 [00:10<00:06, 14.75it/s][A[A[A[A



 57%|█████▋    | 114/200 [00:11<00:05, 16.82it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 58%|█████▊    | 117/200 [00:11<00:07, 11.38it/s][A[A[A[A



 60%|█████▉    | 119/200 [00:11<00:07, 11.05it/s][A[A[A[A



 62%|██████▏   | 123/200 [00:11<00:05, 13.54it/s][A[A[A[A



 63%|██████▎   | 126/200 [00:12<00:04, 15.77it/s][A[A[A[A



 64%|██████▍   | 129/200 [00:12<00:03, 17.99it/s][A[A[A[A



 66%|██████▌   | 132/200 [00:12<00:03, 19.24it/s][A[A[A[A



 70%|██████▉   | 139/200 [00:12<00:02, 24.41it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 72%|███████▏  | 143/200 [00:12<00:02, 22.99it/s][A[A[A[A



 74%|███████▍  | 148/200 [00:12<00:01, 26.69it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 79%|███████▉  | 158/200 [00:15<00:04, 10.39it/s][A[A[A[A



 82%|████████▏ | 163/200 [00:17<00:08,  4.53it/s][A[A[A[A



 83%|████████▎ | 166/200 [00:17<00:05,  5.69it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 86%|████████▌ | 172/200 [00:17<00:03,  7.69it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 88%|████████▊ | 176/200 [00:18<00:03,  7.10it/s][A[A[A[A



 89%|████████▉ | 178/200 [00:18<00:02,  8.51it/s][A[A[A[A



 90%|█████████ | 181/200 [00:18<00:01,  9.95it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






 92%|█████████▏| 184/200 [00:19<00:01, 11.92it/s][A[A[A[A



 94%|█████████▎| 187/200 [00:19<00:00, 14.27it/s][A[A[A[A

expected string or bytes-like object
failed at loading email
expected string or bytes-like object
failed at loading email






 95%|█████████▌| 190/200 [00:19<00:00, 13.44it/s][A[A[A[A



 98%|█████████▊| 196/200 [00:19<00:00, 17.46it/s][A[A[A[A

expected string or bytes-like object
failed at loading email






100%|██████████| 200/200 [00:19<00:00, 16.09it/s][A[A[A[A

failures:  21
set()


Unnamed: 0,@ in URLs,Attachments,Css,Dots in URLs,Encoding,External Resources,Flash content,HTML content,HTTPSinURLs,HTTP_nosecure_inURLs,...,please,protect,records,security,service,thank,update,user,verify,www
0,False,0,0,2.142857,none,0,False,True,0.000000,1.000000,...,2,0,0,0,0,1,0,0,0,0
1,False,0,0,1.888889,none,0,False,True,0.000000,1.000000,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,2.200000,none,0,False,True,0.000000,0.750000,...,2,0,0,0,0,0,0,0,0,0
3,False,0,0,1.484848,none,0,False,True,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,1
4,False,0,0,2.000000,none,0,False,True,1.000000,0.000000,...,0,1,0,1,0,0,0,0,0,0
5,False,0,0,0.000000,none,0,False,True,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
6,False,0,0,2.500000,base64,0,False,True,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
7,False,0,0,2.250000,none,0,False,True,0.000000,1.000000,...,0,0,0,0,0,0,0,0,0,0
8,False,0,0,2.000000,none,0,False,True,0.000000,0.000000,...,0,0,0,0,1,0,0,0,0,0
9,False,1,0,1.666667,none,0,False,True,1.000000,0.000000,...,0,0,0,2,0,0,0,0,0,0


# Email Daemon

In [165]:
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from joblib import dump, load



def sendResponse(target, response_text):
    
    smtp_ssl_host = 'smtp.gmail.com'  # smtp.mail.yahoo.com
    smtp_ssl_port = 465
    smtp_username = 'phishingdetective@gmail.com'
    smtp_password = 'phishingToolPassword97'
    smtp_sender = 'phishingdetective@gmail.com'
    
    msg = MIMEMultipart()
    msg['Subject'] = "Phishing Detector Results"
    msg['From'] = smtp_sender
    msg['To'] = target

    txt = MIMEText(response_text)
    msg.attach(txt)
    
    
    smtp_server = smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port)
    smtp_server.login(smtp_username, smtp_password)
    smtp_server.sendmail(smtp_sender, target, msg.as_string())
    smtp_server.quit()
    
def constructResponseText(results):
    txt = "Howdy \r\nI have analyzed " + str(len(results)) + " eml file(s). Here are my predictions\r\n\r\n"
    
    for res in results:
        txt += "The file '" + res['filename'] + "'"
        txt = txt + " is phishy " if res['prediction'] else txt + " is not phishy "
        txt += "with probabilities of " 
        txt += str(res["prob"][0]) + " for class " + str(res["classes"][0]) + " and " + str(res["prob"][1]) + " for class " + str(res["classes"][1])
        txt += "\r\n"

    return txt

In [188]:
#### import time
from itertools import chain
import email
import imaplib
import pprint as pp
import email.mime.message
import base64


model_filename = "RandomForestModel.dat"
rf_tuned_saved = load(model_filename)

encoding = "encoding.dat"
dEncoding = load(encoding)


# import mailparser

# mail = mailparser.parse_from_file(f)
# mail = mailparser.parse_from_file_obj(fp)
# mail = mailparser.parse_from_string(raw_mail)
# mail = mailparser.parse_from_bytes(byte_mail)

import argparse


# get commandline arguments
# parser = argparse.ArgumentParser()
# parser.add_argument('-p', '--password')
# parser.add_argument('-u', '--username')
# args = parser.parse_args()

# print("Command line arguments:");print(args);print(78*'-')


imap_ssl_host = 'imap.gmail.com'  # imap.mail.yahoo.com
imap_ssl_port = 993
username = 'phishingdetective@gmail.com'
password = 'phishingToolPassword97'

# Restrict mail search. Be very specific.
# Machine should be very selective to receive messages.
criteria = {
  # 'FROM':    'isaacpitblado@tamu.edu',
#   'SUBJECT': '',
  # 'BODY':    'SECRET SIGNATURE',
}
uid_max = 0


def search_string(uid_max, criteria):
  c = list(map(lambda t: (t[0], '"'+str(t[1])+'"'), criteria.items())) + [('UID', '%d:*' % (uid_max+1))]
  return '(%s)' % ' '.join(chain(*c))
  # Produce search string in IMAP format:
  #   e.g. (FROM "me@gmail.com" SUBJECT "abcde" BODY "123456789" UID 9999:*)


def get_first_text_block(msg):
  type = msg.get_content_maintype()

  if type == 'multipart':
    for part in msg.get_payload():
      if part.get_content_maintype() == 'text':
        return part.get_payload()
  elif type == 'text':
    return msg.get_payload()


server = imaplib.IMAP4_SSL(imap_ssl_host, imap_ssl_port)
server.login(username, password)
server.select('INBOX')

result, data = server.uid('search', None, search_string(uid_max, criteria))
# result, data = server.search(None, 'ALL')

uids = [int(s) for s in data[0].split()]
# print("original uids: ", uids)
if uids:
#     uid_max = max(uids) # FIXME

    uid_max = 21
    print("CURR_MAX_UID: ", uid_max)
  # Initialize `uid_max`. Any UID less than or equal to `uid_max` will be ignored subsequently.

# server.logout()

# finders = [HTMLFormFinder(), AttachmentFinder(), FlashFinder(),
#            IFrameFinder(), HTMLContentFinder(), URLsFinder(),
#            ExternalResourcesFinder(), JavascriptFinder(),
#            CssFinder(), IPsInURLs(), AtInURLs(), EncodingFinder()]

finders = [DotsInURLs(), HTTPSinURLs(), HTTPinURLs(), HTMLFormFinder(), AttachmentFinder(), FlashFinder(),
               IFrameFinder(), HTMLContentFinder(), URLsFinder(),
               ExternalResourcesFinder(), JavascriptFinder(),
               CssFinder(), IPsInURLs(), AtInURLs(), EncodingFinder()]

wordFinders = [SuspiciousWords()]

myfinders = [TextAnalysis()]

# Keep checking messages ...
# I don't like using IDLE because Yahoo does not support it.
while 1:
    # Have to login/logout each time because that's the only way to get fresh results.

#     server = imaplib.IMAP4_SSL(imap_ssl_host, imap_ssl_port)
#     server.login(username, password)
    server.select('INBOX')

    result, data = server.uid('search', None, search_string(uid_max, criteria))
#     result, data = server.search(None, 'ALL')

    uids = [int(s) for s in data[0].split()]
    print('UID: ', uids)
    for uid in uids:
    #   for i in range(len(uids) -1, len(uids) - 2, -1):
    # Have to check again because Gmail sometimes does not obey UID criterion.
    #     uid = uids[i]
        if uid > uid_max:
        #     if True:
            print("uid: ", uid)
            result, data = server.uid('fetch', str(uid), '(RFC822)')  # fetch entire message
            
            # print("res: ", result)
            # print("data: ", data)
            # print("data:")
            # pp.pprint(data)

            print("setting new uid_max: ", uid)
            uid_max = uid
            
#             print("data:", data)
            phishy_sender = None
            
            attached_emails = []
        
            results = [] 

            for response_part in data:
                
                try:
                    if isinstance(response_part, tuple):
                        # print("response_part: ", response_part)
                        # print("response_part[1]: ", response_part[1])
                        msg = email.message_from_string(response_part[1].decode('utf-8'))
    #                     msg = email.message_from_string(response_part[1].decode('utf-8'))
    #                     print(msg)
                        phishy_sender = msg["From"]
                        print("from: ", msg["From"])
                        print("subject: ", msg["Subject"])
                
                        for part in msg.walk():
                            curr_email = {}
                            if part['Content-Type'] and part['Content-Type'].find("message/rfc822") != -1:

                                attachment = part.get_payload()

#                                 print(part.keys())
#                                 print(part.values())
#                                 print(part.get_filename())
#                                 curr_email["filename"] = part.get_filename()

    #                             print("PAYLOAD LEN: ", len(attachment))
                                for i in range(len(attachment)):
    #                                 print("payload i: ",i, attachment[i].as_string())
    #                                 print("keys: ", part[i].keys())
    #                                 print("type: ", type(attachment[i]))
                                    decoded = email.message_from_bytes(base64.b64decode(attachment[i].as_string()))
                                    attached_message = getpayload_dict(decoded)
#                                     print(attached_message)
                                    totalsize = 0
                                    for a_part in attached_message:
                                        totalsize += len(re.sub(r'\s+','',a_part["payload"]))

                                    if totalsize < 1:
                                        print(" empty email - ", attached_message)
                                        i += 1
                                        continue

                                    tmpDict = {}
                                    for finder in finders:
                                        tmpDict[finder.getFeatureTitle()] = finder.getFeature(decoded)

                                    for finder in wordFinders:
                                        wordDict = finder.getFeature(msg)
                                    tmpDict.update(wordDict)

                                    tmpDict['filename'] = part.get_filename()
                                    print("got filename: ", tmpDict['filename'])

#                                     pp.pprint(tmpDict)
                                    if tmpDict['filename']  != None:
                                        attached_emails.append(tmpDict)



                            else:
                                print("not message/rfc822: ", part['Content-Type'])

                        if len(attached_emails):
                            X = pd.DataFrame(attached_emails)

                            X = X.replace({"Encoding": dEncoding})  

                            files = X['filename'].values
                            X = X.drop(columns='filename')
                            display(X.head())


                            prediction = rf_tuned_saved.predict(X)
                            prob = rf_tuned_saved.predict_proba(X)
                            classes = ["'Not Phishy'", "'Phishy'"]
                            print("prediction:", files, prediction)

                            assert(len(files) == len(prediction))
                            print("appending result...")
                            for i in range(len(files)):
                                results.append({"filename": files[i], "prediction": prediction[i], "prob": prob[i], "classes": classes})
                except Exception as e:
                    print(e)
            
            print("RESULTS: ", results)
            if len(results):
                txt = constructResponseText(results)

                print("Response Text:\n", txt)
#                 sendResponse(phishy_sender, txt)
            elif phishy_sender != None:
                print("No .eml files attached")
                txt = "Howdy,\r\n\r\nTo use this tool, please attach .eml files to an email and send them to me. I will respond a report on my prediction for all attached email files.\r\n\r\nThank you."

                print("Response Text:\n", txt)
#                 sendResponse(phishy_sender, txt)
            else: 
                print("No sender found!")
                    
                    
                    
                    
#     server.logout()
#   break

    time.sleep(1)


CURR_MAX_UID:  21
UID:  [22, 23, 24]
uid:  22
setting new uid_max:  22
from:  Isaac Pitblado <isaacpitblado@gmail.com>
subject:  1:04 pm
attachment count:  0
not message/rfc822:  multipart/mixed; boundary="000000000000003ea60598d15de5"
not message/rfc822:  multipart/alternative; boundary="000000000000003ea30598d15de3"
not message/rfc822:  text/plain; charset="UTF-8"
not message/rfc822:  text/html; charset="UTF-8"
got filename:  paypal.eml
not message/rfc822:  None


Unnamed: 0,@ in URLs,Attachments,Css,Dots in URLs,Encoding,External Resources,Flash content,HTML content,HTTPSinURLs,HTTP_nosecure_inURLs,...,please,protect,records,security,service,thank,update,user,verify,www
0,False,0,0,2.0,1,0,False,True,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


prediction: ['paypal.eml'] [False]
appending result...
RESULTS:  [{'filename': 'paypal.eml', 'prediction': False, 'prob': array([0.70613528, 0.29386472]), 'classes': ["'Not Phishy'", "'Phishy'"]}]
Response Text:
 Howdy 
I have analyzed 1 eml file(s). Here are my predictions

The file 'paypal.eml' is not phishy with probabilities of 0.7061352813852811 for class 'Not Phishy' and 0.29386471861471863 for class 'Phishy'

uid:  23
setting new uid_max:  23
from:  Isaac Pitblado <isaacpitblado@gmail.com>
subject:  1:06 pm
attachment count:  0
not message/rfc822:  multipart/mixed; boundary="00000000000035783b0598d1660a"
not message/rfc822:  multipart/alternative; boundary="0000000000003578370598d16608"
not message/rfc822:  text/plain; charset="UTF-8"
not message/rfc822:  text/html; charset="UTF-8"
got filename:  paypal.eml
not message/rfc822:  None
got filename:  Phishing.eml
not message/rfc822:  None


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,@ in URLs,Attachments,Css,Dots in URLs,Encoding,External Resources,Flash content,HTML content,HTTPSinURLs,HTTP_nosecure_inURLs,...,please,protect,records,security,service,thank,update,user,verify,www
0,False,0,0,2.0,1,0,False,True,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0.0,0,0,False,True,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


prediction: ['paypal.eml' 'Phishing.eml'] [False False]
appending result...
RESULTS:  [{'filename': 'paypal.eml', 'prediction': False, 'prob': array([0.70613528, 0.29386472]), 'classes': ["'Not Phishy'", "'Phishy'"]}, {'filename': 'Phishing.eml', 'prediction': False, 'prob': array([0.73634991, 0.26365009]), 'classes': ["'Not Phishy'", "'Phishy'"]}]
Response Text:
 Howdy 
I have analyzed 2 eml file(s). Here are my predictions

The file 'paypal.eml' is not phishy with probabilities of 0.7061352813852811 for class 'Not Phishy' and 0.29386471861471863 for class 'Phishy'
The file 'Phishing.eml' is not phishy with probabilities of 0.7363499146288779 for class 'Not Phishy' and 0.26365008537112244 for class 'Phishy'

uid:  24
setting new uid_max:  24
from:  Isaac Pitblado <isaacpitblado@gmail.com>
subject:  1:07 pm
attachment count:  0
not message/rfc822:  multipart/mixed; boundary="00000000000057969a0598d16830"
not message/rfc822:  multipart/alternative; boundary="0000000000005796960598d1682e

KeyboardInterrupt: 