# Code Highlights #

Parse all sub-folders and files from a given folder:

- Universal Detector (from the chardet package): detects the character encoding of a file so that its text can then be read and decoded properly. This is so helpful!


- os.walk

    for root, dirs, files in os.walk(enron_data_dir):
        for file_path in files:
        
- Parse email using the python email package: https://docs.python.org/3/library/email.examples.html
    
- Create PySpark Dataframe and Schema: 
    - https://www.geeksforgeeks.org/how-to-create-pyspark-dataframe-with-schema/
    
    - https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/
    
- Proper installation notes to get PySpark running in Jupyter.

    - https://changhsinlee.com/install-pyspark-windows-jupyter/


In [1]:
#//*** Optional PySpark smoke test, disabled by keeping it inside a string literal.
#//*** Remove the surrounding triple quotes to run it: it initialises findspark,
#//*** gets a SparkSession and shows the result of a one-row SQL query.
"""
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df = spark.sql('''select 'spark' as hello ''')
df.show()
"""
#//*** Only statement that actually executes in this cell: prints an empty line
print()




In [2]:
import os
import json
from pathlib import Path
import zipfile
import email
from email.policy import default
from email.parser import Parser
from datetime import timezone
import datetime

from collections import namedtuple

import pandas as pd
#import s3fs
from bs4 import BeautifulSoup
from dateutil.parser import parse
from chardet.universaldetector import UniversalDetector

#//*** Must Run before pyspark
import findspark
findspark.init()

from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.pipeline import Transformer
from pyspark.sql.functions import udf
from pyspark.sql.types import *

import pandas as pd

#//*** Working folders, all resolved from the directory the notebook is run in
current_dir = Path.cwd()
results_dir = current_dir / 'results'
data_dir = current_dir / 'data'
enron_data_dir = data_dir / 'enron'

#//*** Make sure results/ and data/ exist; enron_data_dir itself is not created here
results_dir.mkdir(parents=True, exist_ok=True)
data_dir.mkdir(parents=True, exist_ok=True)

#//*** Fields produced for every email, in output order:
#//*** three non-header fields first, then the email headers to extract
output_columns = ['username', 'original_msg', 'payload'] + [
    'Message-ID',
    'Date',
    'From',
    'To',
    'Subject',
    'Mime-Version',
    'Content-Type',
    'Content-Transfer-Encoding',
    'X-From',
    'X-To',
    'X-cc',
    'X-bcc',
    'X-Folder',
    'X-Origin',
    'X-FileName',
    'Cc',
    'Bcc',
]

#//*** namedtuple field names cannot contain '-', so the column names use '_' instead
columns = ['_'.join(name.split('-')) for name in output_columns]

#//*** Record type holding one parsed email, one field per column
ParsedEmail = namedtuple('ParsedEmail', columns)

#//*** Get (or create) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("Assignment04")
    .getOrCreate()
)



In [3]:
#//*** Echo the resolved folders, one per line, to confirm where the notebook is running
print("\n".join(
    f"{label} {folder}"
    for label, folder in (
        ("Current Dir: ", current_dir),
        ("Results Dir: ", results_dir),
        ("Data Dir: ", data_dir),
        ("Enron Data Dir: ", enron_data_dir),
    )
))
#//*** Accessing a field on the namedtuple class (not an instance) prints the field descriptor
print(ParsedEmail.username)



Current Dir:  C:\Users\family\DSCProjects\DSC\DSC650\assignment04
Results Dir:  C:\Users\family\DSCProjects\DSC\DSC650\assignment04\results
Data Dir:  C:\Users\family\DSCProjects\DSC\DSC650\assignment04\data
Enron Data Dir:  C:\Users\family\DSCProjects\DSC\DSC650\assignment04\data\enron
<property object at 0x000001689C20CAE8>


The following code loads data to your local JupyterHub instance. You only need to run this once. 

In [4]:
#//*** The Enron files were copied into data/ manually; Amazon S3 was avoided due to ongoing S3 issues.
#//*** The original download helper is kept below as a string literal, so it does not run.
#//*** It needs the s3fs import, which is commented out at the top of the notebook.
"""
def copy_data_to_local():
    dst_data_path = data_dir.joinpath('enron.zip')
    endpoint_url='https://storage.budsc.midwest-datascience.com'
    enron_data_path = 'data/external/enron.zip'

    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )

    
    s3.get(enron_data_path, str(dst_data_path))
    
    with zipfile.ZipFile(dst_data_path) as f_zip:
        f_zip.extractall(path=data_dir)
    
copy_data_to_local()
"""
#//*** Only statement that actually executes in this cell: prints an empty line
print()





This code reads emails and creates a Spark dataframe with three columns. 

## Assignment 4.1

In [5]:
#//*** Read an email file as text. chardet's UniversalDetector is used to find the
#//*** encoding only when the platform default encoding cannot decode the file.
def read_raw_email(email_path):
    """Return the full text of the email stored at ``email_path``.

    The file is first read with the platform default text encoding. Only when
    that raises ``UnicodeDecodeError`` is the file scanned with
    ``UniversalDetector`` and read again with the detected encoding.

    :param email_path: path of the email file (anything ``open`` accepts)
    :returns: the decoded file contents as a ``str``
    """
    try:
        with open(email_path) as f:
            return f.read()
    except UnicodeDecodeError:
        #//*** Default encoding failed: fall through to encoding detection
        pass

    #//*** The detector is only built for the files that actually need it
    detector = UniversalDetector()
    with open(email_path, 'rb') as f:
        #//*** Stream the file line by line and stop once the detector is confident
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    #//*** chardet reports None when it cannot guess. Passing None to open() would
    #//*** just retry the default encoding that already failed, so use utf-8 instead.
    encoding = detector.result['encoding'] or 'utf-8'

    #//*** errors='replace' keeps a single undecodable byte (or a wrong guess)
    #//*** from aborting the load of the whole data set
    with open(email_path, encoding=encoding, errors='replace') as f:
        return f.read()

def make_spark_df():
    """Build a Spark dataframe with one row per email file under ``enron_data_dir``.

    Columns (all nullable strings):
      id           -- path of the email relative to ``enron_data_dir``, with a
                      leading path separator (e.g. ``\\davis-d\\inbox\\18_`` on Windows)
      username     -- first folder below ``enron_data_dir`` on that path; ``None``
                      for a file that sits directly in ``enron_data_dir``
      original_msg -- raw text of the email as returned by ``read_raw_email``

    :returns: the dataframe created by ``spark.createDataFrame`` from the collected rows
    """
    schema = StructType([
        StructField("id", StringType(), True),
        StructField("username", StringType(), True),
        StructField("original_msg", StringType(), True),
    ])
    records = []

    for root, _dirs, files in os.walk(enron_data_dir):
        for file_name in files:
            current_path = Path(root).joinpath(file_name)

            #//*** Get raw Email Text Message from File
            raw_email = read_raw_email(current_path)

            #//*** ID is the path relative to the enron folder. relative_to() works for
            #//*** any location and separator, unlike replacing a hard-coded "\data\enron".
            relative_path = current_path.relative_to(enron_data_dir)
            id_path = os.sep + str(relative_path)

            #//*** Username is the first folder after enron, however deep the email is nested.
            #//*** A path with a single part is a file directly in enron: no user folder.
            path_parts = relative_path.parts
            username = path_parts[0] if len(path_parts) > 1 else None

            #//*** Add Row as Record, in schema order
            records.append([id_path, username, raw_email])

            #//*** Print a Sample, every 200 records
            if len(records) % 200 == 0:
                print(f"username: {username} Path: {id_path} Msg Len: {len(raw_email)}")

    return spark.createDataFrame(records, schema)

#//*** Walk the Enron folder and load every email into the dataframe used by the cells below
df = make_spark_df()




username: davis-d Path: \davis-d\all_documents\244_ Msg Len: 716
username: davis-d Path: \davis-d\all_documents\425_ Msg Len: 938
username: davis-d Path: \davis-d\deleted_items\101_ Msg Len: 1436
username: davis-d Path: \davis-d\deleted_items\296_ Msg Len: 2400
username: davis-d Path: \davis-d\discussion_threads\129_ Msg Len: 1000
username: davis-d Path: \davis-d\discussion_threads\30_ Msg Len: 698
username: davis-d Path: \davis-d\finanial_operations\14_ Msg Len: 1816
username: davis-d Path: \davis-d\inbox\family\13_ Msg Len: 1187
username: davis-d Path: \davis-d\sap\18_ Msg Len: 2219
username: davis-d Path: \davis-d\sent\64_ Msg Len: 1122
username: davis-d Path: \davis-d\_sent_mail\55_ Msg Len: 2223
username: gay-r Path: \gay-r\all_documents\238_ Msg Len: 3570
username: gay-r Path: \gay-r\all_documents\41_ Msg Len: 678
username: gay-r Path: \gay-r\discussion_threads\138_ Msg Len: 3172
username: gay-r Path: \gay-r\inbox\18_ Msg Len: 1792
username: gay-r Path: \gay-r\sent\210_ Msg Len: 

In [16]:
#//*** Inspect the dataframe: column types, schema, a sample of rows and the row count
print(df)
#//*** printSchema() prints the schema itself and returns None,
#//*** so wrapping it in print() only adds a stray "None" line
df.printSchema()
df.show(n=10, truncate=True)
#//*** Collect only the id column to show the first id and count the rows
ids = df.select("id").collect()
print(ids[0].id)
print(len(ids))


DataFrame[id: string, username: string, original_msg: string]
root
 |-- id: string (nullable = true)
 |-- username: string (nullable = true)
 |-- original_msg: string (nullable = true)

None
+--------------------+--------+--------------------+
|                  id|username|        original_msg|
+--------------------+--------+--------------------+
| \davis-d\2_trash\1_| davis-d|Message-ID: <1774...|
| \davis-d\2_trash\2_| davis-d|Message-ID: <2467...|
| \davis-d\2_trash\3_| davis-d|Message-ID: <2833...|
| \davis-d\2_trash\4_| davis-d|Message-ID: <1972...|
|\davis-d\2_trash\...| davis-d|Message-ID: <1964...|
|\davis-d\2_trash\...| davis-d|Message-ID: <7345...|
|\davis-d\2_trash\...| davis-d|Message-ID: <5686...|
|\davis-d\2_trash\...| davis-d|Message-ID: <7218...|
|\davis-d\2_trash\...| davis-d|Message-ID: <3016...|
|\davis-d\2_trash\...| davis-d|Message-ID: <1233...|
+--------------------+--------+--------------------+
only showing top 10 rows

\davis-d\2_trash\1_
13725


## Assignment 4.2

Use `plain_msg_example` and `html_msg_example` to create a function that parses an email message. 

In [7]:
plain_msg_example = """
Message-ID: <6742786.1075845426893.JavaMail.evans@thyme>
Date: Thu, 7 Jun 2001 11:05:33 -0700 (PDT)
From: jeffrey.hammad@enron.com
To: andy.zipper@enron.com
Subject: Thanks for the interview
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Hammad, Jeffrey </O=ENRON/OU=NA/CN=RECIPIENTS/CN=NOTESADDR/CN=CBBE377A-24F58854-862567DD-591AE7>
X-To: Zipper, Andy </O=ENRON/OU=NA/CN=RECIPIENTS/CN=AZIPPER>
X-cc: 
X-bcc: 
X-Folder: \Zipper, Andy\Zipper, Andy\Inbox
X-Origin: ZIPPER-A
X-FileName: Zipper, Andy.pst

Andy,

Thanks for giving me the opportunity to meet with you about the Analyst/ Associate program.  I enjoyed talking to you, and look forward to contributing to the success that the program has enjoyed.  

Thanks and Best Regards,

Jeff Hammad
"""

html_msg_example = """
Message-ID: <21013632.1075862392611.JavaMail.evans@thyme>
Date: Mon, 19 Nov 2001 12:15:44 -0800 (PST)
From: insynconline.6jy5ympb.d@insync-palm.com
To: tstaab@enron.com
Subject: Last chance for special offer on Palm OS Upgrade!
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: InSync Online <InSyncOnline.6jy5ympb.d@insync-palm.com>
X-To: THERESA STAAB <tstaab@enron.com>
X-cc: 
X-bcc: 
X-Folder: \TSTAAB (Non-Privileged)\Staab, Theresa\Deleted Items
X-Origin: Staab-T
X-FileName: TSTAAB (Non-Privileged).pst

<html>

<html>
<head>
<title>Paprika</title>
<meta http-equiv="Content-Type" content="text/html;">
</head>
<body bgcolor="#FFFFFF" TEXT="#333333" LINK="#336699" VLINK="#6699cc" ALINK="#ff9900">
<table border="0" cellpadding="0" cellspacing="0" width="582">
<tr valign="top">
  <td width="582" colspan="9"><nobr><a href="http://insync-online.p04.com/u.d?BEReaQA5eczXB=1"><img src="http://images4.postdirect.com/master-images/404707/upper_left.gif" alt="" width="103" height="110" hspace="0" vspace="0" border="0"></a><a href="http://insync-online.p04.com/u.d?AkReaQA5eczXE=11"><img src="http://images4.postdirect.com/master-images/404707/upper_right.gif" alt="" width="479" height="110" hspace="0" vspace="0" border="0"></a></nobr></td>
</tr>
<tr valign="top">
  <td width="4" bgcolor="#CCCCCC"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="4" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="20"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="20" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="165"><br><a href="http://insync-online.p04.com/u.d?LkReaQA5eczXL=21"><img src="http://images4.postdirect.com/master-images/404707/screen1.gif" alt="" width="165" height="159" hspace="0" vspace="0" border="0"></a><br><img src="http://images4.postdirect.com/master-images/404707/screen1_text.gif" alt="" width="93" height="26" hspace="0" vspace="0" border="0"></td>
  <td width="20"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="20" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="165"><br><a href="http://insync-online.p04.com/u.d?BkReaQA5eczXO=31"><img src="http://images4.postdirect.com/master-images/404707/screen2.gif" alt="" width="165" height="159" hspace="0" vspace="0" border="0"></a><br><img src="http://images4.postdirect.com/master-images/404707/screen2_text.gif" alt="" width="93" height="26" hspace="0" vspace="0" border="0"></td>
  <td width="20"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="20" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="165"><br><a href="http://insync-online.p04.com/u.d?JkReaQA5eczXRs=41"><img src="http://images4.postdirect.com/master-images/404707/screen3.gif" alt="" width="165" height="159" hspace="0" vspace="0" border="0"></a><br><img src="http://images4.postdirect.com/master-images/404707/screen3_text.gif" alt="" width="93" height="26" hspace="0" vspace="0" border="0"></td>
  <td width="19"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="19" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="4" bgcolor="#CCCCCC"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="4" height="1" hspace="0" vspace="0" border="0" alt=""></td>
</tr>
</table>
<table border="0" cellpadding="0" cellspacing="0" width="582">
<tr valign="top">
  <td width="4" bgcolor="#CCCCCC"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="4" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  <td width="574"><br>
    <table border="0" cellpadding="0" cellspacing="0" width="574" bgcolor="#99ccff">
    <tr>
      <td width="50"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="50" height="1" hspace="0" vspace="0" border="0" alt=""></td>
      <td width="474"><font face="verdana, arial" size="-2"color="#000000">
        <br>
        Dear THERESA,
        <br><br>
        Due to overwhelming demand for the Palm OS&#174; v4.1 Upgrade with Mobile Connectivity, we are 
        extending the special offer of 25% off through November 30, 2001. So there's still time to significantly 
        increase the functionality of your Palm&#153; III, IIIx, IIIxe, IIIc, V or Vx handheld. Step up to the 
        new Palm OS v4.1 through this extended special offer. You'll receive the brand new Palm OS v4.1 
        <b>for just $29.95 when you use Promo Code <font color="#FF0000">OS41WAVE</font></b>. That's a 
        <b>$10 savings</b> off the list price. 
        <br><br>
        <a href="http://insync-online.p04.com/u.d?NkReaQA5eczXRh=51">Click here to view a full product demo now</a>.
        <br><br>
        <a href="http://insync-online.p04.com/u.d?MkReaQA5eczXRm=61"><img src="http://images4.postdirect.com/master-images/404707/title1.gif" alt="" width="336" height="20" hspace="0" vspace="0" border="0"></a>
        <br><br>
        You can do a lot more with your Palm&#153; handheld when you upgrade to the Palm OS v4.1. All your 
        favorite features just got even better and there are some terrific new additions:
        <br><br>
        <LI> Handwrite notes and even draw pictures right on your Palm&#153 handheld</LI>
        <LI> Tap letters with your stylus and use Graffiti&#174; at the same time with the enhanced onscreen keyboard</LI>
        <LI> Improved Date Book functionality lets you view, snooze or clear multiple alarms all with a single tap </LI>
        <LI> You can easily change time-zone settings</LI>
        
        <br><br>
        <a href="http://insync-online.p04.com/u.d?WkReaQA5eczXRb=71"><img src="http://images4.postdirect.com/master-images/404707/title2.gif" alt="" width="460" height="20" hspace="0" vspace="0" border="0"></a>
        <br><br>
        <LI> <nobr>Mask/unmask</nobr> private records or hide/unhide directly within the application</LI>
        <LI> Lock your device automatically at a designated time using the new Autolocking feature</LI>
        <LI> Always remember your password with our new Hint feature*</LI>
        
        <br><br>
        <a href="http://insync-online.p04.com/u.d?VEReaQA5eczXRQ=81"><img src="http://images4.postdirect.com/master-images/404707/title3.gif" alt="" width="461" height="31" hspace="0" vspace="0" border="0"></a>
        <br><br>
        <LI> Use your GSM compatible mobile phone or modem to get online and access the web</LI>
        <LI> Stay connected with email, instant messaging and text messaging to GSM mobile phones</LI>
        <LI> Send applications or records through your cell phone to schedule meetings and even "beam" 
             important information to others</LI>
        
        <br><br>
        All this comes in a new operating system that can be yours for just $29.95! <a href="http://insync-online.p04.com/u.d?MkReaQA5eczXRV=91">Click here to 
        upgrade to the new Palm&#153; OS v4.1</a> and you'll also get the latest Palm desktop software. Or call 
        <nobr>1-800-881-7256</nobr> to order via phone. 
        <br><br>
        Sincerely,<br>
        The Palm Team
        <br><br>
        P.S. Remember, this extended offer opportunity of 25% savings absolutely ends on November 30, 2001 
        and is only available through the Palm Store when you use Promo Code <b><font color="#FF0000">OS41WAVE</font></b>.
        <br><br>
        <img src="http://images4.postdirect.com/master-images/404707/bottom_button.gif" align="right" alt="" width="295" height="60" hspace="0" vspace="0" border="0">
        <br><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="474" height="1" hspace="0" vspace="0" border="0" alt="">
        </font></td>
      <td width="50"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="50" height="1" hspace="0" vspace="0" border="0" alt=""></td>
    </tr>
    </table></td>
    <td width="4" bgcolor="#CCCCCC"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="4" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  </tr>
  <tr>
  <td colspan="3"><img src="http://images4.postdirect.com/master-images/404707/bottom.gif" width="582" height="67" hspace="0" vspace="0" border="0"></td>
  </tr>
</table>
<table border="0" cellpadding="0" cellspacing="0" width="582">
  <tr>
    <td width="54"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="54" height="1" hspace="0" vspace="0" border="0" alt=""></td>
    <td width="474"><font face="arial, verdana" size="-2" color="#000000"><br>
    * This feature is available on the Palm&#153; IIIx, Palm&#153; IIIxe, and Palm&#153; Vx. <br><br>
    ** Note: To use the MIK functionality, you need either a Palm OS&#174; compatible modem or a phone 
    with  <nobr>built-in</nobr> modem or data capability that has either an infrared port or cable exits.  If you 
    are using a phone, you must have data services from your mobile service provider.  <a href="http://insync-online.p04.com/u.d?RkReaQA5eczXRK=101">Click here</a> for 
    a list of tested and supported phones that you can use with the MIK. Cable not provided.
    <br><br>
    ------------------<br>
    To modify your profile or unsubscribe from Palm newsletters, <a href="http://insync-online.p04.com/u.d?KkReaQA5eczXRE=121">click here</a>. 
    Or, unsubscribe by replying to this message, with "unsubscribe" as the subject line of the message. 
    <br><br>
    ------------------<br>
    Copyright&#169; 2001 Palm, Inc. Palm OS, Palm Computing, HandFAX, HandSTAMP, HandWEB, Graffiti, 
    HotSync, iMessenger, MultiMail, Palm.Net, PalmConnect, PalmGlove, PalmModem, PalmPoint, PalmPrint, 
    and the Palm Platform Compatible Logo are registered trademarks of Palm, Inc. Palm, the Palm logo, 
    AnyDay, EventClub, HandMAIL, the HotSync Logo, PalmGear, PalmGlove, PalmPix, Palm Powered, the Palm 
    trade dress, PalmSource, Smartcode, and Simply Palm are trademarks of Palm, Inc. All other brands and 
    product names may be trademarks or registered trademarks of their respective owners.</font>
    <img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="474" height="1" hspace="0" vspace="0" border="0" alt=""></td>
    <td width="54"><img src="http://images4.postdirect.com/master-images/404707/clear.gif" width="54" height="1" hspace="0" vspace="0" border="0" alt=""></td>
  </tr>
</table><br><br><br><br>
<!-- The following image is included for message detection -->
<img src="http://p04.com/1x1.dyn" border="0" alt="" width="1" height="1">
<img src="http://p04.com/1x1.dyn?0vEGou8Ig30ba2L2bLn" width=1 height=1></body>
</html>

</html>
"""
#//*** The triple-quoted literals begin and end with a newline; strip it so each
#//*** example starts directly with its first header line
plain_msg_example = plain_msg_example.strip()
html_msg_example = html_msg_example.strip()

In [8]:
def parse_html_payload(payload):
    """
    Return the text content of an email body.

    The payload is parsed by Beautiful Soup with the 'html.parser' backend
    and only its text is kept, so HTML markup is dropped.  A payload without
    markup is expected to come back as its original content.
    """
    document = BeautifulSoup(payload, 'html.parser')
    text = str(document.get_text())
    #//*** Round-trip through UTF-8 bytes before returning the text
    utf8_bytes = text.encode('utf-8')
    return utf8_bytes.decode('utf-8')

def parse_email(original_msg):
    """Parse one raw email into a ``ParsedEmail`` whose fields are all strings.

    Fields are produced in ``output_columns`` order:
      username     -- cannot be derived from the message text; always ''
      original_msg -- the input text, unchanged
      payload      -- the message body with HTML stripped by ``parse_html_payload``
      other names  -- the header of that name, or '' when the message has no such header

    :param original_msg: full text of the email (headers, blank line, body)
    :returns: ``ParsedEmail`` with one ``str`` per column
    """
    msg = Parser(policy=default).parsestr(original_msg)

    #//*** Values are collected in output_columns order, which is also the order of the
    #//*** ParsedEmail fields. Looking them up again by the underscored field names
    #//*** would miss every hyphenated header and yield the string 'None'.
    values = []
    for column in output_columns:

        #//*** Username is the root parent folder name; it is not part of the message
        if column == 'username':
            values.append("")

        #//*** The whole unprocessed text file
        elif column == 'original_msg':
            values.append(original_msg)

        #//*** The payload is the message body. parse_html_payload uses
        #//*** Beautiful Soup to strip html and return plain text
        elif column == 'payload':
            try:
                content = msg.get_content()
            except LookupError:
                #//*** get_content() has no handler for multipart messages (KeyError)
                #//*** and cannot decode an unknown charset (LookupError).
                #//*** Fall back to the undecoded payload when it is a plain string.
                raw_payload = msg.get_payload()
                content = raw_payload if isinstance(raw_payload, str) else ""
            values.append(parse_html_payload(content))

        #//*** All other columns are pass-thru header strings (Date included).
        #//*** msg[name] matches header names case-insensitively and returns None
        #//*** for a missing header; a missing header becomes a zero length string
        else:
            header = msg[column]
            values.append("" if header is None else str(header))

    return ParsedEmail(*values)


In [9]:
#//*** Parse the plain-text example and list the header names found in it
msg = Parser(policy=default).parsestr(plain_msg_example)
msg.keys()

['Message-ID',
 'Date',
 'From',
 'To',
 'Subject',
 'Mime-Version',
 'Content-Type',
 'Content-Transfer-Encoding',
 'X-From',
 'X-To',
 'X-cc',
 'X-bcc',
 'X-Folder',
 'X-Origin',
 'X-FileName']

In [10]:
#//*** Run parse_email on the plain-text example and print the extracted body
parsed_msg = parse_email(plain_msg_example)
print(parsed_msg.payload)

Andy,

Thanks for giving me the opportunity to meet with you about the Analyst/ Associate program.  I enjoyed talking to you, and look forward to contributing to the success that the program has enjoyed.  

Thanks and Best Regards,

Jeff Hammad


In [11]:
#//*** Run parse_email on the HTML example and print the body after markup removal
parsed_html_msg = parse_email(html_msg_example)
print(parsed_html_msg.payload)




Paprika




























        Dear THERESA,
        
        Due to overwhelming demand for the Palm OS® v4.1 Upgrade with Mobile Connectivity, we are 
        extending the special offer of 25% off through November 30, 2001. So there's still time to significantly 
        increase the functionality of your Palm™ III, IIIx, IIIxe, IIIc, V or Vx handheld. Step up to the 
        new Palm OS v4.1 through this extended special offer. You'll receive the brand new Palm OS v4.1 
        for just $29.95 when you use Promo Code OS41WAVE. That's a 
        $10 savings off the list price. 
        
Click here to view a full product demo now.
        


        You can do a lot more with your Palm™ handheld when you upgrade to the Palm OS v4.1. All your 
        favorite features just got even better and there are some terrific new additions:
        
 Handwrite notes and even draw pictures right on your Palm™ handheld
 Tap letters with your stylus and use Graffiti® at the same t

## Assignment 4.3

Notes:

https://www.analyticsvidhya.com/blog/2019/11/build-machine-learning-pipelines-pyspark/

https://towardsdatascience.com/data-transformation-in-pyspark-6a88a6193d92

Pyspark UDF - User Defined Functions:

https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/


In [12]:
## Schema of the struct that holds one parsed email: one nullable field per column
#//*** Every field, Date included, is declared as StringType
email_struct = StructType(
    [StructField(field_name, StringType(), True) for field_name in columns]
)

In [14]:
## Wrap parse_email as a Spark user-defined function so it can be applied to a dataframe column
## The value returned for each row is a struct laid out as email_struct
parse_email_func = udf(f=lambda raw_msg: parse_email(raw_msg), returnType=email_struct)

#//*** Takes the existing df and runs the parse_email_func udf on original_msg,
#//*** turning each parsed field (payload and the email headers) into its own column
def parse_emails(input_df):
    """Return ``input_df`` with one extra string column per parsed email field.

    :param input_df: dataframe with ``username``, ``id`` and ``original_msg`` columns
    :returns: dataframe with those three columns followed by the remaining names in
        ``columns`` (payload, Message_ID, Date, ...), filled from the parsed email
    """
    source_columns = ['username', 'id', 'original_msg']

    #//*** Apply the udf on the original_msg column and keep
    #//*** the resulting struct in a temporary parsed_email column
    with_parsed = input_df.select(
        *source_columns,
        parse_email_func('original_msg').alias('parsed_email'),
    )

    #//*** Extract each (sub)column of parsed_email into a df column.
    #//*** Fields the input already has are skipped: the parser never sees the folder
    #//*** name, so its username field must not overwrite the real username column.
    parsed_fields = [
        with_parsed.parsed_email[column].alias(column)
        for column in columns
        if column not in source_columns
    ]

    #//*** One select replaces a withColumn call per field (each call adds a projection
    #//*** to the plan) and leaves the temporary parsed_email column out of the result
    return with_parsed.select(*source_columns, *parsed_fields)

#//*** Pipeline stage that applies parse_emails, and through it the parse_email_func udf, to a dataframe
class ParseEmailsTransformer(Transformer):
    def _transform(self, dataset):
        """
        Expand the raw emails in the dataset into parsed columns.

        :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`
        :returns: the dataset returned by ``parse_emails``
        """
        parsed_dataset = dataset.transform(parse_emails)
        return parsed_dataset


## Spark pipeline with three stages, in order:
##   1. ParseEmailsTransformer: split each raw email into parsed columns
##   2. Tokenizer: split the payload column into the words column
##   3. CountVectorizer: turn the words column into the features column
email_pipeline = Pipeline().setStages([
    ParseEmailsTransformer(),
    Tokenizer(outputCol="words", inputCol='payload'),
    CountVectorizer(outputCol='features', inputCol='words'),
])

## Fit the pipeline on the email dataframe, then apply the fitted model to the same dataframe
model = email_pipeline.fit(df)
result = model.transform(df)



In [15]:
#//*** Preview the id, tokenized words and count vectors of the first 20 rows
result.select('id', 'words', 'features').show(n=20, truncate=True)

+--------------------+--------------------+--------------------+
|                  id|               words|            features|
+--------------------+--------------------+--------------------+
| \davis-d\2_trash\1_|[, >, , , , , >, ...|(99771,[0,1,2,3,4...|
| \davis-d\2_trash\2_|[fyi..., thanks.,...|(99771,[0,1,2,3,5...|
| \davis-d\2_trash\3_|[----------------...|(99771,[0,1,2,6,7...|
| \davis-d\2_trash\4_|[-----original, m...|(99771,[0,2,6,7,9...|
|\davis-d\2_trash\...|[hi, mommy!, , ye...|(99771,[0,1,2,6,7...|
|\davis-d\2_trash\...|[hey, sweetie,, ,...|(99771,[0,1,7,10,...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,10,25,2...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,1,2,3,5...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,2,3,6,7...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,1,2,3,7...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,1,2,3,4...|
|\davis-d\2_trash\...|[----------------...|(99771,[0,10,25,2...|
|\davis-d\2_trash\...|[, 