In [58]:
# This is the main executablefile of the process mining pipeline
# Author: Florian Lietz
# Last edited
# 

import os, sys, re
import pandas as pd
from os import path
from argparse import ArgumentParser
from stages.utils.utils import parseArgs, DataCleaner
import spacy
f = r"C:\Users\flietz\OneDrive - TU Wien\!Studium\1_MSc\!Diplomarbeit\code\pipeline\resources\dataset\Mail_ApplicationDummy.csv"
# import CSV file
inputFile = pd.read_csv(f, delimiter=";")
inputFile.head()

Unnamed: 0,From,To,Datetime,Message-ID,In-Reply-To,Subject,Content
0,jmayer94@gmail.com,office@ziegler-cat.com,01.10.2001 09:15:00,15779.1995312335.JavaMail.evans@thyme,,Application for an Internship in Summer 2022,"Dear sir or madam,\n\nI hereby want to apply f..."
1,j.parker@ziegler-cat.com,jmayer94@gmail.com,05.10.2021 11:01:00,78445.176352.JavaMail@ziegler,15779.1995312335.JavaMail.evans@thyme,Re: Application for an Internship in Summer 2022,"Dear Mister Mayer,\n\nthank you a lot for your..."
2,j.parker@ziegler-cat.com,jmayer94@gmail.com,12.10.2021 23:11:00,784338.1763212.JavaMail@ziegler,78445.176352.JavaMail@ziegler,Re:Re: Application for an Internship in Summer...,"Dear Mister Mayer,\n\nthank you for your appli..."
3,jmayer94@gmail.com,j.parker@ziegler-cat.com,12.10.2021 08:05:00,127791.19953167485.JavaMail.evans@thyme,784338.1763212.JavaMail@ziegler,Re:Re:Re: Application for an Internship in Sum...,"Dear Ms. Parker,\n\nthank you for your reply a..."
4,j.parker@ziegler-cat.com,"jmayer94@gmail.com, g.sullenberger@ziegler-cat...",13.10.2021 09:33:00,784971.1786612.JavaMail@ziegler,127791.19953167485.JavaMail.evans@thyme,Invitation for Personal Meeting,"Dear Mister Mayer,\n\nWe are happy to hear tha..."


In [59]:
def removeUrl(content):
    return re.sub(r'https?://\S+', '', content)

def removeMultWhitespace(content):
    return re.sub(r' +', ' ', content)

def stripEndClauses(content, clauses):
    clauseIndex = 0
    index = 0
    # Find lowest greetings or end clause index and strip off everything that comes after it
    for item in clauses:
        # needle and haystack both in lowercase to ignore case
        index = content.lower().find(item.lower())
        if index > -1 and (index < clauseIndex or clauseIndex == 0):
            clauseIndex = index
    if clauseIndex > 0:
        return content[:clauseIndex]
    else:
        return content

def stripStartClauses(content, clauses):
    clauseIndex = 0
    index = 0
    # Find lowest greetings or end clause index and strip off everything that comes after it
    for item in clauses:
        # needle and haystack both in lowercase to ignore case
        index = content.lower().find(item.lower())
        if index > -1 and (index > clauseIndex or clauseIndex == 0):
            clauseIndex = index
    if clauseIndex > 0:
        return content[clauseIndex:]
    else:
        return content
    
inputFile["Content"] = inputFile.apply(lambda row: removeMultWhitespace(row["Content"]), axis=1)
inputFile["Content"] = inputFile.apply(lambda row: removeUrl(row["Content"]), axis=1)

# Stripping of greeting phrases and end clauses
startClausesList = ["Dear sir or madam", "To whom it may concern", "Hello,", ",\n\n"]

endGreetingsList = ["Yours sincerely", "Sincerely", "Sincerely yours", "Take care", "Regards",
                 "Warm regards", "Best regards", "Kind regards", "Warmest regards", "Yours truly", "Yours,",
                 "Warmly,", "Warm wishes", "Best,", "Best Wishes", "Thanks in advance", "Thank you in advance",
                 "Thanks in advance", "Thanks,\n", "I am looking forward to hearing", "I'm looking forward to hearing",
                 "I look forward to hearing from you"]

confList = ["The information contained in this communication",
               "The content of this email is confidential", "The content of this e-mail", "This email and attachments (if any) is intended",
               "This email is intended solely", "This e-mail is intended"]

endClausesList = endGreetingsList+confList

inputFile["Content"] = inputFile.apply(lambda row: stripEndClauses(row["Content"], endClausesList), axis=1)
inputFile["Content"] = inputFile.apply(lambda row: stripStartClauses(row["Content"], startClausesList), axis=1)
inputFile["Content"] = inputFile.apply(lambda row: re.sub(r'\n+', '\n', row["Content"]), axis=1)
inputFile["Content"] = inputFile.apply(lambda row: re.sub(r'\n', ' ', row["Content"]), axis=1)

In [60]:
nlp = spacy.load("en_core_web_md")

In [61]:
temp = inputFile["Content"]
temp

0     , I hereby want to apply for the internship po...
1     , thank you a lot for your interest in Ziegler...
2     , thank you for your application and your wait...
3     , thank you for your reply and for your kind i...
4     , We are happy to hear that. My assistant will...
5     , it was a pleasure to meet you in person and ...
6     , Thank you a lot for your interest in our com...
7     , After having screened your application, unfo...
8     , I hereby want to apply for the position of "...
9     , thank you a lot for your appliation which ha...
10    , thank you a lot for your application at Lind...
11    , please find attached the missing transcript....
12    Dear Ms. Bender, thank you for the document. T...
13    , After having assessed your application docum...
14    , thank you for your consideration and the inv...
15    , It was a pleasure to meet you in person. We ...
16    Dear Ms Hoeller, thank you for your email and ...
17    Dear Ms. Bender, thank you for you reply. 

In [62]:
doc = nlp(temp)        
tokens = [token for token in doc]
tokens = [t for t in tokens if t.lemma_ != "\n"]
#for token in tokens: 
#    print(token.lemma_, token.pos_, token.tag_, token.dep_,
#            token.shape_, token.is_alpha, token.is_stop, [child for child in token.children])

# tactic: investigate words and follow children until noun is found
verbs = [v for v in tokens if v.pos_ == "VERB"]

def find_noun_children(verb):
    level = 0
    protocol = []
    orig_children = [c for c in verb.children if c.pos_ != "PUNCT" and c.pos_ != "SPACE"]
    def iterate_children(verb, level, orig_verb, protocol):
        if verb in orig_verb:
            add = "-"
        else:
            add = "#"
        rel_children = [c for c in verb.children if c.pos_ != "PUNCT" and c.pos_ != "SPACE"]
        for child in rel_children:
            if child.pos_ == "NOUN" and child.dep_ != "npadvmod":
                level = level
                protocol.append(child.lemma_+"[Noun]")
                return protocol
            else:
                level = level+1
                if child.pos_ == "VERB":
                    protocol.append(child.lemma_)
                iterate_children(child, level, orig_verb, protocol)
    
    res = iterate_children(verb, 0, orig_children, protocol)
    return protocol
    
res = {v: find_noun_children(v) for v in verbs}
print(res)


ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'pandas.core.series.Series'>.

In [18]:
doc = nlp(temp)        
tokens = [token for token in doc]
tokens = [t for t in tokens if t.lemma_ != "\n"]
#for token in tokens: 
#    print(token.lemma_, token.pos_, token.tag_, token.dep_,
#            token.shape_, token.is_alpha, token.is_stop, [child for child in token.children])

# tactic: investigate words and follow children until noun is found
verbs = [v for v in tokens if v.pos_ == "VERB"]

def find_noun_children(verb):
    level = 0
    protocol = []
    orig_children = [c for c in verb.children if c.pos_ != "PUNCT" and c.pos_ != "SPACE"]
    def iterate_children(verb, level, orig_verb, protocol):
        if verb in orig_verb:
            add = "-"
        else:
            add = "#"
        rel_children = [c for c in verb.children if c.pos_ != "PUNCT" and c.pos_ != "SPACE"]
        for child in rel_children:
            if child.pos_ == "NOUN" and child.dep_ != "npadvmod":
                level = level
                protocol.append(child.lemma_+"[Noun]")
                return protocol
            else:
                level = level+1
                if child.pos_ == "VERB":
                    protocol.append(child.lemma_)
                iterate_children(child, level, orig_verb, protocol)
    
    res = iterate_children(verb, 0, orig_children, protocol)
    return protocol
    
res = {v: find_noun_children(v) for v in verbs}
print(res)
# Create for each document bag of words - tf-idf, to find out overall topic/activity of email


{thank: ['interest[Noun]', 'application', 'position[Noun]'], application: ['position[Noun]'], reaching: ['lot[Noun]'], doing: ['screen[Noun]', 'assess', 'applicant[Noun]'], assess: ['applicant[Noun]'], allow: ['week[Noun]'], find: ['company[Noun]']}


## Textacy Subject-Object-Verb Triples

In [63]:
temp

0     , I hereby want to apply for the internship po...
1     , thank you a lot for your interest in Ziegler...
2     , thank you for your application and your wait...
3     , thank you for your reply and for your kind i...
4     , We are happy to hear that. My assistant will...
5     , it was a pleasure to meet you in person and ...
6     , Thank you a lot for your interest in our com...
7     , After having screened your application, unfo...
8     , I hereby want to apply for the position of "...
9     , thank you a lot for your appliation which ha...
10    , thank you a lot for your application at Lind...
11    , please find attached the missing transcript....
12    Dear Ms. Bender, thank you for the document. T...
13    , After having assessed your application docum...
14    , thank you for your consideration and the inv...
15    , It was a pleasure to meet you in person. We ...
16    Dear Ms Hoeller, thank you for your email and ...
17    Dear Ms. Bender, thank you for you reply. 

In [64]:
import spacy
import textacy
nlp = spacy.load('en_core_web_md')

tuples_list = []          
doc = nlp(temp)
tuples = textacy.extract.subject_verb_object_triples(doc)
if tuples:
    tuples_to_list = list(tuples)
    tuples_list.append(tuples_to_list)
print(tuples_list)

ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'pandas.core.series.Series'>.

## Textacy Keyterms

In [65]:
from textacy.extract import keyterms as kt
kt.textrank(doc)
# maybe filter out names and organisations, then check for nouns and verbs here

[('personal meeting', 0.07264685846855523),
 ('kind invitation', 0.06822644776178241),
 ('fine', 0.03402668454397878),
 ('date', 0.03360346405887551),
 ('reply', 0.03340336269097082)]

## Sentence splitting and extraction of nouns and verbs 

In [66]:
for sentence in doc.sents:
    print([f"{t}, {t.tag_}" for t in sentence if (t.pos_ == "VERB" or t.pos_ == "NOUN" and t.dep_ != "npadvmod")])

['thank, VBP', 'reply, NN', 'invitation, NN']
['proposed, VBN', 'date, NN', 'meeting, NN', 'works, VBZ']
['looking, VBG', 'meeting, VBG']


## Build Message chain

In [None]:
# start with first message, add next if in-reply-to or sender and (receiver match and date not > 14 days)
# convList = []
# conv = Conv(from, to, content, headers)
# convList.add(conv)
# mark added somehow as done

**TODOS**

[x] Split end and greeting clause (everything after sincerely, best regards etc. strip)

[x] Build dataset

Next after Skiing:

* Build message chain using Message-ID, receiver/sender info and datetime


* Extract verb noun pairs and find way to rank them (e.g., combination of above solutions
* Cluster using LDA or similar - all messages to gather super categories, probability of fitting labels
* NER for names, skills, positions, activities (application, assess, invite interview, clarify, hire yes/no)
* Build cases using similarity measure


In [67]:
a = ""
a+="b"
a

'b'

In [69]:
a = "Hallo, wie gehts"
for word in a:
    print(f"{word} <-")

H <-
a <-
l <-
l <-
o <-
, <-
  <-
w <-
i <-
e <-
  <-
g <-
e <-
h <-
t <-
s <-
