In [65]:
import os
import pandas as pd
import numpy as np 
import plotly
import re
import explainerdashboard
from d3blocks import d3blocks
from nltk.corpus import stopwords
import string
import nltk

In [66]:
# Fetch the NLTK stop-word corpus (reported as up-to-date when already present).
nltk.download('stopwords')
# Keep the English stop words in a set for fast membership tests.
sw = set(stopwords.words('english'))
# Peek at ten entries; a set is unordered, so which ten appear is arbitrary.
list(sw)[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Heather\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['needn',
 'out',
 'few',
 "it's",
 'of',
 'that',
 'after',
 "needn't",
 "weren't",
 'off']

In [67]:
# Show the characters in string.punctuation as a set.
print(set(string.punctuation))

{'^', '!', '_', ',', '`', '&', '~', '+', '|', ':', '{', ')', '$', "'", '=', '.', '>', '?', '%', '@', '"', '[', '*', '}', '-', '\\', '(', '<', '#', '/', ';', ']'}


### Employee Records

In [5]:
# Load the 'Employee Records' sheet of the workbook and preview the first two rows.
employee_records_df = pd.read_excel(
    'EmployeeRecords.xlsx',
    sheet_name='Employee Records',
)
employee_records_df.head(2)

Unnamed: 0,LastName,FirstName,BirthDate,BirthCountry,Gender,CitizenshipCountry,CitizenshipBasis,CitizenshipStartDate,PassportCountry,PassportIssueDate,PassportExpirationDate,CurrentEmploymentType,CurrentEmploymentTitle,CurrentEmploymentStartDate,EmailAddress,MilitaryServiceBranch,MilitaryDischargeType,MilitaryDischargeDate
0,Bramar,Mat,1981-12-19,Tethys,Male,Tethys,BirthNation,1981-12-19,Tethys,2007-12-12,2017-12-11,Administration,Assistant to CEO,2005-07-01,Mat.Bramar@gastech.com.kronos,,,NaT
1,Ribera,Anda,1975-11-17,Tethys,Female,Tethys,BirthNation,1975-11-17,Tethys,2009-06-15,2019-06-14,Administration,Assistant to CFO,2009-10-30,Anda.Ribera@gastech.com.kronos,,,NaT


For `datetime64[ns]` columns, missing values are represented by `NaT` (“Not a Time”).

`NaN` is the default missing-value marker, chosen for reasons of computational speed and convenience. In many cases, however, the Python `None` will also arise, and we wish to consider that “missing”, “not available”, or “NA” as well.

In [6]:
# Load the 'Index' sheet of the same workbook and preview the first three rows.
index_employee_records_df = pd.read_excel(
    'EmployeeRecords.xlsx',
    sheet_name='Index',
)
index_employee_records_df.head(3)

Unnamed: 0,Field Name,Description
0,LastName,the last name of the employee
1,FirstName,the first name of the employee
2,BirthDate,the birth date of the employee


In [11]:
# Dimensions of the employee table as (rows, columns).
employee_records_df.shape

(54, 18)

In [8]:
# Inspect the dtype pandas assigned to each column.
employee_records_df.dtypes

LastName                              object
FirstName                             object
BirthDate                     datetime64[ns]
BirthCountry                          object
Gender                                object
CitizenshipCountry                    object
CitizenshipBasis                      object
CitizenshipStartDate          datetime64[ns]
PassportCountry                       object
PassportIssueDate             datetime64[ns]
PassportExpirationDate        datetime64[ns]
CurrentEmploymentType                 object
CurrentEmploymentTitle                object
CurrentEmploymentStartDate    datetime64[ns]
EmailAddress                          object
MilitaryServiceBranch                 object
MilitaryDischargeType                 object
MilitaryDischargeDate         datetime64[ns]
dtype: object

In [10]:
# Number of missing values in each column.
employee_records_df.isnull().sum()

LastName                       0
FirstName                      0
BirthDate                      0
BirthCountry                   0
Gender                         0
CitizenshipCountry             0
CitizenshipBasis               0
CitizenshipStartDate           0
PassportCountry               21
PassportIssueDate             21
PassportExpirationDate        21
CurrentEmploymentType          0
CurrentEmploymentTitle         0
CurrentEmploymentStartDate     0
EmailAddress                   0
MilitaryServiceBranch         27
MilitaryDischargeType         27
MilitaryDischargeDate         27
dtype: int64

The 27 missing values in each of the three `Military*` columns mean that those employees did not serve in a military branch.

The 21 missing values in each of the three `Passport*` columns mean that those employees did not own a passport.

In [20]:
# d3 = d3blocks.D3Blocks()

[d3blocks] >INFO> Cleaning edge_properties and config parameters..


### Employee Emails

In [60]:
# cp1252 is a single-byte character encoding of the Latin alphabet.
email_df = pd.read_csv(
    'emailheaders.csv',
    encoding='cp1252',
)
email_df.head()

Unnamed: 0,From,To,Date,Subject
0,Sven.Flecha@gastech.com.kronos,"Isak.Baza@gastech.com.kronos, Lucas.Alcazar@ga...",1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon.Herrero@gastech.com.kronos,"Felix.Resumir@gastech.com.kronos, Hideki.Cocin...",1/6/2014 8:58,Inspection request for site
2,Bertrand.Ovan@gastech.com.kronos,"Emile.Arpa@gastech.com.kronos, Varro.Awelon@ga...",1/6/2014 9:28,New refueling policies - Effective February 1
3,Valeria.Morlun@gastech.com.kronos,"Dante.Coginian@gastech.com.kronos, Albina.Hafo...",1/6/2014 9:38,Route suggestion for next shift
4,Mat.Bramar@gastech.com.kronos,"Rachel.Pantanal@gastech.com.kronos, Lars.Azada...",1/6/2014 9:49,Upcoming birthdays


In [61]:
# Quick structural check of the email table: dimensions, then per-column dtypes.
print(email_df.shape)
print(email_df.dtypes)

(1170, 4)
From       object
To         object
Date       object
Subject    object
dtype: object


In [62]:
def stripEmail(email):
    """
    Remove the "@domain" part of every address found in the input.

    The pattern matches "@", one lowercase letter, a run of lowercase
    letters / "." / "+", and then one further character of any kind. Inside a
    comma-separated list that last character is the "," following the domain,
    so the separator is removed together with the domain.
    Params: email: a single address or a comma-separated list of addresses
    Returns: the input with each matched "@domain" span deleted
    """
    domain_pattern = r"\b@[a-z][a-z.+]+."
    return re.sub(domain_pattern, "", email)

def addSplit(email):
    """
    Replace every "." in the input with a single space.

    Params: email: the string to process (e.g. "First.Last")
    Returns: the string with each "." replaced by " " (e.g. "First Last")
    """
    # Raw string: in a normal string literal the backslash-dot pair is an
    # invalid escape sequence, which Python reports with a warning at compile time.
    return re.sub(r"\.", " ", email)

# Reduce each sender address to a readable name: drop the domain, then turn dots into spaces.
email_df['From'] = email_df['From'].apply(stripEmail).apply(addSplit)
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,"Isak.Baza@gastech.com.kronos, Lucas.Alcazar@ga...",1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon Herrero,"Felix.Resumir@gastech.com.kronos, Hideki.Cocin...",1/6/2014 8:58,Inspection request for site
2,Bertrand Ovan,"Emile.Arpa@gastech.com.kronos, Varro.Awelon@ga...",1/6/2014 9:28,New refueling policies - Effective February 1


In [63]:
# Same treatment for the recipient list: drop the domains, then turn dots into spaces.
# stripEmail also removes the comma after each domain, so names end up space-separated.
email_df['To'] = email_df['To'].apply(stripEmail).apply(addSplit)
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,1/6/2014 8:58,Inspection request for site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,1/6/2014 9:28,New refueling policies - Effective February 1


In [64]:
# Raw values are M/D/YYYY followed by a time (e.g. "1/6/2014 8:39").
# errors='raise' makes any value that cannot be parsed stop the cell.
email_df['Date'] = pd.to_datetime(
    email_df['Date'],
    errors='raise',
)
email_df.dtypes

From               object
To                 object
Date       datetime64[ns]
Subject            object
dtype: object

In [68]:
punctuation = None 

def clean(text, stopwords) -> str:
    """
    Clean a text sentence.

    Steps: drop "<...>" tag-like spans, split on whitespace, strip leading and
    trailing punctuation from each token, delete any remaining commas and
    quote characters, skip tokens containing "/", skip stop words, and
    lower-case what is left.
    Params: text: the string to clean
    stopwords: a collection of lower-case stop words to remove from the input
        (e.g. a set of NLTK stop words); only membership tests are performed
    Returns: the surviving lower-cased tokens joined by single spaces; an
        empty string when nothing survives (including empty or
        whitespace-only input)
    """
    # Remove tag-like spans; a "?" between the brackets prevents a match.
    text = re.sub(r'<[^>?]*>', '', text)
    text_words = []

    for word in text.split():
        # str.strip with a character set trims punctuation from both ends.
        word = word.strip(string.punctuation)
        # Commas and quotes can still sit inside a token (e.g. "don't").
        word = word.replace(',', '').replace('"', '').replace("'", '')

        if not word or "/" in word:
            continue
        lowered = word.lower()
        if lowered not in stopwords:
            text_words.append(lowered)

    # Join once, after the loop. Joining inside the loop left the result
    # unassigned (UnboundLocalError) whenever the text produced no tokens.
    return " ".join(text_words)

# Run clean() over every subject line; sw is forwarded as its stopwords argument.
email_df['Subject'] = email_df['Subject'].apply(
    clean,
    stopwords=sw,
)
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,2014-01-06 08:39:00,gt-seismicprocessorpro bug report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,2014-01-06 08:58:00,inspection request site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,2014-01-06 09:28:00,new refueling policies effective february 1


### News Articles

News articles contain historical information.

In [76]:
def extract_article_name(name) -> str:
    """
    Build a short article label from a file name.

    Params:
        name (str): file name, e.g. "10.txt"
    Returns:
        str: "article " followed by the name with everything from its
            first "." onward removed, e.g. "article 10"
    """
    stem = re.sub(r"\.+.*", "", name)
    return "article " + stem

def extract_data(rootdir, news_article) -> dict:
    """
    Read every article file found under the root directory into a dict.

    Params:
        rootdir (Path): path to search (recursively) for article files
        news_article (dict): dict to fill with <article, description>;
            it is updated in place and also returned
    Returns:
        news_article: the same dict, with one <article, description> entry
            per file read. Keys come from extract_article_name(file name), so
            two files that map to the same key overwrite each other.
    Raises:
        AssertionError: if rootdir does not exist
    """
    assert os.path.exists(rootdir), "rootdir does not exist: {!r}".format(rootdir)
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Same selection rule as before: any file whose name contains "txt".
            if 'txt' not in file:
                continue
            nav_file = os.path.join(subdir, file)
            # open() is inside the try so that a file which cannot be opened is
            # reported and skipped like any other per-file failure, instead of
            # aborting the whole walk.
            # NOTE(review): no encoding is passed, so the platform default is
            # used when decoding — confirm the encoding of the article files.
            try:
                # The with-block closes the file; no explicit close() is needed.
                with open(nav_file, 'r') as datafile:
                    news_name = extract_article_name(file)
                    news_article[news_name] = datafile.read()
            except Exception as ex:
                print("Failed to parse article: ", ex)
    return news_article

# Collect the text of every article under 'articles/' into a dict keyed by article label.
rootdir = 'articles/'
news_article = extract_data(rootdir, {})

# One row per article: the dict key becomes the 'index' column, the text goes in 'description'.
articles_df = (
    pd.DataFrame.from_dict(news_article, orient='index', columns=['description'])
    .reset_index()
)
print(articles_df.shape)
articles_df.head()

(845, 2)


Unnamed: 0,index,description
0,article 0,The Orb\n\n\nBUMP OF PROTESTS IN ABILA IN RESP...
1,article 1,The Light of Truth\n\nENORMOUS IPO MAKES THE B...
2,article 10,Homeland Illumination\nVOICES - a blog about w...
3,article 100,The Continent\n\nFour people have died in an e...
4,article 101,Daily Pegasus\n\nTHE DEMONSTRATION ATTRACTS TH...


In [71]:
# Run clean() over every article body; sw is forwarded as its stopwords argument.
# The raw text in 'description' is overwritten by the cleaned version.
articles_df['description'] = articles_df['description'].apply(
    clean,
    stopwords=sw,
)
articles_df.head()

Unnamed: 0,index,description
0,article 0,orb bump protests abila response calls action ...
1,article 1,light truth enormous ipo makes billionaire san...
2,article 10,homeland illumination voices blog important pe...
3,article 100,continent four people died enthusiastic discha...
4,article 101,daily pegasus demonstration attracts thousands...


### Resumes

Using resumes to look up historical employee data

In [None]:
# NOTE(review): 'resumes/' ends with a path separator, so it looks like a
# directory, whereas pd.read_csv reads a single file — and this cell has no
# execution count (it was never run). Confirm the intended file(s) and format.
resume_df = pd.read_csv('resumes/')