In [359]:
import os
import pandas as pd
import numpy as np 
import plotly
import re
import explainerdashboard
from d3blocks import d3blocks
from nltk.corpus import stopwords
import string
import nltk
import docx
from collections.abc import Iterable
from pattern.en import sentiment, subjectivity
import seaborn as sns
import plotly.express as px

In [360]:
nltk.download('stopwords')
sw = set(stopwords.words('english'))
list(sw)[0:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Heather\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


["that'll",
 'hadn',
 'me',
 'its',
 'wasn',
 "mightn't",
 'theirs',
 'up',
 'against',
 'once']

In [361]:
print(set(string.punctuation))

{"'", ';', '}', ':', '.', ']', '^', '~', '"', ')', ',', '/', '=', '<', '_', '|', '$', '[', '?', '%', '\\', '`', '+', '#', '&', '!', '@', '>', '(', '*', '-', '{'}


### Employee Records

In [362]:
employee_records_df = pd.read_excel('EmployeeRecords.xlsx', sheet_name='Employee Records')
employee_records_df.head(2)

Unnamed: 0,LastName,FirstName,BirthDate,BirthCountry,Gender,CitizenshipCountry,CitizenshipBasis,CitizenshipStartDate,PassportCountry,PassportIssueDate,PassportExpirationDate,CurrentEmploymentType,CurrentEmploymentTitle,CurrentEmploymentStartDate,EmailAddress,MilitaryServiceBranch,MilitaryDischargeType,MilitaryDischargeDate
0,Bramar,Mat,1981-12-19,Tethys,Male,Tethys,BirthNation,1981-12-19,Tethys,2007-12-12,2017-12-11,Administration,Assistant to CEO,2005-07-01,Mat.Bramar@gastech.com.kronos,,,NaT
1,Ribera,Anda,1975-11-17,Tethys,Female,Tethys,BirthNation,1975-11-17,Tethys,2009-06-15,2019-06-14,Administration,Assistant to CFO,2009-10-30,Anda.Ribera@gastech.com.kronos,,,NaT


For datetime64[ns] types, NaT represents missing values. (Not a valid time). 

While NaN is the default missing value marker for reasons of computational speed and convenience. In many cases, however, the Python `None` will arise and we wish to also consider that “missing” or “not available” or “NA”. 

In [363]:
index_employee_records_df = pd.read_excel('EmployeeRecords.xlsx', sheet_name='Index')
index_employee_records_df.head(3)

Unnamed: 0,Field Name,Description
0,LastName,the last name of the employee
1,FirstName,the first name of the employee
2,BirthDate,the birth date of the employee


In [364]:
employee_records_df.shape

(54, 18)

In [365]:
employee_records_df.dtypes

LastName                              object
FirstName                             object
BirthDate                     datetime64[ns]
BirthCountry                          object
Gender                                object
CitizenshipCountry                    object
CitizenshipBasis                      object
CitizenshipStartDate          datetime64[ns]
PassportCountry                       object
PassportIssueDate             datetime64[ns]
PassportExpirationDate        datetime64[ns]
CurrentEmploymentType                 object
CurrentEmploymentTitle                object
CurrentEmploymentStartDate    datetime64[ns]
EmailAddress                          object
MilitaryServiceBranch                 object
MilitaryDischargeType                 object
MilitaryDischargeDate         datetime64[ns]
dtype: object

In [366]:
employee_records_df.isnull().sum()

LastName                       0
FirstName                      0
BirthDate                      0
BirthCountry                   0
Gender                         0
CitizenshipCountry             0
CitizenshipBasis               0
CitizenshipStartDate           0
PassportCountry               21
PassportIssueDate             21
PassportExpirationDate        21
CurrentEmploymentType          0
CurrentEmploymentTitle         0
CurrentEmploymentStartDate     0
EmailAddress                   0
MilitaryServiceBranch         27
MilitaryDischargeType         27
MilitaryDischargeDate         27
dtype: int64

The missingness means the employee did not serve in a military branch.

The missingness from passport means they did not own a passport.

#### Feature Engineering: Create Full Name column

In [367]:
employee_records_df['FullName'] = employee_records_df['FirstName'] + ' ' + employee_records_df['LastName']

### Employee Emails

In [368]:
# single-byte charcter encoding of the latin alphabet
email_df = pd.read_csv('emailheaders.csv', encoding='cp1252')
email_df.head()

Unnamed: 0,From,To,Date,Subject
0,Sven.Flecha@gastech.com.kronos,"Isak.Baza@gastech.com.kronos, Lucas.Alcazar@ga...",1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon.Herrero@gastech.com.kronos,"Felix.Resumir@gastech.com.kronos, Hideki.Cocin...",1/6/2014 8:58,Inspection request for site
2,Bertrand.Ovan@gastech.com.kronos,"Emile.Arpa@gastech.com.kronos, Varro.Awelon@ga...",1/6/2014 9:28,New refueling policies - Effective February 1
3,Valeria.Morlun@gastech.com.kronos,"Dante.Coginian@gastech.com.kronos, Albina.Hafo...",1/6/2014 9:38,Route suggestion for next shift
4,Mat.Bramar@gastech.com.kronos,"Rachel.Pantanal@gastech.com.kronos, Lars.Azada...",1/6/2014 9:49,Upcoming birthdays


In [369]:
print(email_df.shape)
print(email_df.dtypes)

(1170, 4)
From       object
To         object
Date       object
Subject    object
dtype: object


In [370]:
def stripEmail(email):
    return re.sub(pattern=r"\br\.|@[a-z][a-z.+]+.", repl="", string=email)

def addSplit(email):
    return re.sub(pattern="\.", repl=" ", string=email)

email_df['From'] = email_df['From'].apply(lambda x: stripEmail(x))
email_df['From'] = email_df['From'].apply(lambda x: addSplit(x))
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,"Isak.Baza@gastech.com.kronos, Lucas.Alcazar@ga...",1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon Herrero,"Felix.Resumir@gastech.com.kronos, Hideki.Cocin...",1/6/2014 8:58,Inspection request for site
2,Bertrand Ovan,"Emile.Arpa@gastech.com.kronos, Varro.Awelon@ga...",1/6/2014 9:28,New refueling policies - Effective February 1


In [371]:
email_df['To'] = email_df['To'].apply(lambda x: stripEmail(x))
email_df['To'] = email_df['To'].apply(lambda x: addSplit(x))
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,1/6/2014 8:39,GT-SeismicProcessorPro Bug Report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,1/6/2014 8:58,Inspection request for site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,1/6/2014 9:28,New refueling policies - Effective February 1


In [372]:
# M/D/YYYY
email_df['Date'] = pd.to_datetime(email_df['Date'], errors='raise')
email_df.dtypes

From               object
To                 object
Date       datetime64[ns]
Subject            object
dtype: object

In [373]:
print(email_df.shape)

(1170, 4)


In [374]:
punctuation = None 

def clean(text, stopwords) -> str:
    """
    Clean text sentence 
    Params: text: the string to clean
    stopwords: a list of NLTK stopwords to remove from input row
    Returns: cleaned sentence
    """ 
    text = re.sub(r'<[^>?]*>', '', text)
    text_list = text.split()
    text_words = []
    punctuation = set(string.punctuation)
    
    for word in text_list: 
        while len(word) > 0 and word[0] in punctuation: 
            word = word[1:]
        
        while len(word) > 0 and word[-1] in punctuation: 
            word = word[:-1]
            
        word = word.replace(',', '')
        word = word.replace('\"', '')
        word = word.replace('\'', '')

        if len(word) > 0 and "/" not in word: 
            if word.lower() not in stopwords: 
                text_words.append(word.lower())
        cleaner_text = " ".join(text_words)
    return cleaner_text

email_df['Subject'] = email_df['Subject'].apply(clean, stopwords=sw)
email_df.head(3)

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,2014-01-06 08:39:00,gt-seismicprocessorpro bug report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,2014-01-06 08:58:00,inspection request site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,2014-01-06 09:28:00,new refueling policies effective february 1


### News Articles

News articles contain historical information.

In [332]:
def extract_article_name(name) -> str:
    """
    Params: str: article name
    Returns:
        str: shorten file name
    """
    return "article " + re.sub(pattern=r"\.+.*", repl="", string=name)

def extract_data(rootdir, news_article) -> dict:
    """
    Params:
        rootdir (Path): path to search for article files
        news_article (dict): empty dict to fill of <article, description>
    Returns:
        news_article: full <article, description> dict
    """
    assert os.path.exists(rootdir)
    for subdir, dirs, files in os.walk(rootdir):
        assert os.path.exists(subdir)
        for file in files:
            if file.__contains__('txt'):
                nav_file = os.path.join(subdir, file)
                with open(nav_file, 'r') as datafile:
                    try:
                        news_name = extract_article_name(file)
                        news_article[news_name] = datafile.read()
                        datafile.close()
                    except Exception as ex:
                        print("Failed to parse article: ", ex)
                        continue
    return news_article

news_article = {}
rootdir = 'articles/'
news_article = extract_data(rootdir, news_article)
articles_df = pd.DataFrame.from_dict(data=news_article, orient='index', columns=['description'])
articles_df.reset_index(inplace=True)
print(articles_df.shape)
articles_df.head()

(845, 2)


Unnamed: 0,index,description
0,article 0,The Orb\n\n\nBUMP OF PROTESTS IN ABILA IN RESP...
1,article 1,The Light of Truth\n\nENORMOUS IPO MAKES THE B...
2,article 10,Homeland Illumination\nVOICES - a blog about w...
3,article 100,The Continent\n\nFour people have died in an e...
4,article 101,Daily Pegasus\n\nTHE DEMONSTRATION ATTRACTS TH...


In [333]:
articles_df['description'] = articles_df['description'].apply(clean, stopwords=sw)
articles_df.head()

Unnamed: 0,index,description
0,article 0,orb bump protests abila response calls action ...
1,article 1,light truth enormous ipo makes billionaire san...
2,article 10,homeland illumination voices blog important pe...
3,article 100,continent four people died enthusiastic discha...
4,article 101,daily pegasus demonstration attracts thousands...


### Resumes

Using resumes to look up historical employee data

In [334]:
def extract_resume_name(name) -> str:
    """
    Params: str: resume name
    Returns:
        str: shorten file name
    """
    name = re.sub(pattern=r"\.+.*|Bio|Resume", repl="", string=name)
    name = re.sub(pattern=r"-", repl=" ", string=name)
    return name

def extract_data(rootdir, resumes) -> dict:
    """
    Params:
        rootdir (Path): path to search for article files
        news_article (dict): empty dict to fill of <article, description>
    Returns:
        news_article: full <article, description> dict
    """
    assert os.path.exists(rootdir)
    for subdir, dirs, files in os.walk(rootdir):
        assert os.path.exists(subdir)
        for file in files:
            if file.__contains__('docx'):
                nav_file = os.path.join(subdir, file)
                with open(nav_file, 'r') as datafile:
                    try:
                        resume_name = extract_resume_name(file)
                        doc = docx.Document(nav_file)
                        fullText = []
                        for para in doc.paragraphs:
                            fullText.append(para.text)
                        resumes[resume_name] = '\n'.join(fullText)
                        datafile.close()
                    except Exception as ex:
                        print("Failed to parse resume: ", ex)
                        continue
    return resumes

resumes = {}
rootdir = 'resumes/'
resumes = extract_data(rootdir, resumes)
resumes_df = pd.DataFrame.from_dict(data=resumes, orient='index', columns=['resume'])
resumes_df.reset_index(inplace=True)
print(resumes_df.shape)
resumes_df.head()

(35, 2)


Unnamed: 0,index,resume
0,Ada Campo,Ada Campo-Corrente\nChief Information Officer\...
1,Ingrid Barranco,Ingrid Barranco\nChief Financial Officer\n\n\n...
2,OrhanStrum,Orhan Strum\nChief Operating Officer\n\n\nMr. ...
3,Sten SanjorgeJr,"Sten Sanjorge, Jr\nPresident & CEO GASTech\nTe..."
4,Willem Vasco,Willem Vasco-Pais\nEnvironmental Safety Adviso...


In [335]:
def clean_text_resume(text):
    text = re.sub(r'<[^>?]*>', '', text)
    text_list = text.split()
    text_words = []
    punctuation = set(string.punctuation)
    
    for word in text_list: 
        while len(word) > 0 and word[0] in punctuation: 
            word = word[1:]
        
        while len(word) > 0 and word[-1] in punctuation: 
            word = word[:-1]
            
        word = word.replace(',', '')
        word = word.replace('\"', '')
        word = word.replace('\'', '')

        if len(word) > 0 and "/" not in word: 
            text_words.append(word.lower())
        cleaner_text = " ".join(text_words)
    return cleaner_text

resumes_df['resume'] = resumes_df['resume'].apply(clean_text_resume)
resumes_df.head()

Unnamed: 0,index,resume
0,Ada Campo,ada campo-corrente chief information officer m...
1,Ingrid Barranco,ingrid barranco chief financial officer ms ing...
2,OrhanStrum,orhan strum chief operating officer mr strum s...
3,Sten SanjorgeJr,sten sanjorge jr president ceo gastech tethys ...
4,Willem Vasco,willem vasco-pais environmental safety advisor...


### Historical Documents

In [336]:
def extract_doc_name(name) -> str:
    """
    Params: str: resume name
    Returns:
        str: shorten file name
    """
    return re.match(pattern=r"^([0-9]+ year).*$", string=name).group(1)

def extract_data(rootdir, hist_doc) -> dict:
    """
    Params:
        rootdir (Path): path to search for files
        news_article (dict): empty dict to fill of <article, description>
    Returns:
        news_article: full <article, description> dict
    """
    assert os.path.exists(rootdir)
    for subdir, dirs, files in os.walk(rootdir):
        assert os.path.exists(subdir)
        for file in files:
            if file.__contains__('docx'):
                nav_file = os.path.join(subdir, file)
                with open(nav_file, 'r') as datafile:
                    try:
                        hist_name = extract_doc_name(file)
                        doc = docx.Document(nav_file)
                        fullText = []
                        for para in doc.paragraphs:
                            fullText.append(para.text)
                        hist_doc[hist_name] = '\n'.join(fullText)
                        datafile.close()
                    except Exception as ex:
                        print("Failed to parse resume: ", ex)
                        continue
    return hist_doc

historical_doc = {}
rootdir = 'HistoricalDocuments/'
historical_doc = extract_data(rootdir, historical_doc)
historical_df = pd.DataFrame.from_dict(data=historical_doc, orient='index', columns=['description'])
historical_df.reset_index(inplace=True)
print(historical_df.shape)
historical_df.head()

(2, 2)


Unnamed: 0,index,description
0,10 year,\n\n<EXCERPTS from>The Application and Validat...
1,5 year,History of the Protectors of Kronos\nA Psycorp...


In [337]:
historical_df['description'].apply(clean_text_resume)
historical_df.head()

Unnamed: 0,index,description
0,10 year,\n\n<EXCERPTS from>The Application and Validat...
1,5 year,History of the Protectors of Kronos\nA Psycorp...


### FactBook

In [338]:
def extract_doc_name(name) -> str:
    """
    Params: str: resume name
    Returns:
        str: shorten file name
    """
    return re.match(pattern=r"^.*?(?=\.)", string=name).group(0)

def extract_data(rootdir, hist_doc) -> dict:
    """
    Params:
        rootdir (Path): path to search for files
        news_article (dict): empty dict to fill of <file name, description>
    Returns:
        news_article: full <file name, description> dict
    """
    assert os.path.exists(rootdir)
    for subdir, dirs, files in os.walk(rootdir):
        assert os.path.exists(subdir)
        for file in files:
            if file.__contains__('docx'):
                nav_file = os.path.join(subdir, file)
                with open(nav_file, 'r') as datafile:
                    try:
                        print(file)
                        file_name = str.lower(extract_doc_name(file))
                        doc = docx.Document(nav_file)
                        fullText = []
                        for para in doc.paragraphs:
                            fullText.append(para.text)
                        factbook_doc[file_name] = '\n'.join(fullText)
                        datafile.close()
                    except Exception as ex:
                        print("Failed to parse factbook: ", ex)
                        continue
    return factbook_doc

factbook_doc = {}
rootdir = 'factbook/'
factbook_doc = extract_data(rootdir, factbook_doc)
factbook_df = pd.DataFrame.from_dict(data=factbook_doc, orient='index', columns=['description'])
factbook_df.reset_index(inplace=True)
print(factbook_df.shape)
factbook_df.head()

FACTBOOK-Kronos.docx
FACTBOOK-Tethys.docx
(2, 2)


Unnamed: 0,index,description
0,factbook-kronos,FACTBOOK :: Kronos \nIntroduction\nBackground:...
1,factbook-tethys,FACTBOOK :: Tethys\n\nIntroduction\nBackground...


In [339]:
factbook_df['description'].apply(clean_text_resume)
factbook_df.head()

Unnamed: 0,index,description
0,factbook-kronos,FACTBOOK :: Kronos \nIntroduction\nBackground:...
1,factbook-tethys,FACTBOOK :: Tethys\n\nIntroduction\nBackground...


### Email Analysis

In [340]:
email_df.head()

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,2014-01-06 08:39:00,gt-seismicprocessorpro bug report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,2014-01-06 08:58:00,inspection request site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,2014-01-06 09:28:00,new refueling policies effective february 1
3,Valeria Morlun,Dante Coginian Albina Hafon Benito Hawelon Hen...,2014-01-06 09:38:00,route suggestion next shift
4,Mat Bramar,Rachel Pantanal Lars Azada Felix Balas Isande ...,2014-01-06 09:49:00,upcoming birthdays


In [341]:
groupby_name = email_df.copy()
groupby_name = email_df.groupby(by=['From'])
groupby_name = groupby_name.count().sort_values(by='To', ascending=False)
groupby_name.head()

Unnamed: 0_level_0,To,Date,Subject
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lucas Alcazar,59,59,59
Nils Calixto,54,54,54
Isak Baza,50,50,50
Sven Flecha,42,42,42
Mat Bramar,42,42,42



Create a commuication network within the GASTech organization and Protectors of Kronos members.

In [342]:
total_email_count = email_df.shape[0]
groupby_name['Frequency'] = groupby_name['To'].apply(lambda x: (x/total_email_count)*100)
groupby_name.head()

Unnamed: 0_level_0,To,Date,Subject,Frequency
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lucas Alcazar,59,59,59,5.042735
Nils Calixto,54,54,54,4.615385
Isak Baza,50,50,50,4.273504
Sven Flecha,42,42,42,3.589744
Mat Bramar,42,42,42,3.589744


In [343]:
comm_network = email_df.groupby(by=['From'])
comm_network.head()

Unnamed: 0,From,To,Date,Subject
0,Sven Flecha,Isak Baza Lucas Alcazar,2014-01-06 08:39:00,gt-seismicprocessorpro bug report
1,Kanon Herrero,Felix Resumir Hideki Cocinaro Inga Ferro Varja...,2014-01-06 08:58:00,inspection request site
2,Bertrand Ovan,Emile Arpa Varro Awelon Dante Coginian Albina ...,2014-01-06 09:28:00,new refueling policies effective february 1
3,Valeria Morlun,Dante Coginian Albina Hafon Benito Hawelon Hen...,2014-01-06 09:38:00,route suggestion next shift
4,Mat Bramar,Rachel Pantanal Lars Azada Felix Balas Isande ...,2014-01-06 09:49:00,upcoming birthdays
...,...,...,...,...
750,Minke Mies,Inga Ferro,2014-01-14 15:36:00,late meeting
821,Marin Onda,Emile Arpa,2014-01-15 11:17:00,files
829,Linnea Bergen,Claudio Hawelon,2014-01-15 12:10:00,favor borrow hedge trimmer
848,Birgitta Frente,Marin Onda Brand Tempestad Elsa Orilla Axel Ca...,2014-01-15 13:26:00,wellhead flow rate data overpressure well 1783-03


How frequently do people contact each other? ie. Ada contacted Felix 2 times.

In [344]:
emails_sent = pd.DataFrame()
for employee_from_name, employee_to_name in comm_network:
    name_map_frequency = {}
    for arr_name in employee_to_name['To']:
        arr_name = re.findall(r"\w+ \w+", arr_name)
        counter = 0
        for index in arr_name:
            counter+=1
            name_map_frequency[index] = name_map_frequency.get(index, 0) + counter
    emails_sent[employee_from_name] = name_map_frequency
emails_sent = emails_sent.T





In [345]:
emails_sent.shape

(54, 65)

In [346]:
emails_sent.fillna(0, inplace=True)
emails_sent.head()

Unnamed: 0,Felix Resumir,Rachel Pantanal,Lars Azada,Felix Balas,Isande Borrasca,Axel Calzas,Gustav Cazar,Lidelse Dedos,Birgitta Frente,Vira Frente,...,Mies Haber,Anda Ribera,Sten Sanjorge,Corrente Ingrid,Pais Ada,Corrente Orhan,Strum Willem,Pais Sten,Pais Ingrid,Ruscella Mies
Ada Campo-Corrente,2.0,2.0,4.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,...,110.0,112.0,14.0,7.0,6.0,22.0,6.0,10.0,4.0,1.0
Adan Morlun,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,55.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adra Nubarron,0.0,3.0,20.0,24.0,20.0,19.0,34.0,37.0,39.0,39.0,...,165.0,168.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
Albina Hafon,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Anda Ribera,0.0,40.0,4.0,3.0,4.0,5.0,8.0,7.0,8.0,9.0,...,55.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0


In [347]:
print(emails_sent.columns)

Index(['Felix Resumir', 'Rachel Pantanal', 'Lars Azada', 'Felix Balas',
       'Isande Borrasca', 'Axel Calzas', 'Gustav Cazar', 'Lidelse Dedos',
       'Birgitta Frente', 'Vira Frente', 'Adra Nubarron', 'Marin Onda',
       'Kare Orilla', 'Elsa Orilla', 'Brand Tempestad', 'Ingrid Barranco',
       'Ada Campo', 'Corrente Sten', 'Sanjorge Jr', 'Orhan Strum',
       'Willem Vasco', 'Pais Bertrand', 'Ovan Albina', 'Hafon Benito',
       'Hawelon Claudio', 'Hawelon Henk', 'Mies Valeria', 'Morlun Adan',
       'Morlun Cecilia', 'Morluniau Irene', 'Nant Dylan', 'Scozzese Lucas',
       'Alcazar Isak', 'Baza Linnea', 'Bergen Nils', 'Calixto Sven',
       'Flecha Hideki', 'Cocinaro Inga', 'Ferro Loreto', 'Bodrogi Isia',
       'Vann Stenig', 'Fusil Hennie', 'Osvaldo Kanon', 'Herrero Varja',
       'Lagos Minke', 'Mies Felix', 'Resumir Edvard', 'Vann Emile',
       'Arpa Varro', 'Awelon Mat', 'Bramar Dante', 'Coginian Carla',
       'Forluniau Linda', 'Lagos Cornelia', 'Lais Ruscella', 'Mies Ha

## #TODO i have an off by 1 error

In [357]:
for employee in emails_sent.columns:
    try: 
        assert (employee_records_df['FullName'].eq(employee)).any()
    except Exception as ex:
        print(employee)

Ada Campo
Corrente Sten
Sanjorge Jr
Willem Vasco
Pais Bertrand
Ovan Albina
Hafon Benito
Hawelon Claudio
Hawelon Henk
Mies Valeria
Morlun Adan
Morlun Cecilia
Morluniau Irene
Nant Dylan
Scozzese Lucas
Alcazar Isak
Baza Linnea
Bergen Nils
Calixto Sven
Flecha Hideki
Cocinaro Inga
Ferro Loreto
Bodrogi Isia
Vann Stenig
Fusil Hennie
Osvaldo Kanon
Herrero Varja
Lagos Minke
Mies Felix
Resumir Edvard
Vann Emile
Arpa Varro
Awelon Mat
Bramar Dante
Coginian Carla
Forluniau Linda
Lagos Cornelia
Lais Ruscella
Mies Haber
Sten Sanjorge
Corrente Ingrid
Pais Ada
Corrente Orhan
Strum Willem
Pais Sten
Pais Ingrid
Ruscella Mies


In [351]:
px.bar(emails_sent, x=emails_sent.index, y=emails_sent.columns.values)

In [272]:
gastech_chord = pd.DataFrame()
gastech_chord['source'] = emails_sent.index
gastech_chord['target'] = emails_sent.columns
gastech_chord['weight'] = emails_sent.values
d3 = d3blocks.D3Blocks()
d3.chord(df=gastech_chord)

ValueError: Length of values (66) does not match length of index (54)

In [None]:
grouped_birth = employee_records_df.groupby(by=['BirthCountry'], as_index=False).size()
px.bar(grouped_birth, x='BirthCountry', y='size', title='Birth Country Count')

Asteria isn't mentioned anywhere in any documents.

In [None]:
grouped_employment = employee_records_df.groupby(by=['CurrentEmploymentType'], as_index=False).size()
px.bar(grouped_employment, x='CurrentEmploymentType', y='size', title='Current Department Count')

In [None]:
grouped_title = employee_records_df.groupby(by=['CurrentEmploymentTitle'], as_index=False).size()
px.bar(grouped_title, x='CurrentEmploymentTitle', y='size', title='Employee Title Count')

Query if a citzen's birth country is not the same as their current citizenship status

In [None]:
citizen_transfer = employee_records_df.copy()
citizen_transfer = citizen_transfer.query(expr="BirthCountry != CitizenshipCountry")
citizen_transfer.head()

In [None]:
# citizen_transfer = employee_records_df.copy()
# citizen_transfer = citizen_transfer.query(expr="CitizenshipCountry != PassportCountry")
# citizen_transfer.head()

In [None]:
sympathizers_kronos = email_df.query("Subject.str.contains('kronos')", engine='python', inplace=False)
print(sympathizers_kronos.shape)
sympathizers_kronos

In [None]:
sympathizers = sympathizers_kronos.query("Subject.str.contains('defenders')", engine='python', inplace=False)
sympathizers = sympathizers[['From']]
employee_sympatizers = employee_records_df.apply(lambda x: x[employee_records_df['FullName'].isin(sympathizers['From'])])
employee_sympatizers.head()

### Sentiment Analysis of News Articles

Sentiment Analysis with Pattern
https://github.com/clips/pattern/wiki/pattern-en 

Written text can be broadly categorized into two types: facts and opinions. Opinions carry people's sentiments, appraisals and feelings toward the world. The pattern.en module bundles a lexicon of adjectives (e.g., good, bad, amazing, irritating, ...) that occur frequently in product reviews, annotated with scores for sentiment polarity (positive ↔ negative) and subjectivity (objective ↔ subjective). The sentiment() function returns a (polarity, subjectivity)-tuple for the given sentence, based on the adjectives it contains, where polarity is a value between -1.0 and +1.0 and subjectivity between 0.0 and 1.0. The sentence can be a string, Text, Sentence, Chunk, Word or a Synset (see below). 

In [None]:
def sentiment_analysis(sentence) -> tuple:
    """
    Perform Sentiment analysis on recieved text
    Params: str: sentence
    Returns:
        tuple: polarity(sentiment score), subjectivity(sentiment modality)
         Polarity returns the average in the sentence.
         Subjectivity quantifies the amount of personal opinion and factual information contained in the text. 
         The higher subjectivity means that the text contains personal opinion rather than factual information.
    """
    sentiment_score, sentiment_modality = sentiment(sentence)
    return sentiment_score, sentiment_modality

articles_df['sentiment_score'], articles_df['sentiment_modality'] = zip(*articles_df['description'].apply(lambda x: sentiment_analysis(x)))

Sentiment score a value between -1.0 - +1.0. Where -1 means negative sentiment, 0 neutral, and 1 is postive sentiment.

Sentiment modality a value between 0 - 1.0. Where 0 is a fact and 1.0 is a person's opinion

** This is sorted alphabetically.

In [None]:
articles_df.head()

In [None]:
articles_df.describe()

In [None]:
px.bar(articles_df, x='index', y='sentiment_score')

In [None]:
px.bar(articles_df, x='index', y='sentiment_modality')

In [None]:
police_news = articles_df.query("description.str.contains('police')", engine='python', inplace=False)
print(police_news.shape)
police_news.head()

In [None]:
arrest_df = police_news.query("description.str.contains('blotter')", engine='python', inplace=False)
arrest_df

In [None]:
print(arrest_df.description.values[0])

In [None]:
# def search_arrested_articles(employee, arrest_df):
#     if arrest_df['description'].__contains__(employee): 
#         return employee, True

# arrest_df['GASTech Employee'], arrest_df['Arrested'] = zip(*employee_records_df['FullName'].apply(lambda e: search_arrested_articles(e, arrest_df)))

In [None]:
# define the model
# def define_model(vocab_size, max_length):
#     model = Sequential()
#     model.add(Embedding(vocab_size, 100, input_length=max_length))
#     model.add(Conv1D(32, 8, activation='relu'))
#     model.add(MaxPooling1D(2))
#     model.add(Flatten())
#     model.add(Dense(10, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
#     # compile network
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     # summarize defined model
#     model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
#     return model

In [None]:
# fit network
# model.fit(Xtrain, ytrain, epochs=10, verbose=2)

In [None]:
# save the model
# model.save('model.h5')
# train_docs, ytrain = load_clean_dataset(vocab, True)
# # load the model
# model = load_model('model.h5')
# # evaluate model on training dataset
# _, acc = model.evaluate(Xtrain, ytrain, verbose=0)
# print('Train Accuracy: %f' % (acc*100))
# # evaluate model on test dataset
# _, acc = model.evaluate(Xtest, ytest, verbose=0)
# print('Test Accuracy: %f' % (acc*100))