In [None]:
import os
import pandas as pd
import numpy as np 
import plotly
import re
from datetime import datetime
import explainerdashboard
from d3blocks import d3blocks
from nltk.corpus import stopwords
import string
import nltk
import docx
from collections.abc import Iterable
from pattern.en import sentiment, subjectivity
import seaborn as sns
import plotly.express as px

In [None]:
nltk.download('stopwords')
sw = set(stopwords.words('english'))
list(sw)[0:10]

In [None]:
print(set(string.punctuation))

### Employee Records

In [None]:
employee_records_df = pd.read_excel('EmployeeRecords.xlsx', sheet_name='Employee Records')
employee_records_df.head(2)

For `datetime64[ns]` columns, missing values are represented by `NaT` ("Not a Time").

`NaN` is the default missing-value marker, for reasons of computational speed and convenience. In many cases, however, the Python `None` will arise, and we wish to treat it as “missing”, “not available”, or “NA” as well.

In [None]:
index_employee_records_df = pd.read_excel('EmployeeRecords.xlsx', sheet_name='Index')
index_employee_records_df.head(3)

In [None]:
employee_records_df.shape

In [None]:
employee_records_df.dtypes

In [None]:
employee_records_df.isnull().sum()

Missing values in the military-service fields mean the employee did not serve in a military branch.

Missing values in the passport fields mean the employee does not own a passport.

#### Feature Engineering: Create Full Name column

In [None]:
employee_records_df['FullName'] = employee_records_df['FirstName'] + ' ' + employee_records_df['LastName']

### Employee Emails

In [None]:
# cp1252: single-byte character encoding of the Latin alphabet
email_df = pd.read_csv('emailheaders.csv', encoding='cp1252')
email_df.head()

In [None]:
print(email_df.shape)
print(email_df.dtypes)

In [None]:
def stripEmail(email):
    """
    Remove address noise from an e-mail string.

    Deletes every ``r.`` that starts at a word boundary and every ``@domain``
    run (an ``@``, lower-case letters/dots/plus signs, and one further character).
    """
    address_noise = re.compile(r"\br\.|@[a-z][a-z.+]+.")
    return address_noise.sub("", email)

def addSplit(email):
    """
    Replace every dot in ``email`` with a space.

    A literal ``str.replace`` is used instead of a regex: the previous pattern
    ``"\\."`` sat in a non-raw string, which is an invalid escape sequence and
    triggers a warning on current Python versions.
    """
    return email.replace(".", " ")

# Turn each sender address into a display name: strip the domain, then swap dots for spaces.
email_df['From'] = email_df['From'].apply(stripEmail).apply(addSplit)
email_df.head(3)

#todo add assert from employee verification check

In [None]:
def stripEmail_From(email):
    """
    Remove address noise from an e-mail string.

    Deletes every ``r.`` that starts at a word boundary and every ``@domain``
    run (an ``@``, lower-case letters/dots/plus signs, and one further character).
    """
    cleaned = re.sub(r"\br\.|@[a-z][a-z.+]+.", "", email)
    return cleaned

def addSplit(email):
    """
    Replace every dot in ``email`` with a space.

    A literal ``str.replace`` is used instead of a regex: the previous pattern
    ``"\\."`` sat in a non-raw string, which is an invalid escape sequence and
    triggers a warning on current Python versions.
    """
    return email.replace(".", " ")

In [None]:
# M/D/YYYY
email_df['Date'] = pd.to_datetime(email_df['Date'], errors='raise')
email_df.dtypes

In [None]:
print(email_df.shape)

In [None]:
punctuation = None 

def clean(text, stopwords) -> str:
    """
    Clean a text sentence.

    Angle-bracket tags are removed, the text is split on whitespace, and each
    token has its leading/trailing punctuation stripped and any remaining
    commas and quote characters deleted. Tokens that end up empty, contain a
    "/" or appear (lower-cased) in ``stopwords`` are dropped.

    Params: text: the string to clean; a non-string value (for example a
                missing cell) is treated as empty text
            stopwords: a collection of lower-case stopwords to remove from the input
    Returns: the surviving tokens, lower-cased and joined by single spaces;
             "" when nothing survives
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r'<[^>?]*>', '', text)
    text_words = []

    for word in text.split():
        # str.strip with a character set removes leading and trailing punctuation in one call.
        word = word.strip(string.punctuation)
        word = word.replace(',', '').replace('"', '').replace("'", '')

        if word and "/" not in word:
            lowered = word.lower()
            if lowered not in stopwords:
                text_words.append(lowered)

    # Joined once after the loop, so text without tokens yields "" instead of an unbound local.
    return " ".join(text_words)

email_df['Subject'] = email_df['Subject'].apply(clean, stopwords=sw)
email_df.head(3)

### News Articles

News articles contain historical information.

In [None]:
def extract_article_name(name) -> str:
    """
    Build a short article label from a file name.
    Params: name (str): article file name
    Returns:
        str: "article " followed by the name with everything from its first dot
             to the end of the line removed
    """
    prefix = "article "
    stem = re.sub(r"\.+.*", "", name)
    return prefix + stem

def extract_datetime(file) -> list:
    """
    Find date-like substrings in an article's text, for the timeline.
    Params: file (str): article text to scan
    Returns: all "digits/digits/digits" matches; when there are none, all
             "digits word digits" matches; an empty list if neither occurs
    """
    slash_dates = re.findall(r'\d+/\d+/\d+', file)
    if slash_dates:
        return slash_dates
    return re.findall(r'\d+ \w+ \d+', file)

def extract_data(rootdir, news_article) -> dict:
    """
    Read every ``.txt`` article under ``rootdir`` into ``news_article``.

    NOTE: later cells define other functions that are also named ``extract_data``;
    re-run this cell before loading articles again.

    Params:
        rootdir (Path): directory tree to search for article files
        news_article (dict): dict to fill with <article name, article text>;
            it is updated in place and also returned
    Returns:
        news_article: the same dict, with one entry per article that could be read
    """
    assert os.path.exists(rootdir), "article directory not found: {}".format(rootdir)
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Match on the extension so names that merely contain "txt" are skipped.
            if not file.endswith('.txt'):
                continue
            nav_file = os.path.join(subdir, file)
            try:
                # Opening inside the try keeps one unreadable file from aborting the walk;
                # the with-block closes the handle, so no explicit close() is needed.
                with open(nav_file, 'r') as datafile:
                    news_article[extract_article_name(file)] = datafile.read()
            except Exception as ex:
                print("Failed to parse article: ", ex)
    return news_article

# Load every article, then build one row per article holding its text and extracted date strings.
rootdir = 'articles/'
news_article = extract_data(rootdir, {})
articles_df = pd.DataFrame.from_dict(news_article, orient='index', columns=['description'])
articles_df['datetime'] = articles_df['description'].apply(extract_datetime)
articles_df = articles_df.reset_index()
print(articles_df.shape)
print(articles_df.dtypes)
articles_df.head()

In [None]:
articles_df['description'] = articles_df['description'].apply(clean, stopwords=sw)
articles_df.head()

### Resumes

Using resumes to look up historical employee data

In [None]:
def extract_resume_name(name) -> str:
    """
    Turn a resume file name into a person label.
    Params: name (str): resume file name
    Returns:
        str: the name with "Bio", "Resume" and everything from the first dot to
             the end of the line removed, and each hyphen replaced by a space
    """
    without_noise = re.sub(r"\.+.*|Bio|Resume", "", name)
    return without_noise.replace("-", " ")

def extract_data(rootdir, resumes) -> dict:
    """
    Read the paragraph text of every ``.docx`` resume under ``rootdir``.

    NOTE: other cells define functions that are also named ``extract_data``;
    re-run this cell before loading resumes again.

    Params:
        rootdir (Path): directory tree to search for resume files
        resumes (dict): dict to fill with <resume name, resume text>;
            it is updated in place and also returned
    Returns:
        resumes: the same dict, with one entry per resume that could be parsed
    """
    assert os.path.exists(rootdir), "resume directory not found: {}".format(rootdir)
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Match on the extension so names that merely contain "docx" are skipped.
            if not file.endswith('.docx'):
                continue
            nav_file = os.path.join(subdir, file)
            try:
                # docx.Document is given the path directly; the extra text-mode open()
                # of the binary .docx was never read from and has been dropped.
                doc = docx.Document(nav_file)
                resumes[extract_resume_name(file)] = '\n'.join(para.text for para in doc.paragraphs)
            except Exception as ex:
                print("Failed to parse resume: ", ex)
    return resumes

# Load every resume and build one row per resume, keyed by the cleaned-up file name.
rootdir = 'resumes/'
resumes = extract_data(rootdir, {})
resumes_df = pd.DataFrame.from_dict(resumes, orient='index', columns=['resume']).reset_index()
print(resumes_df.shape)
resumes_df.head()

In [None]:
def clean_text_resume(text):
    """
    Clean document text without removing stopwords.

    Angle-bracket tags are removed, the text is split on whitespace, and each
    token has its leading/trailing punctuation stripped and any remaining
    commas and quote characters deleted. Tokens that end up empty or contain
    a "/" are dropped.

    Params: text (str): the text to clean
    Returns: the surviving tokens, lower-cased and joined by single spaces;
             "" when nothing survives (including empty input)
    """
    text = re.sub(r'<[^>?]*>', '', text)
    text_words = []

    for word in text.split():
        # str.strip with a character set removes leading and trailing punctuation in one call.
        word = word.strip(string.punctuation)
        word = word.replace(',', '').replace('"', '').replace("'", '')

        if word and "/" not in word:
            text_words.append(word.lower())

    # Joined once after the loop, so text without tokens yields "" instead of an unbound local.
    return " ".join(text_words)

resumes_df['resume'] = resumes_df['resume'].apply(clean_text_resume)
resumes_df.head()

### Historical Documents

In [None]:
def extract_doc_name(name) -> str:
    """
    Shorten a historical-document file name.
    Params: name (str): document file name
    Returns:
        str: the leading "<digits> year" prefix of the name
    Raises:
        AttributeError: when the name does not start with "<digits> year"
    """
    leading_years = re.compile(r"^([0-9]+ year).*$")
    found = leading_years.match(name)
    return found.group(1)

def extract_data(rootdir, hist_doc) -> dict:
    """
    Read the paragraph text of every ``.docx`` historical document under ``rootdir``.

    NOTE: other cells define functions that are also named ``extract_data``;
    re-run this cell before loading historical documents again.

    Params:
        rootdir (Path): directory tree to search for files
        hist_doc (dict): dict to fill with <document name, document text>;
            it is updated in place and also returned
    Returns:
        hist_doc: the same dict, with one entry per document that could be parsed
    """
    assert os.path.exists(rootdir), "historical document directory not found: {}".format(rootdir)
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Match on the extension so names that merely contain "docx" are skipped.
            if not file.endswith('.docx'):
                continue
            nav_file = os.path.join(subdir, file)
            try:
                # docx.Document is given the path directly; the extra text-mode open()
                # of the binary .docx was never read from and has been dropped.
                hist_name = extract_doc_name(file)
                doc = docx.Document(nav_file)
                hist_doc[hist_name] = '\n'.join(para.text for para in doc.paragraphs)
            except Exception as ex:
                # Message corrected: this loader handles historical documents, not resumes.
                print("Failed to parse historical document: ", ex)
    return hist_doc

# Load every historical document and build one row per document.
rootdir = 'HistoricalDocuments/'
historical_doc = extract_data(rootdir, {})
historical_df = pd.DataFrame.from_dict(historical_doc, orient='index', columns=['description']).reset_index()
print(historical_df.shape)
historical_df.head()

In [None]:
# Series.apply returns a new Series; assign it back so the cleaned text is actually kept.
historical_df['description'] = historical_df['description'].apply(clean_text_resume)
historical_df.head()

### FactBook

In [None]:
def extract_doc_name(name) -> str:
    """
    Shorten a factbook file name.
    Params: name (str): document file name
    Returns:
        str: the part of the name that precedes its first dot
    Raises:
        AttributeError: when the name contains no dot
    """
    before_first_dot = re.compile(r"^.*?(?=\.)")
    found = before_first_dot.match(name)
    return found.group(0)

def extract_data(rootdir, hist_doc) -> dict:
    """
    Read the paragraph text of every ``.docx`` factbook file under ``rootdir``.

    NOTE: other cells define functions that are also named ``extract_data``;
    re-run this cell before loading the factbook again.

    Params:
        rootdir (Path): directory tree to search for files
        hist_doc (dict): dict to fill with <lower-cased file name, text>;
            it is updated in place and also returned
    Returns:
        hist_doc: the same dict, with one entry per file that could be parsed
    """
    assert os.path.exists(rootdir), "factbook directory not found: {}".format(rootdir)
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Match on the extension so names that merely contain "docx" are skipped.
            if not file.endswith('.docx'):
                continue
            nav_file = os.path.join(subdir, file)
            try:
                print(file)
                file_name = str.lower(extract_doc_name(file))
                # docx.Document is given the path directly; no separate open() is needed.
                doc = docx.Document(nav_file)
                # Write to the dict that was passed in, not to a notebook-level global.
                hist_doc[file_name] = '\n'.join(para.text for para in doc.paragraphs)
            except Exception as ex:
                print("Failed to parse factbook: ", ex)
    return hist_doc

# Load every factbook file and build one row per file.
factbook_doc = {}
rootdir = 'factbook/'
factbook_doc = extract_data(rootdir, factbook_doc)
factbook_df = pd.DataFrame.from_dict(factbook_doc, orient='index', columns=['description']).reset_index()
print(factbook_df.shape)
factbook_df.head()

In [None]:
# Series.apply returns a new Series; assign it back so the cleaned text is actually kept.
factbook_df['description'] = factbook_df['description'].apply(clean_text_resume)
factbook_df.head()

### Email Analysis

In [None]:
email_df.head()

In [None]:
# Per-sender counts of non-null cells in each column, busiest senders first.
# (The earlier email_df.copy() was overwritten immediately and has been removed.)
groupby_name = (
    email_df.groupby(by=['From'])
    .count()
    .sort_values(by='To', ascending=False)
)
groupby_name.head()


Create a communication network within the GASTech organization and among Protectors of Kronos members.

In [None]:
# Share of all emails sent by each sender, as a percentage.
total_email_count = len(email_df)
groupby_name['Frequency'] = groupby_name['To'] / total_email_count * 100
groupby_name.head()

In [None]:
comm_network = email_df.groupby(by=['From'])
comm_network.head()

How frequently do people contact each other? E.g., Ada emailed Felix 2 times.

In [None]:
# Count, for every sender, how many emails went to each known employee.
# Result: emails_sent has one row per sender and one column per recipient name.

# Known addresses are hoisted into a set so each recipient lookup is O(1).
known_addresses = set(employee_records_df['EmailAddress'].dropna())

sent_counts = {}
for employee_from_name, sender_emails in comm_network:
    # pandas >= 2.0 yields a 1-tuple key when grouping by a one-element list.
    if isinstance(employee_from_name, tuple):
        employee_from_name = employee_from_name[0]

    name_map_frequency = {}
    for recipients in sender_emails['To']:
        if not isinstance(recipients, str):
            continue  # missing recipient cell
        for address in recipients.split(","):
            # Strip before matching AND before building the name, so the same person
            # never appears under both "Name" and " Name".
            address = address.strip()
            if not address:
                continue
            if address not in known_addresses:
                # Not in the employee records: report it and leave it out of the network.
                print(address)
                continue
            recipient_name = addSplit(stripEmail(address))
            # Each recipient of an email counts exactly once for that email.
            name_map_frequency[recipient_name] = name_map_frequency.get(recipient_name, 0) + 1
    sent_counts[employee_from_name] = name_map_frequency

# Build the frame in one step; recipients a sender never wrote to are NaN.
emails_sent = pd.DataFrame.from_dict(sent_counts, orient='index')

In [None]:
# email_df['To'] = email_df['To'].apply(lambda x: stripEmail_From(x))
# email_df['To'] = email_df['To'].apply(lambda x: addSplit(x))

In [None]:
emails_sent.head()

In [None]:
emails_sent.columns = emails_sent.columns.str.strip()

In [None]:
emails_sent.shape

In [None]:
emails_sent.fillna(0, inplace=True)
emails_sent.head()

In [None]:
print(emails_sent.columns)

In [None]:
emails_sent = emails_sent.loc[:,~emails_sent.columns.duplicated()].copy()
emails_sent.columns

In [None]:
emails_sent.shape

In [None]:
emails_sent.head()

In [None]:
emails_sent.describe()

Read as row × column, e.g., Ada sent Felix Resumir 2 emails.

In [None]:
px.bar(emails_sent, x=emails_sent.index, y=emails_sent.columns.values, title='GASTech Employee Email Frequency Recipients')

This chart gives little insight, other than that Mat could be a bot.

In [None]:
emails_sent.head(1)

In [None]:
emails_sent.columns = emails_sent.columns.str.strip()
emails_sent.index = emails_sent.index.str.strip()

In [None]:
emails_sent = emails_sent.unstack().reset_index().rename(columns={'level_0': 'target', 'level_1': 'source', 0: 'weight'})
emails_sent

### GASTech Communication Network Chord Graph

In [None]:
emails_sent.to_csv('emails_sent_header.csv')

In [52]:
emails_sent.head(1)

Unnamed: 0,target,source,weight
0,Felix Resumir,Ada Campo-Corrente,2.0


In [None]:
d3 = d3blocks.D3Blocks()
# d3.set_node_properties(emails_sent, color='source-target', opacity=100)
d3.chord(df=emails_sent, color='source-target', opacity=100, title='GASTech Communication Network Chord Graph', filepath='./d3blocks.html', figsize=[1000,1000])

In [59]:
# import holoviews as hv
# from holoviews import opts, dim

# recipients = list(set(emails_sent["target"].unique().tolist() + emails_sent["source"].unique().tolist()))
# email_HV_dataset = hv.Dataset(pd.DataFrame(recipients, columns=["Email"]))

# chord = hv.Chord(data=(emails_sent, email_HV_dataset))
# chord.opts(
#     opts.Chord(cmap='Category20', edge_cmap='Category20'))
# chord.opts(
#     opts.Chord(cmap='Category20', edge_cmap='Category20', edge_color=dim('Email').str(), 
#                labels='name', node_color=dim('index').str()))


divide by zero encountered in double_scalars


invalid value encountered in multiply



ValueError: cannot convert float NaN to integer

In [None]:
grouped_birth = employee_records_df.groupby(by=['BirthCountry'], as_index=False).size()
px.bar(grouped_birth, x='BirthCountry', y='size', title='Birth Country Count')

Asteria isn't mentioned anywhere in any documents.

In [None]:
grouped_employment = employee_records_df.groupby(by=['CurrentEmploymentType'], as_index=False).size()
px.bar(grouped_employment, x='CurrentEmploymentType', y='size', title='Current Department Count')

In [None]:
grouped_title = employee_records_df.groupby(by=['CurrentEmploymentTitle'], as_index=False).size()
px.bar(grouped_title, x='CurrentEmploymentTitle', y='size', title='Employee Title Count')

### Query whether a citizen's birth country differs from their current citizenship country

In [None]:
citizen_transfer = employee_records_df.copy()
citizen_transfer = citizen_transfer.query(expr="BirthCountry != CitizenshipCountry")
citizen_transfer.head()

### Query: Are there any sympathizers within GASTech?

In [None]:
sympathizers_kronos = email_df.query("Subject.str.contains('kronos')", engine='python', inplace=False)
print(sympathizers_kronos.shape)
sympathizers_kronos

In [None]:
sympathizers = sympathizers_kronos.query("Subject.str.contains('defenders')", engine='python', inplace=False)
sympathizers = sympathizers[['From']]
# One boolean row mask selects the matching employees directly; the previous
# column-wise apply recomputed the same mask for every column.
is_sympathizer = employee_records_df['FullName'].isin(sympathizers['From'])
employee_sympatizers = employee_records_df[is_sympathizer]
employee_sympatizers.head()

Ruscella Mies Harver, Isia Vann, Loreto Bodrogi, and Inga Ferro are likely PoK sympathizers. They are citizens of Kronos, served in the military, and sent emails containing "defenders of Kronos" propaganda.

### Sentiment Analysis of News Articles with Pattern Library

Sentiment Analysis with Pattern
https://github.com/clips/pattern/wiki/pattern-en 

Written text can be broadly categorized into two types: facts and opinions. Opinions carry people's sentiments, appraisals and feelings toward the world. The pattern.en module bundles a lexicon of adjectives (e.g., good, bad, amazing, irritating, ...) that occur frequently in product reviews, annotated with scores for sentiment polarity (positive ↔ negative) and subjectivity (objective ↔ subjective). The sentiment() function returns a (polarity, subjectivity)-tuple for the given sentence, based on the adjectives it contains, where polarity is a value between -1.0 and +1.0 and subjectivity between 0.0 and 1.0. The sentence can be a string, Text, Sentence, Chunk, Word or a Synset (see below). 

In [None]:
def sentiment_analysis(sentence) -> tuple:
    """
    Perform sentiment analysis on the received text.
    Params: sentence (str): text to score
    Returns:
        tuple: (polarity, subjectivity) as unpacked from pattern's sentiment().
         Polarity is the sentiment score averaged over the sentence.
         Subjectivity is the sentiment modality: the higher it is, the more the
         text reads as personal opinion rather than factual information.
    """
    polarity, modality = sentiment(sentence)
    return (polarity, modality)

articles_df['sentiment_score'], articles_df['sentiment_modality'] = zip(*articles_df['description'].apply(lambda x: sentiment_analysis(x)))

The sentiment score is a value between -1.0 and +1.0, where -1.0 means negative sentiment, 0 neutral, and +1.0 positive sentiment.

The sentiment modality is a value between 0.0 and 1.0, where 0.0 is a fact and 1.0 is a personal opinion.

** This is sorted alphabetically.

In [None]:
articles_df.head()

In [None]:
articles_df.describe()

In [None]:
px.bar(articles_df, x='index', y='sentiment_score')

In [None]:
px.bar(articles_df, x='index', y='sentiment_modality')

In [None]:
police_news = articles_df.query("description.str.contains('police')", engine='python', inplace=False)
print(police_news.shape)
police_news.head()

In [None]:
arrest_df = police_news.query("description.str.contains('blotter')", engine='python', inplace=False)
arrest_df

In [None]:
print(arrest_df.description.values[0])

### Timeline of Events

In [None]:
articles_df.head()

In [None]:
def parseDatetime(date, formats=("%Y/%m/%d", "%d %B %Y")):
    """
    Convert the date strings extracted from one article into a single date.

    Params:
        date (list): date strings to try, in order
        formats (tuple): strptime formats attempted, in order, for every string
    Returns:
        datetime.date for the last string that parses (matching the earlier
        behaviour when several strings parse), or None when the list is empty
        or nothing parses
    """
    parsed = None
    for date_string in date:
        failure = None
        for date_format in formats:
            try:
                parsed = datetime.strptime(date_string, date_format).date()
                break
            except (TypeError, ValueError) as ex:
                failure = ex
        else:
            # No format matched this string; report it and keep any earlier result.
            print("Failed to parse datetime stamp: ", failure)
    return parsed

articles_df['datetime'] = articles_df['datetime'].apply(lambda x: parseDatetime(x)) 
articles_df.head()

In [None]:
articles_df['datetime'] = pd.to_datetime(articles_df['datetime']).dt.date
print(articles_df.dtypes)

In [None]:
# def search_arrested_articles(employee, arrest_df):
#     if arrest_df['description'].__contains__(employee): 
#         return employee, True

# arrest_df['GASTech Employee'], arrest_df['Arrested'] = zip(*employee_records_df['FullName'].apply(lambda e: search_arrested_articles(e, arrest_df)))

In [None]:
# chord_df = pd.DataFrame(columns=['source', 'target', 'weight'])

# tmp_lst = []

# for employee_from in emails_sent.index:
#     for employee_to in emails_sent.columns:
#         # record = (employee_from, employee_to, emails_sent[employee_to][employee_from])
#         record = (employee_from, employee_to, emails_sent[employee_to][employee_from])
#         tmp_lst.append(record)

# tmp = pd.DataFrame(tmp_lst)
# tmp
# chord_df = emails_sent.reset_index().apply(lambda e: [(e[1], e[0], emails_sent[e])])

# chord_df= emails_sent.index.to_series().str.get(1).apply(lambda e: [(e[1], e[0], emails_sent[e])])

# df2 = pd.MultiIndex.from_tuples(tmp_lst)
# def split(x):
#     return x[0][0], x[0][1], str(x[0][2])

# df2 = pd.DataFrame()
# df2['source'], df2['target'], df2['weight'] = tmp.map()

# df2 = pd.DataFrame(emails_sent.unstack())
# df2


# chord_df['source'], chord_df['target'], chord_df['weight'] = zip(*emails_sent.index.apply(lambda e: [splitchord(e, employee_) for employee_ in emails_sent.columns], axis=1))
# chord_df.head()


In [None]:
# define the model
# def define_model(vocab_size, max_length):
#     model = Sequential()
#     model.add(Embedding(vocab_size, 100, input_length=max_length))
#     model.add(Conv1D(32, 8, activation='relu'))
#     model.add(MaxPooling1D(2))
#     model.add(Flatten())
#     model.add(Dense(10, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
#     # compile network
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     # summarize defined model
#     model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
#     return model

In [None]:
# fit network
# model.fit(Xtrain, ytrain, epochs=10, verbose=2)

In [None]:
# save the model
# model.save('model.h5')
# train_docs, ytrain = load_clean_dataset(vocab, True)
# # load the model
# model = load_model('model.h5')
# # evaluate model on training dataset
# _, acc = model.evaluate(Xtrain, ytrain, verbose=0)
# print('Train Accuracy: %f' % (acc*100))
# # evaluate model on test dataset
# _, acc = model.evaluate(Xtest, ytest, verbose=0)
# print('Test Accuracy: %f' % (acc*100))