In [None]:
# !pip install langchain
# !pip install openai
# !pip install python-dotenv
# !pip install spacy

In [None]:
# !python3 -m spacy download en_core_web_sm

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_key = 'your-api-key'

# Load PDF

In [None]:
# !pip install pypdf

In [7]:
from langchain.document_loaders import PyPDFLoader
# Load PDF
loader = PyPDFLoader('the-five-dysfunctions-of-a-team.pdf')

In [None]:
docs = loader.load()
docs[0:750]

# NER (Named Entity Recognition) from the spaCy library

In [None]:
# !pip install spacypdfreader
# !pip install pdf2image
# !pip install pytesseract

In [10]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacypdfreader.spacypdfreader import pdf_reader
import networkx as nx

NER = spacy.load('en_core_web_sm')
doc = pdf_reader('the-five-dysfunctions-of-a-team.pdf', NER)

In [11]:
# entity examples
for ent in doc.ents[1500:1510]:
    print(ent.text, ent.label_)

two days DATE
Napa GPE
Kathryn PERSON
117 CARDINAL
2/10/02  3:43 PM TIME
The Five Dysfunctions ORG
Team ORG
just a few days DATE
the same day DATE
Kathryn PERSON


In [12]:
# visualize identified entities
displacy.render(doc[1750:2500], style='ent', jupyter=True)

In [13]:
# character description from wikipedia : 'https://en.wikipedia.org/wiki/The_Five_Dysfunctions_of_a_Team'
character_dict = {
    'Kathryn': 'The newly appointed CEO of DecisionTech, Kathryn is brought in to address the company\'s struggles despite its promising technologies and talented staff. Throughout the book, she leads the executive team through various exercises and conversations to address the dysfunctions and make the team more cohesive.',
    'Jeff': 'Co-founder of DecisionTech and the former CEO. He steps down to the VP of Business Development role due to his recognition of his limitations in leading the company further.',
    'Miyuki': 'Chief Financial Officer, often seen as quiet but extremely competent.',
    'Carlos': 'Chief Customer Officer, who is focused on customer satisfaction.',
    'Jan': 'Head of Marketing. She is one of the youngest and brightest minds in the company but has reservations about being forthright in meetings.',
    'Martin': 'The technologically brilliant CTO. Martin tends to be aloof and prefers to remain focused on the tech aspects of the company, often avoiding team politics.',
    'Nick': 'Chief Operating Officer. He comes from a successful stint at a top Silicon Valley firm and is very results-driven.',
    'JR Rutherford': 'VP of Sales. JR is the energetic sales leader who often finds himself in conflict with others due to the aggressive nature of his department\'s objectives.'}

# Get named entity list per sentence

In [14]:
sent_entity_df = []

# Loop through sentences, store named entity list for each sentence
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)

In [15]:
sent_entity_df

Unnamed: 0,sentence,entities
0,"(01_960756_ffirs_16.qxd, , 1/13/06, , 8:57, ...","[1/13/06 8:57 AM, Five, aTeam]"
1,"(I, P, , F, A, B, L, E, \n\n, Patrick, Lenci...",[Patrick Lencioni]
2,"(01_960756_ffirs_16.qxd, , 1/13/06, , 8:57, ...",[1/13/06 8:57 AM Page ii]
3,"(01_960756_ffirs_16.qxd, , 1/13/06, , 8:57, ...","[1/13/06 8:57 AM, Patrick Lencioni, The Five ..."
4,"(01_960756_ffirs_16.qxd, , 1/13/06, , 8:57, ...",[1/13/06 8:57 AM Page ii]
...,...,...
3249,"(He, is, the, author, of, ﬁve, nationally, rec...","[the New York Times, Five, Jossey-Bass, 2002]"
3250,"(Patrick, lives, in, the, San, Francisco, Bay,...","[Patrick, the San Francisco Bay Area, Laura, t..."
3251,"(To, learn, more, about, Patrick, and, The, Ta...","[Patrick, The Table Group]"
3252,"(229, \n\n)",[229]


In [16]:
# Function to filter out non-character entities
def filter_entity(entity_list, character_dict):
    return [ent for ent in entity_list 
            if ent in list(character_dict.keys())]

In [17]:
# function example call
filter_entity(['Kathryn', 'chen', '2'], character_dict)

['Kathryn']

In [18]:
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_dict))

# Filter out sentences that don't have any character entities
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0]
sent_entity_df_filtered

Unnamed: 0,sentence,entities,character_entities
72,"(04Lencioni, /, Luck, , 2/10/02, , 3:30, PM,...","[2/10/02 3:30 PM, Kathryn, DecisionTech, Inc.]",[Kathryn]
95,"(None, of, DecisionTech, ’s, 150, employees, w...","[DecisionTech, 150, Jeff]",[Jeff]
105,"(Someone, had, to, be, accountable, for, the, ...",[Jeff],[Jeff]
107,"(Until, three, weeks, later, ,, when, Kathryn,...","[three weeks later, Kathryn, 9]",[Kathryn]
108,"(07Lencioni, /, Kathryn, , 2/10/02, , 3:31, ...","[Kathryn, 2/10/02 3:31 PM]",[Kathryn]
...,...,...,...
2829,"(“, I, think, we, owe, it, to, Jeff, and, \n, ...",[Jeff],[Jeff]
3186,"(METHODS, \n\n, Kathryn, understood, that, a, ...",[Kathryn],[Kathryn]
3187,"(Added, together, ,, Kathryn, and, \n, her, te...","[Kathryn, approximately eight days]",[Kathryn]
3189,"(Though, there, are, actually, many, different...",[Kathryn],[Kathryn]


# Create Relationships

In [19]:
window_size = 5
relationships = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_i = min(i+5, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [20]:
relationship_df = pd.DataFrame(relationships)

In [21]:
relationship_df

Unnamed: 0,source,target
0,Jeff,Kathryn
1,Jeff,Kathryn
2,Jeff,Kathryn
3,Jeff,Kathryn
4,Kathryn,Jeff
...,...,...
2500,Jeff,Kathryn
2501,Kathryn,Jeff
2502,Kathryn,Jeff
2503,Kathryn,Jeff


In [22]:
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [23]:
relationship_df

Unnamed: 0,source,target,value
0,Jeff,Kathryn,154
1,Kathryn,Jeff,148
2,Kathryn,Martin,242
3,Martin,Jeff,46
4,Jeff,Martin,59
5,Carlos,Kathryn,68
6,Jeff,Jan,23
7,Jan,Jeff,46
8,Jan,Nick,53
9,Nick,Jeff,43


# Graph analysis and visualization

In [24]:
# Create a graph from a pandas dataframe
G = nx.from_pandas_edgelist(relationship_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph())

# Graph visualization - Networkx

In [None]:
!pip install pyvis

In [25]:
from pyvis.network import Network
net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')

node_degree = dict(G.degree)

#Setting up node size attribute
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("networkAnalysis.html")

networkAnalysis.html


The graph presented depicts a straightforward visualization of the connections among company colleagues. Notably, Kathryn (CEO) appears to have stronger interactions with Nick and Martin (compared to others), as indicated by the bolder lines. In our subsequent analysis, we'll delve into the nature of their work relationships using advanced language models like langchain and Llama.