In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re

In [None]:
! pip install -U pip setuptools wheel
! pip install -U spacy
! python -m spacy download en_core_web_sm

[0mCollecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import spacy
# Load the small English spaCy model fetched by the setup cell above; `nlp` is the pipeline callable.
nlp = spacy.load('en_core_web_sm')

In [None]:
pip install stanza

[0m

In [None]:
!pip install stanford-corenlp

[0m

In [None]:
# List the installed packages and their versions (a record of the runtime environment)
!pip list

Package                       Version
----------------------------- ---------------------
absl-py                       1.0.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.11.4
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.0
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.3
Babel                         2.9.1
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        4.1.0
blis                          0.4.1
bokeh                         2.3.3
Bottleneck                    1.3.4
branca                        0.4.2
bs4                           0.0.1
CacheC

In [None]:
# Upload a file from the local machine into the Colab runtime.
# `uploaded` holds whatever files.upload() returns for the chosen file(s).
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Read the uploaded CSV into a DataFrame, print its (rows, columns) shape,
# and display the first rows as the cell output.
dt =pd.read_csv('coi_2k_01_openie_results.csv')
print(dt.shape)
dt.head()

# 1. Installing Stanza
# Note that Stanza only supports Python 3.6 and above. Installing and importing Stanza are as simple as running the following commands:

In [None]:
# Minimal Stanza example: download the English models, build the default
# pipeline, annotate two sentences and print the first sentence's dependencies.
# NOTE(review): this rebinds `nlp` and `doc`; `nlp` was bound to the spaCy
# model in an earlier cell, so cell order matters here.
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

## 2. Downloading Models
You can download models with the stanza.download command. The language can be specified with either a full language name (e.g., "english"), or a short code (e.g., "en").

By default, models will be saved to your ~/stanza_resources directory. If you want to specify your own path to save the model files, you can pass a dir=your_path argument.

In [None]:
# Fetch the Stanza models for the two languages whose pipelines are built below.
# Download an English model into the default directory
print("Downloading English model...")
stanza.download('en')

# Similarly, download a (simplified) Chinese model
# Note that you can use verbose=False to turn off all printed messages
print("Downloading Chinese model...")
stanza.download('zh', verbose=False)

Downloading English model...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-03-29 19:36:37 INFO: Downloading default packages for language: en (English)...
2022-03-29 19:36:39 INFO: File exists: /root/stanza_resources/en/default.zip.
2022-03-29 19:36:46 INFO: Finished downloading models and saved to /root/stanza_resources.


Downloading Chinese model...


# 3. Processing Text
## Constructing Pipeline
To process a piece of text, you'll need to first construct a Pipeline with different Processor units. The pipeline is language-specific, so again you'll need to first specify the language (see examples).

By default, the pipeline will include all processors, including tokenization, multi-word token expansion, part-of-speech tagging, lemmatization, dependency parsing and named entity recognition (for supported languages). However, you can always specify what processors you want to include with the processors argument.

Stanza's pipeline is CUDA-aware, meaning that a CUDA-device will be used whenever it is available, otherwise CPUs will be used when a GPU is not found. You can force the pipeline to use CPU regardless by setting use_gpu=False.

Again, you can suppress all printed messages by setting verbose=False.

In [None]:
# Build an English pipeline, with all processors by default
print("Building an English pipeline...")
en_nlp = stanza.Pipeline('en')

# Build a Chinese pipeline, with customized processor list and no logging, and force it to use CPU
# (the processors string limits it to tokenize, lemma, pos and depparse)
print("Building a Chinese pipeline...")
zh_nlp = stanza.Pipeline('zh', processors='tokenize,lemma,pos,depparse', verbose=False, use_gpu=False)

# Annotating Text
After a pipeline is successfully constructed, you can get annotations of a piece of text simply by passing the string into the pipeline object. The pipeline will return a Document object, from which detailed annotations can be accessed. For example:

In [None]:
# Processing English text
# Calling the pipeline on a string returns the annotated document;
# printing its type shows what kind of object came back.
en_doc = en_nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
print(type(en_doc))



# 4. Accessing Annotations
Annotations can be accessed from the returned Document object.

A Document contains a list of Sentences, and a Sentence contains a list of Tokens and Words. For the most part Tokens and Words overlap, but some tokens can be divided into multiple words, for instance the French token aux is divided into the words à and les, while in English a word and a token are equivalent. Note that dependency parses are derived over Words.

Additionally, a Span object is used to represent annotations that are part of a document, such as named entity mentions.

The following example iterates over all English sentences and words, and prints the word information one by one:

In [None]:
# One row per word: text, lemma, POS tag, head index and dependency relation,
# grouped under a numbered header for each sentence.
word_row = "{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}"
for sent_no, sentence in enumerate(en_doc.sentences, start=1):
    print("[Sentence {}]".format(sent_no))
    for w in sentence.words:
        print(word_row.format(w.text, w.lemma, w.pos, w.head, w.deprel))
    print("")

In [None]:
# Tab-separated listing of the entity mentions in the English document:
# mention text, entity type and character span.
header = "Mention text\tType\tStart-End"
print(header)
for mention in en_doc.ents:
    fields = (mention.text, mention.type, mention.start_char, mention.end_char)
    print("{}\t{}\t{}-{}".format(*fields))

In [None]:
# Take the first word of the first sentence and print it to see
# everything that is stored on a single word object.
word = en_doc.sentences[0].words[0]
print(word)

In [None]:
# NOTE(review): these bare tuples look like pasted output of
# print_dependencies() for "Barack Obama was born in Hawaii." (word, head index, relation)
# rather than code — confirm whether this should be a markdown cell.
# As code, each line is an expression statement; only the last one is displayed.
('Barack', '4', 'nsubj:pass')
('Obama', '1', 'flat')
('was', '4', 'aux:pass')
('born', '0', 'root')
('in', '6', 'case')
('Hawaii', '4', 'obl')
('.', '4', 'punct')

In [None]:
import os

# Folder that will hold the Stanford CoreNLP distribution.
corenlp_dir = './corenlp'

# Fetch CoreNLP through Stanza's installer; expect several minutes on a slow connection.
stanza.install_corenlp(dir=corenlp_dir)

# Publish the install location through the CORENLP_HOME environment variable.
os.environ["CORENLP_HOME"] = corenlp_dir

In [None]:
# List the CoreNLP installation folder (via the CORENLP_HOME variable set above)
# to confirm that the installation succeeded
!ls $CORENLP_HOME

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
import time

# Configure a CoreNLP client: basic annotators, 4GB of memory, server on port 9001.
basic_annotators = ['tokenize','ssplit', 'pos', 'lemma', 'ner']
client = CoreNLPClient(
    annotators=basic_annotators,
    memory='4G',
    endpoint='http://localhost:9001',
    be_quiet=True,
)
print(client)

# Launch the background server now and pause ten seconds while it comes up.
# This step is optional: by default the server starts on the first annotation request.
client.start()
time.sleep(10)

In [None]:
# Print background processes (pid and command) and filter for java
# You should be able to see a StanfordCoreNLPServer java process running in the background
!ps -o pid,cmd | grep java

In [None]:
# Annotate some text with the running client and print the type of the result.
# `document` is read by the cells that follow.
text = "Albert Einstein was a German-born theoretical physicist. He developed the theory of relativity."
document = client.annotate(text)
print(type(document))

In [None]:
# Tabulate word, lemma, POS tag and NER tag for every token, sentence by sentence.
# The header and the data rows share one column template.
token_row = "{:12s}\t{:12s}\t{:6s}\t{}"
print(token_row.format("Word", "Lemma", "POS", "NER"))

for sent_no, sentence in enumerate(document.sentence, start=1):
    print("[Sentence {}]".format(sent_no))
    for tok in sentence.token:
        print(token_row.format(tok.word, tok.lemma, tok.pos, tok.ner))
    print("")

In [None]:
# List every detected entity mention with its type, across all sentences.
mention_row = "{:30s}\t{}"
print(mention_row.format("Mention", "Type"))

for sentence in document.sentence:
    for mention in sentence.mentions:
        print(mention_row.format(mention.entityMentionText, mention.entityType))

In [None]:
# Print annotations of a token (first token of the first sentence)
print(document.sentence[0].token[0])

# Print annotations of a mention (first mention of the first sentence)
print(document.sentence[0].mentions[0])

In [None]:
# Shut down the background CoreNLP server
client.stop()

# Wait ten seconds, then list java processes again to check the server is gone.
# `time` was imported in the cell that started the client.
time.sleep(10)
!ps -o pid,cmd | grep java

In [None]:
# Same client settings as before, but managed by a `with` block:
# the client is bound to `client` inside the block and the mentions are printed there.
print("Starting a server with the Python \"with\" statement...")
with CoreNLPClient(annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner'], 
                   memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
    text = "Albert Einstein was a German-born theoretical physicist."
    document = client.annotate(text)

    # One row per detected entity mention: mention text and entity type
    print("{:30s}\t{}".format("Mention", "Type"))
    for sent in document.sentence:
        for m in sent.mentions:
            print("{:30s}\t{}".format(m.entityMentionText, m.entityType))

print("\nThe server should be stopped upon exit from the \"with\" statement.")

# Step 1: Upload data

In [None]:
! pip install -U pip setuptools wheel
! pip install -U spacy
! python -m spacy download en_core_web_sm

In [None]:
# Upload the conflict-of-interest OpenIE results CSV from the local machine
# (the saved output shows coi_2k_01_openie_results.csv being stored)
from google.colab import files
uploaded = files.upload()

Saving coi_2k_01_openie_results.csv to coi_2k_01_openie_results.csv


In [None]:
import pandas as pd

# Read the uploaded CSV into a DataFrame, print its (rows, columns) shape,
# and display the first rows as the cell output.
dt =pd.read_csv('coi_2k_01_openie_results.csv')
print(dt.shape)
dt.head()

(11028, 5)


Unnamed: 0.1,Unnamed: 0,V1,V2,V3,V4
0,1,1.0,V. Teixeira,are employees of,Society
1,2,1.0,B. Ward,are employees of,Respiratory Society
2,3,1.0,V. Teixeira,are employees of,European Society
3,4,1.0,B. Ward,are,employees of European Respiratory Society
4,5,1.0,V. Teixeira,are employees of,Respiratory Society


In [None]:
# Work on an independent copy of column V2 and preview its first ten values
dt_notes = dt['V2'].copy()
dt_notes.head(10)

0    V. Teixeira
1        B. Ward
2    V. Teixeira
3        B. Ward
4    V. Teixeira
5    Disclosures
6        B. Ward
7        B. Ward
8        B. Ward
9        B. Ward
Name: V2, dtype: object

# Tokenization & Sentence split 



In [None]:
import spacy

# Load the small English model and run it over one conflict-of-interest statement.
nlp = spacy.load('en_core_web_sm')
coi_text = "Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen, Novartis, and Amgen as well as research funding from Sanofi. The remaining authors declare no competing financial interests."
doc = nlp(coi_text)

# Show spaCy's tokenization, one token per line.
for tok in doc:
  print(tok.text)

Conflict
-
of
-
interest
disclosure
:
M.M.
received
travel
grants
from
Celgene
,
Abbvie
,
Amgen
,
and
Janssen
and
acts
on
advisory
boards
for
Amgen
.
D.H.
received
research
funding
from
Sanofi
and
EngMab
,
as
well
as
personal
fees
outside
this
work
from
Celgene
.
H.S.
received
speakers
honoraria
and
travel
grants
from
Celgene
,
Amgen
,
and
Sanofi
.
I.G.H.S.-W.
received
research
funding
from
Janssen
and
Novartis
.
C.S.
received
funding
outside
this
work
from
Novartis
,
Celgene
,
and
Janssen
.
K.C.W.
received
consultancy
honoraria
from
Amgen
,
Takeda
,
and
BMS
,
as
well
as
honoraria
from
Novartis
,
research
funding
from
Janssen
and
Celgene
,
and
consultancy
honoraria
from
Onyx
.
H.G.
received
consultancy
honoraria
and
research
funding
from
Janssen
,
Celgene
,
and
Novartis
,
as
well
as
consultancy
honoraria
from
Onyx
and
Millenium
and
fees
outside
this
work
from
BMS
.
J.H.
received
speakers
honoraria
from
Celgene
,
acts
on
advisory
boards
for
Janssen
,
BMS
,
and
Amgen
,
and
received
consu

In [None]:
# Plain whitespace split of the same text, for comparison with spaCy's tokens:
# as the output shows, punctuation stays attached to the words (e.g. 'Celgene,').
tokens = doc.text.split()
print(tokens)

['Conflict-of-interest', 'disclosure:', 'M.M.', 'received', 'travel', 'grants', 'from', 'Celgene,', 'Abbvie,', 'Amgen,', 'and', 'Janssen', 'and', 'acts', 'on', 'advisory', 'boards', 'for', 'Amgen.', 'D.H.', 'received', 'research', 'funding', 'from', 'Sanofi', 'and', 'EngMab,', 'as', 'well', 'as', 'personal', 'fees', 'outside', 'this', 'work', 'from', 'Celgene.', 'H.S.', 'received', 'speakers', 'honoraria', 'and', 'travel', 'grants', 'from', 'Celgene,', 'Amgen,', 'and', 'Sanofi.', 'I.G.H.S.-W.', 'received', 'research', 'funding', 'from', 'Janssen', 'and', 'Novartis.', 'C.S.', 'received', 'funding', 'outside', 'this', 'work', 'from', 'Novartis,', 'Celgene,', 'and', 'Janssen.', 'K.C.W.', 'received', 'consultancy', 'honoraria', 'from', 'Amgen,', 'Takeda,', 'and', 'BMS,', 'as', 'well', 'as', 'honoraria', 'from', 'Novartis,', 'research', 'funding', 'from', 'Janssen', 'and', 'Celgene,', 'and', 'consultancy', 'honoraria', 'from', 'Onyx.', 'H.G.', 'received', 'consultancy', 'honoraria', 'and', 

In [None]:
# Keep the text of every token that is neither punctuation nor whitespace.
# The explicit `not (a or b)` replaces `not a | b`, which only gave this result
# because `|` binds tighter than `not`.
token_without_punct = [token.orth_ for token in doc if not (token.is_punct or token.is_space)]
print(token_without_punct)

['Conflict', 'of', 'interest', 'disclosure', 'M.M.', 'received', 'travel', 'grants', 'from', 'Celgene', 'Abbvie', 'Amgen', 'and', 'Janssen', 'and', 'acts', 'on', 'advisory', 'boards', 'for', 'Amgen', 'D.H.', 'received', 'research', 'funding', 'from', 'Sanofi', 'and', 'EngMab', 'as', 'well', 'as', 'personal', 'fees', 'outside', 'this', 'work', 'from', 'Celgene', 'H.S.', 'received', 'speakers', 'honoraria', 'and', 'travel', 'grants', 'from', 'Celgene', 'Amgen', 'and', 'Sanofi', 'I.G.H.S.-W.', 'received', 'research', 'funding', 'from', 'Janssen', 'and', 'Novartis', 'C.S.', 'received', 'funding', 'outside', 'this', 'work', 'from', 'Novartis', 'Celgene', 'and', 'Janssen', 'K.C.W.', 'received', 'consultancy', 'honoraria', 'from', 'Amgen', 'Takeda', 'and', 'BMS', 'as', 'well', 'as', 'honoraria', 'from', 'Novartis', 'research', 'funding', 'from', 'Janssen', 'and', 'Celgene', 'and', 'consultancy', 'honoraria', 'from', 'Onyx', 'H.G.', 'received', 'consultancy', 'honoraria', 'and', 'research', 'f

In [None]:
# Upload the conflict-of-interest text file from the local machine
# (the saved output shows coi_69.txt being stored)
from google.colab import files
uploaded = files.upload()


Saving coi_69.txt to coi_69.txt


In [None]:
# Read the uploaded disclosure file into `notes`, one list entry per line
# (line endings are kept, exactly as readlines() returns them).
with open('coi_69.txt', 'r') as fin:
  notes = fin.readlines()
print(notes)
print(len(notes))

['Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen,

In [None]:
# Run the spaCy pipeline on the first line of the file and print the document's text
doc = nlp(notes[0])
print(doc)

Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen, N

In [None]:
# Token
# Keep the text of every token that is neither punctuation nor whitespace.
# The explicit `not (a or b)` replaces `not a | b`, which only gave this result
# because `|` binds tighter than `not`.
token_without_punct = [token.orth_ for token in doc if not (token.is_punct or token.is_space)]
print(token_without_punct)

['Conflict', 'of', 'interest', 'disclosure', 'M.M.', 'received', 'travel', 'grants', 'from', 'Celgene', 'Abbvie', 'Amgen', 'and', 'Janssen', 'and', 'acts', 'on', 'advisory', 'boards', 'for', 'Amgen', 'D.H.', 'received', 'research', 'funding', 'from', 'Sanofi', 'and', 'EngMab', 'as', 'well', 'as', 'personal', 'fees', 'outside', 'this', 'work', 'from', 'Celgene', 'H.S.', 'received', 'speakers', 'honoraria', 'and', 'travel', 'grants', 'from', 'Celgene', 'Amgen', 'and', 'Sanofi', 'I.G.H.S.-W.', 'received', 'research', 'funding', 'from', 'Janssen', 'and', 'Novartis', 'C.S.', 'received', 'funding', 'outside', 'this', 'work', 'from', 'Novartis', 'Celgene', 'and', 'Janssen', 'K.C.W.', 'received', 'consultancy', 'honoraria', 'from', 'Amgen', 'Takeda', 'and', 'BMS', 'as', 'well', 'as', 'honoraria', 'from', 'Novartis', 'research', 'funding', 'from', 'Janssen', 'and', 'Celgene', 'and', 'consultancy', 'honoraria', 'from', 'Onyx', 'H.G.', 'received', 'consultancy', 'honoraria', 'and', 'research', 'f

# Lemmatization

In [None]:
# Lemmatization
# For each token print the token itself, `lemma` and `lemma_`; in the saved
# output `lemma` is a large integer and `lemma_` is the readable lemma string.
doc = nlp("Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen, Novartis, and Amgen as well as research funding from Sanofi. The remaining authors declare no competing financial interests.")
for token in doc:
  print(token, token.lemma, token.lemma_)

Conflict 1656665257010592753 conflict
- 9153284864653046197 -
of 886050111519832510 of
- 9153284864653046197 -
interest 4982881623027089361 interest
disclosure 6606664241527507269 disclosure
: 11532473245541075862 :
M.M. 4805209739451631105 M.M.
received 9255145433075943736 receive
travel 9016120516514741834 travel
grants 12140365075417128432 grant
from 7831658034963690409 from
Celgene 17406872118705027351 Celgene
, 2593208677638477497 ,
Abbvie 15943484054927552683 Abbvie
, 2593208677638477497 ,
Amgen 2682252856093640495 Amgen
, 2593208677638477497 ,
and 2283656566040971221 and
Janssen 18133833143932169039 Janssen
and 2283656566040971221 and
acts 4330217062942772869 act
on 5640369432778651323 on
advisory 7580777542555033349 advisory
boards 14899812206273857344 board
for 16037325823156266367 for
Amgen 2682252856093640495 Amgen
. 12646065887601541794 .
D.H. 13037265122536628748 D.H.
received 9255145433075943736 receive
research 4129690550177407157 research
funding 3726624177514968955 fun

# POS & Morphological tagging 

In [None]:
# POS Tagging
# For each token print its text, lemma, coarse POS, fine-grained tag, dependency
# label, shape, and the is_alpha / is_stop flags.
# NOTE(review): the saved output of this cell is a NameError. `nlp` is the only
# name not defined inside the cell, so it was presumably run before the spaCy
# model was loaded — re-run after the model-loading cell.
doc = nlp("Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen, Novartis, and Amgen as well as research funding from Sanofi. The remaining authors declare no competing financial interests.")
for token in doc:
  print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

NameError: ignored

In [None]:
# Parse the first line of the file and print each token with its coarse POS tag.
# The saved output contains at least one questionable tag (I.G.H.S.-W. as PUNCT).
doc = nlp(notes[0])
for token in doc:
  print(token.text, token.pos_)

Conflict NOUN
- PUNCT
of ADP
- PUNCT
interest NOUN
disclosure NOUN
: PUNCT
M.M. PROPN
received VERB
travel NOUN
grants NOUN
from ADP
Celgene PROPN
, PUNCT
Abbvie PROPN
, PUNCT
Amgen PROPN
, PUNCT
and CCONJ
Janssen PROPN
and CCONJ
acts VERB
on ADP
advisory ADJ
boards NOUN
for ADP
Amgen PROPN
. PUNCT
D.H. PROPN
received VERB
research NOUN
funding NOUN
from ADP
Sanofi PROPN
and CCONJ
EngMab PROPN
, PUNCT
as ADV
well ADV
as SCONJ
personal ADJ
fees NOUN
outside ADP
this DET
work NOUN
from ADP
Celgene PROPN
. PUNCT
H.S. PROPN
received VERB
speakers NOUN
honoraria NOUN
and CCONJ
travel NOUN
grants NOUN
from ADP
Celgene PROPN
, PUNCT
Amgen PROPN
, PUNCT
and CCONJ
Sanofi PROPN
. PUNCT
I.G.H.S.-W. PUNCT
received VERB
research NOUN
funding NOUN
from ADP
Janssen PROPN
and CCONJ
Novartis PROPN
. PUNCT
C.S. PROPN
received VERB
funding NOUN
outside ADP
this DET
work NOUN
from ADP
Novartis PROPN
, PUNCT
Celgene PROPN
, PUNCT
and CCONJ
Janssen PROPN
. PUNCT
K.C.W. PROPN
received VERB
consultancy NOUN
h

# Named Entities

In [None]:
# Named entities: print each entity's text, character span and label.
# The saved output shows the small model mislabelling several names
# (e.g. Sanofi as DATE, Novartis as PERSON), so treat the labels as approximate.
doc = nlp("Conflict-of-interest disclosure: M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen. D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene. H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi. I.G.H.S.-W. received research funding from Janssen and Novartis. C.S. received funding outside this work from Novartis, Celgene, and Janssen. K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx. H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS. J.H. received speakers honoraria from Celgene, acts on advisory boards for Janssen, BMS, and Amgen, and received consultancy honoraria from Janssen, Novartis, and Amgen as well as research funding from Sanofi. The remaining authors declare no competing financial interests.")
for ent in doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_)

M.M. 33 37 GPE
Celgene 66 73 GPE
Abbvie 75 81 GPE
Amgen 83 88 GPE
Janssen 94 101 ORG
Amgen 134 139 GPE
D.H. 141 145 PERSON
Sanofi 177 183 DATE
EngMab 188 194 GPE
Celgene 244 251 GPE
Celgene 309 316 GPE
Amgen 318 323 GPE
Sanofi 329 335 DATE
Janssen 380 387 GPE
Novartis 392 400 PERSON
Novartis 447 455 PERSON
Celgene 457 464 PERSON
Janssen 470 477 PERSON
K.C.W. 479 485 ORG
consultancy honoraria 495 516 PERSON
Amgen 522 527 GPE
Takeda 529 535 ORG
BMS 541 544 ORG
Novartis 572 580 PERSON
Janssen 604 611 GPE
Celgene 616 623 GPE
consultancy honoraria 629 650 PERSON
Onyx 656 660 GPE
H.G. 662 666 PERSON
Janssen 724 731 GPE
Celgene 733 740 GPE
Novartis 746 754 PERSON
Onyx 794 798 EVENT
Millenium 803 812 ORG
BMS 845 848 ORG
J.H. 850 854 PERSON
Celgene 888 895 GPE
Janssen 925 932 GPE
BMS 934 937 ORG
Amgen 943 948 ORG
Janssen 990 997 GPE
Novartis 999 1007 PERSON
Amgen 1013 1018 ORG
Sanofi 1052 1058 DATE


In [None]:
# Same entity listing, this time on the first line read from the file:
# entity text, start and end character offsets, and label.
doc = nlp(notes[0])
for ent in doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_)

M.M. 33 37 GPE
Celgene 66 73 GPE
Abbvie 75 81 GPE
Amgen 83 88 GPE
Janssen 94 101 ORG
Amgen 134 139 GPE
D.H. 141 145 PERSON
Sanofi 177 183 DATE
EngMab 188 194 GPE
Celgene 244 251 GPE
Celgene 309 316 GPE
Amgen 318 323 GPE
Sanofi 329 335 DATE
Janssen 380 387 GPE
Novartis 392 400 PERSON
Novartis 447 455 PERSON
Celgene 457 464 PERSON
Janssen 470 477 PERSON
K.C.W. 479 485 ORG
consultancy honoraria 495 516 PERSON
Amgen 522 527 GPE
Takeda 529 535 ORG
BMS 541 544 ORG
Novartis 572 580 PERSON
Janssen 604 611 GPE
Celgene 616 623 GPE
consultancy honoraria 629 650 PERSON
Onyx 656 660 GPE
H.G. 662 666 PERSON
Janssen 724 731 GPE
Celgene 733 740 GPE
Novartis 746 754 PERSON
Onyx 794 798 EVENT
Millenium 803 812 ORG
BMS 845 848 ORG
J.H. 850 854 PERSON
Celgene 888 895 GPE
Janssen 925 932 GPE
BMS 934 937 ORG
Amgen 943 948 ORG
Janssen 990 997 GPE
Novartis 999 1007 PERSON
Amgen 1013 1018 ORG
Sanofi 1052 1058 DATE


## Entity Visualizer

In [None]:
from spacy import displacy
# Render the current document's entities with displaCy's "ent" style, inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Sentence segmentation: print each sentence spaCy found, numbered from 1
for sent_no, sentence in enumerate(doc.sents, start=1):
  print(f"Sentence number {sent_no}:{sentence}")

Sentence number 1:Conflict-of-interest disclosure:
Sentence number 2:M.M. received travel grants from Celgene, Abbvie, Amgen, and Janssen and acts on advisory boards for Amgen.
Sentence number 3:D.H. received research funding from Sanofi and EngMab, as well as personal fees outside this work from Celgene.
Sentence number 4:H.S. received speakers honoraria and travel grants from Celgene, Amgen, and Sanofi.
Sentence number 5:I.G.H.S.-W. received research funding from Janssen and Novartis.
Sentence number 6:C.S. received funding outside this work from Novartis, Celgene, and Janssen.
Sentence number 7:K.C.W. received consultancy honoraria from Amgen, Takeda, and BMS, as well as honoraria from Novartis, research funding from Janssen and Celgene, and consultancy honoraria from Onyx.
Sentence number 8:H.G. received consultancy honoraria and research funding from Janssen, Celgene, and Novartis, as well as consultancy honoraria from Onyx and Millenium and fees outside this work from BMS.
Senten

In [None]:
# Dependency trees: collect the document's sentences into a list and render
# them with displaCy's "dep" style, inline in the notebook
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep", jupyter=True)

# Multi-word Token Expansion

# Dependency Parsing 