# This notebook works on extracting unique named entities and organizations from KDD papers and passing them into a list.  

In [628]:
import pandas as pd
import numpy as np
import nltk
import os
import subprocess
import unicodedata
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk import Tree
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import re
from operator import itemgetter
import polyglot
import string

In [629]:
path        = os.path.abspath(os.getcwd())
TESTDIR     = os.path.normpath(os.path.join(os.path.expanduser("~"),"projects","LC3-Creations", "examples","KDDsample"))


In [680]:
'''
I experienced unicode problems early on.  Everytime I had an error, I scoured the internet for solutions. Here's the credit.



- For Typeerror codes using subprocess to convert pdf2txt output to straight unicode --> http://stackoverflow.com/questions/33283603/python-popen-communicate-str-encodeencoding-utf-8-errors-ignore-cr
- For problems with ASCII characters --> http://stackoverflow.com/questions/175240/how-do-i-convert-a-files-format-from-unicode-to-ascii-using-python
- For unicode characters left in unicode converted to a string  --> http://stackoverflow.com/questions/8689795/how-can-i-remove-non-ascii-characters-but-leave-periods-and-spaces-using-python

'''


a = unicode(subprocess.check_output(['pdf2txt.py',str(os.path.normpath(os.path.join(TESTDIR,"p29.pdf")))]),errors='ignore')
document = filter(lambda x: x in string.printable,unicodedata.normalize('NFKD', a).encode('ascii','ignore').decode('unicode_escape').encode('ascii','ignore'))

In [690]:
"returns named entity chunks in a given text"
tagged = nltk.pos_tag(nltk.word_tokenize(re.sub('[\s]'," ", document)))
entities = nltk.chunk.ne_chunk(tagged)
# Another entity extractor
st = StanfordNERTagger('/Users/linwood/stanford-corenlp-full-2015-04-20/classifiers/english.muc.7class.distsim.crf.ser.gz',
       '/Users/linwood/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar',
       encoding='utf-8')
tokenized_text = word_tokenize(re.sub('[\s]'," ", document))
stanentities = st.tag(tokenized_text)

In [704]:
tokenized_text

['Estimating',
 'Local',
 'Intrinsic',
 'Dimensionality',
 'Laurent',
 'Amsaleg',
 'Equipe',
 'LINKMEDIA',
 ',',
 'CNRS/IRISA',
 'Rennes',
 ',',
 'France',
 'Campus',
 'Universitaire',
 'de',
 'Beaulieu',
 '35042',
 'Rennes',
 'Cedex',
 ',',
 'France',
 'laurent.amsaleg',
 '@',
 'irisa.fr',
 'Stphane',
 'Girard',
 'Equipe',
 'MISTIS',
 ',',
 'INRIA',
 'Grenoble',
 ',',
 'France',
 'Inovalle',
 ',',
 '655',
 ',',
 'Montbonnot',
 '38334',
 'Saint-Ismier',
 'Cedex',
 ',',
 'stephane.girard',
 '@',
 'inria.fr',
 'France',
 'Teddy',
 'Furon',
 'Equipe',
 'LINKMEDIA',
 ',',
 'INRIA/IRISA',
 'Rennes',
 ',',
 'France',
 'Campus',
 'Universitaire',
 'de',
 'Beaulieu',
 '35042',
 'Rennes',
 'Cedex',
 ',',
 'France',
 'teddy.furon',
 '@',
 'inria.fr',
 'Ken-ichi',
 'Kawarabayashi',
 'National',
 'Institute',
 'of',
 'Informatics',
 ',',
 'Japan',
 '2-1-2',
 'Hitotsubashi',
 ',',
 'Chiyoda-ku',
 'Tokyo',
 '101-8430',
 ',',
 'Japan',
 'k_keniti',
 '@',
 'nii.ac.jp',
 'Oussama',
 'Chelly',
 'Nationa

Below, I established two lists to hold the values that I extract from the text.  This itemgetter function will check for unique values.  First, I iterate over the extracted entities and see if the objects is a nltk.tree.Tree with a "Person" label.  If it is, and the length is equal to 1 (first or last name only), I append that value to the list. If it's larger, I iterate of the entity tree and pull out the first value only using itemgetter.  Then, I join the values from the list and append it to the destination list.  

# Getting a list out of NLTK's standard NE chunker

In [697]:
from operator import itemgetter

persons = []
organizations = []
locations =[]

for l in entities:
    if isinstance(l,nltk.tree.Tree):
        if l.label() == 'PERSON':
            if len(l)== 1:
                if l[0][0] in persons:
                    pass
                else:
                    persons.append(l[0][0])
            else:
                if " ".join(map(itemgetter(0), l)) in persons:
                    pass
                else:
                    persons.append(" ".join(map(itemgetter(0), l)))
                    
for o in entities:
    if isinstance(o,nltk.tree.Tree):
        if o.label() == 'ORGANIZATION':
            if len(o)== 1:
                if o[0][0] in organizations:
                    pass
                else:
                    organizations.append(o[0][0])
            else:
                if " ".join(map(itemgetter(0), o)) in organizations:
                    pass
                else:
                    organizations.append(" ".join(map(itemgetter(0), o)))
                    
for o in entities:
    if isinstance(o,nltk.tree.Tree):
        if o.label() == 'LOCATION':
            if len(o)== 1:
                if o[0][0] in locations:
                    pass
                else:
                    locations.append(o[0][0])
            else:
                if " ".join(map(itemgetter(0), o)) in locations:
                    pass
                else:
                    locations.append(" ".join(map(itemgetter(0), o)))
                    
                
print persons
print
print
print organizations
print
print
print locations

['Local Intrinsic Dimensionality Laurent Amsaleg Equipe LINKMEDIA', 'Rennes', 'Campus Universitaire', 'Rennes Cedex', 'Stphane Girard Equipe MISTIS', 'Furon Equipe LINKMEDIA', 'Oussama Chelly National Institute', 'Michael E. Houle National Institute', 'Michael Nett Google', 'Houle', 'Karger', 'Subject Descriptors', 'Parameter', 'Permissions', 'Keywords', 'Isometric Mapping', 'Linear Embedding', 'Principal', 'Analysis', 'Weibull', 'Maximum', 'Hill', 'Pr', 'Fisher', 'Tippett', 'Gnedenko', 'Given', 'Ruhls', 'J', 'M', 'Hein', 'P', 'D', 'Uniformly', 'Amsterdam Library', 'Object Images', 'Likewise', 'Faster', 'D IDMLE', 'Hein Takens', 'Dataset', 'Data', 'Secular', 'Residual Life Time', 'Cambridge University Press', 'J. Fauqueur', 'Pattern Recogn', 'Statistical Modeling', 'Extreme Values', 'Hero', 'Sys', 'Smallest Member', 'Math', 'Cambridge Phil', 'Fraga Alves', 'Portugalia Mathematica', 'Terme Maximum', 'Serie Aleatoire', 'Ann', 'Gupta', 'Audibert', 'Inlierness', 'Technical Report', 'Nett',

In [693]:
from operator import itemgetter

for o in entities:
    if isinstance(o,nltk.tree.Tree):
        if o.label() == 'ORGANIZATION' or o.label() == 'GPE':
            if len(o)>1:
                print " ".join(map(itemgetter(0), o))

INRIA Grenoble
France Inovalle
Component Analysis
Correlation Dimension
Likelihood Estimation
ID2 X
Distance Distributions
Real Data
Swiss Roll
IDANN_SIFT1B35 Dataset
Real Data
JSPS Kakenhi Kiban
Generic Image Retrieval
Asilomar Conf
Frequency Distribution
Methods of Stat.
Bounded Geometries
Distance Distributions
Document Recognition
Kernel Eigenvalue Problem
ACM Trans
Intrinsic Dimensional Outlier


I tried to iterate over the extracted list of entities to get a better break between person's and their university name.  

In [694]:
tokens = [nltk.word_tokenize(l) for l in persons]
fin = [nltk.chunk.ne_chunk(nltk.pos_tag(l)) for l in tokens]
fin;

In [None]:
new =[word_tokenize(l) for l in persons]
stan = [st.tag(l) for l in new]
stan;

# Creating lists of named entities from Stanford's NER model

This function looks though an extracted stanford ner list, and finds continuous entitiy labels.  This should create first name, last name records of entities.  

In [710]:
def get_continuous_chunks(tagged_sent):
    continuous_chunk = []
    current_chunk = []

    for token, tag in tagged_sent:
        if tag != "O":
            current_chunk.append((token, tag))
        else:
            if current_chunk: # if the current chunk is not empty
                continuous_chunk.append(current_chunk)
                current_chunk = []
    # Flush the final current_chunk into the continuous_chunk, if any.
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk

ne_tagged_sent = [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]

named_entities = get_continuous_chunks(stanentities)
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]


In [711]:
named_entities = get_continuous_chunks(stanentities)
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
named_entities_str_tag

[(u'Estimating Local Intrinsic Dimensionality Laurent Amsaleg Equipe LINKMEDIA',
  u'ORGANIZATION'),
 (u'CNRSIRISA Rennes', u'LOCATION'),
 (u'France Campus Universitaire de Beaulieu 35042 Rennes Cedex',
  u'ORGANIZATION'),
 (u'France', u'LOCATION'),
 (u'Stphane Girard Equipe MISTIS', u'PERSON'),
 (u'INRIA Grenoble', u'LOCATION'),
 (u'France Inovalle', u'LOCATION'),
 (u'France Teddy Furon Equipe LINKMEDIA', u'ORGANIZATION'),
 (u'INRIAIRISA Rennes', u'LOCATION'),
 (u'France Campus Universitaire de Beaulieu 35042 Rennes Cedex',
  u'ORGANIZATION'),
 (u'France', u'LOCATION'),
 (u'Kawarabayashi National Institute of Informatics', u'ORGANIZATION'),
 (u'Japan', u'LOCATION'),
 (u'Hitotsubashi', u'LOCATION'),
 (u'Chiyoda-ku Tokyo', u'LOCATION'),
 (u'Japan', u'LOCATION'),
 (u'Oussama Chelly National Institute of Informatics', u'ORGANIZATION'),
 (u'Japan', u'LOCATION'),
 (u'Hitotsubashi', u'LOCATION'),
 (u'Chiyoda-ku Tokyo', u'LOCATION'),
 (u'Japan', u'LOCATION'),
 (u'Michael E. Houle National Ins

In [715]:
for l,m in named_entities_str_tag:
    if m == 'PERSON':
        print l
    else:
        pass

Stphane Girard Equipe MISTIS
Michael Nett Google
Houle
Karger
Weibull
Weibull
Weibull
Houle
Houle
Fisher
Tippett
Balkema
Haan
Haan
Haan
Fisher
Iverson
Karger
Hein
Mobius
ANN SIFT1B
ANN SIFT1B
Hein Takens
Hein Takens
Hein Takens
Hein Takens
Hein Takens
L. Amsaleg
M. E. Houle
K. Kawarabayashi
M. E. Houle
A. Balkema
L. de Haan
] N. Bingham
] N. Boujemaa
J. Fauqueur
M. Ferecatu
F. Fleuret
V. Gouet
B. LeSaux
H. Sahbi
] C. Bouveyron
G. Celeux
S. Girard
] J. Bruske
G. Sommer
A. Vinciarelli
] S. Coles
S. Chawla
M. E. Houle
] R. A. Fisher
L. H. C. Tippett
] M. I. Fraga Alves
L. de Haan
T. Lin
] M. I. Fraga Alves
M. I. Gomes
L. de Haan
B. V. Gnedenko
A. Gupta
R. Krauthgamer
J. R. Lee
] M. Hein
B. M. Hill
M. E. Houle
M. E. Houle
Hubness
M. E. Houle
H. Kashima
M. Nett
M. E. Houle
X. Ma
M. Nett
V. Oria
M. E. Houle
X. Ma
V. Oria
M. E. Houle
M. Nett
] H. Jegou
R. Tavenard
M. Douze
L. Amsaleg
I. Jollie
] D. R. Karger
M. Ruhl
J. Karhunen
J. Joutsensalo
] Y. LeCun
L. Bottou
Y. Bengio
P. Haner
C. R. Rao


In [640]:
list1 = range(10)

In [641]:
list2 = [i for i in xrange(7,17,1)]


In [642]:
set(list1) & set(list2)

{7, 8, 9}

In [643]:
def parts_of_speech(corpus):
    "returns named entity chunks in a given text"
    tagged = nltk.pos_tag(nltk.word_tokenize(corpus))
    entities = nltk.chunk.ne_chunk(tagged)
    # Another entity extractor
    st = StanfordNERTagger('/Users/linwood/stanford-corenlp-full-2015-04-20/classifiers/english.muc.7class.distsim.crf.ser.gz',
           '/Users/linwood/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar',
           encoding='utf-8')
    tokenized_text = word_tokenize(corpus)
    stanentities = st.tag(tokenized_text)
    return entities
def find_entities(chunks):
    "given list of tagged parts of speech, returns unique named entities"

    def traverse(tree):
        "recursively traverses an nltk.tree.Tree to find named entities"
        entity_names = []
    
        if hasattr(tree, 'node') and tree.node:
            if tree.node == 'NE':
                entity_names.append(' '.join([child[0] for child in tree]))
            else:
                for child in tree:
                    entity_names.extend(traverse(child))
    
        return entity_names
    
    named_entities = []
    
    for chunk in chunks:
        entities = sorted(list(set([word for tree in chunk
                            for word in traverse(tree)])))
        for e in entities:
            if e not in named_entities:
                named_entities.append(e)
    return named_entities

In [644]:
import os
os.getcwd()

'/Users/linwood/projects/LC3-Creations/notebooks'

# Extracting entities and creating lists using Polyglot

In [645]:
from polyglot.text import Text
e=Text(re.sub('[\s]'," ",document[:10000])).entities

In [718]:
Text(filter(lambda x: x in string.printable, document)).entities;

In [719]:
# problem with unicode; have to get rid of this somehow or the extraction errors out.
# solution from http://stackoverflow.com/questions/8689795/how-can-i-remove-non-ascii-characters-but-leave-periods-and-spaces-using-python

import string
s=document
Text(filter(lambda x: x in string.printable, s)).entities;

In [648]:
# when I get an error for some unicode character, try to get text up to the error; will need a "while" loop

Text(re.sub('[\s]'," ",(document[len(document)-(len(document)/10):len(document)]))).entities;

In [716]:
Text(re.sub('[\s]'," ",document[:2000])).entities;

This code iterates over the polyglot extracted entities and creates a list of person, locations, and organizations

In [668]:
import itertools
import unicodedata

def extraction(corpus):
    
    # extract entities from a single string; remove whitespace characters
    try:
        e = Text(re.sub('[\s]'," ",corpus)).entities
    except:
        pass #e = Text(re.sub("(r'(x0)'," ","(re.sub('[\s]'," ",corpus)))).entities
    
    current_person =[]
    persons =[]
    current_org=[]
    organizations=[]
    current_loc=[]
    locations=[]

    for l in e:
        if l.tag == 'I-PER':
            for m in l:
                current_person.append(unicodedata.normalize('NFKD', m).encode('ascii','ignore'))
            else:
                    if current_person: # if the current chunk is not empty
                        persons.append(" ".join(current_person))
                        current_person = []
        elif l.tag == 'I-ORG':
            for m in l:
                current_org.append(unicodedata.normalize('NFKD', m).encode('ascii','ignore'))
            else:
                    if current_org: # if the current chunk is not empty
                        organizations.append(" ".join(current_org))
                        current_org = []
        elif l.tag == 'I-LOC':
            for m in l:
                current_loc.append(unicodedata.normalize('NFKD', m).encode('ascii','ignore'))
            else:
                    if current_loc: # if the current chunk is not empty
                        locations.append(" ".join(current_loc))
                        current_loc = []
    results = {}
    results['persons']=persons
    results['organizations']=organizations
    results['locations']=locations
    
    return results

In [717]:
extraction(document)['persons']

['Laurent',
 'Equipe',
 'Stphane Girard Equipe MISTIS',
 'Ismier',
 'Teddy Furon Equipe',
 'IRISA',
 'ichi',
 'Michael E',
 'Michael Nett Google',
 'Com',
 'estima',
 'Fisher',
 'Tippett',
 'de Haan',
 'de Haan',
 'Fisher',
 'Tippett',
 'Theo',
 'Fisher',
 'Tippett',
 'satises lim',
 'Fisher',
 'ned',
 'ID2',
 'Iverson',
 'IDX',
 'Sa',
 'Hein',
 'MLE',
 'ber',
 'benet',
 'FrequencyEstimated',
 'FrequencyEstimated',
 'Hein',
 'IDMoM',
 'Hein',
 'IDMoM IDPWM',
 'MiNDml1',
 'Hein Takens',
 'MiNDml1',
 'Hein Takens',
 'MiNDml1',
 'Hein Takens',
 'signicantly',
 'ACKNOWLEDGMENTS',
 'Furon',
 'Kawarabayashi',
 'JST ERATO',
 'Houle',
 'JSPS Kakenhi',
 'Grant',
 'de Haan',
 'Bingham',
 'Goldie',
 'Teugels',
 'Fauqueur',
 'Sommer',
 'Coles',
 'Costa',
 'de Vries',
 'Chawla',
 '. Fisher',
 'Tippett',
 'Cambridge Phil',
 'Soc',
 'Fraga Alves',
 'de Haan',
 'Fraga Alves',
 'Gomes',
 'de Haan',
 'Ann .',
 'Gupta',
 'Lee',
 'Hein',
 'J',
 '. Stat',
 'Jollie',
 'Karhunen',
 'Pickands',
 '. Stat',
 'R

In [670]:
document;

In [671]:
regexp = re.compile("REFERENCES(.*)$")
references = Text(regexp.search(re.sub('[\s]'," ",document)).group(1)).entities

In [672]:
regexp1 = re.compile("REFERENCES(.*)$")
references = Text(regexp.search(re.sub('[\s]'," ",document)).group(1)).entities

In [679]:
len(extraction(regexp.search(re.sub('[\s]'," ",document)).group(1))['persons'])

41

# Completed Features

In [674]:
# Counting the number of references in a research paper

len(re.findall('\[(.*?)\]',regexp.search(re.sub('[\s]'," ",document)).group(1)))

39

# Truth Sets to test extraction accuracy

In [675]:
#p19.pdf

p19pdf_authors=['Tim Althoff','Xin Luna Dong','Kevin Murphy','Safa Alai','Van Dang','Wei Zhang']
p19pdf_author_organizations=['Computer Science Department','Stanford University','Google']
p19pdf_author_locations=['Stanford, CA','Stanford','CA','Google','1600 Amphitheatre Parkway, Mountain View, CA 94043','1600 Amphitheatre Parkway','Mountain View']

p19pdf_references_authors =['A. Ahmed', 'C. H. Teo', 'S. Vishwanathan','A. Smola','J. Allan', 'R. Gupta', 'V. Khandelwal',
                           'D. Graus', 'M.-H. Peetz', 'D. Odijk', 'O. de Rooij', 'M. de Rijke','T. Huet', 'J. Biega', 
                            'F. M. Suchanek','H. Ji', 'T. Cassidy', 'Q. Li','S. Tamang', 'A. Kannan', 'S. Baker', 'K. Ramnath', 
                            'J. Fiss', 'D. Lin', 'L. Vanderwende',  'R. Ansary', 'A. Kapoor', 'Q. Ke', 'M. Uyttendaele',
                           'S. M. Katz','A. Krause','D. Golovin','J. Leskovec', 'A. Krause', 'C. Guestrin', 'C. Faloutsos', 
                            'J. VanBriesen','N. Glance','J. Li','C. Cardie','J. Li','C. Cardie','C.-Y. Lin','H. Lin','J. A. Bilmes'
                           'X. Ling','D. S. Weld', 'A. Mazeika', 'T. Tylenda','G. Weikum','M. Minoux', 'G. L. Nemhauser', 'L. A. Wolsey',
                            'M. L. Fisher','R. Qian','D. Shahaf', 'C. Guestrin','E. Horvitz','T. Althoff', 'X. L. Dong', 'K. Murphy', 'S. Alai',
                            'V. Dang','W. Zhang','R. A. Baeza-Yates', 'B. Ribeiro-Neto', 'D. Shahaf', 'J. Yang', 'C. Suen', 'J. Jacobs', 'H. Wang', 'J. Leskovec',
                           'W. Shen', 'J. Wang', 'J. Han','D. Bamman', 'N. Smith','K. Bollacker', 'C. Evans', 'P. Paritosh', 'T. Sturge', 'J. Taylor',
                           'R. Sipos', 'A. Swaminathan', 'P. Shivaswamy', 'T. Joachims','K. Sprck Jones','G. Calinescu', 'C. Chekuri', 'M. Pl','J. Vondrk',
                           'F. M. Suchanek', 'G. Kasneci','G. Weikum', 'J. Carbonell' ,'J. Goldstein','B. Carterette', 'P. N. Bennett', 'D. M. Chickering',
                            'S. T. Dumais','A. Dasgupta', 'R. Kumar','S. Ravi','Q. X. Do', 'W. Lu', 'D. Roth','X. Dong', 'E. Gabrilovich', 'G. Heitz', 'W. Horn', 
                            'N. Lao', 'K. Murphy',  'T. Strohmann', 'S. Sun','W. Zhang', 'M. Dubinko', 'R. Kumar', 'J. Magnani', 'J. Novak', 'P. Raghavan','A. Tomkins',
                           'U. Feige','F. M. Suchanek','N. Preda','R. Swan','J. Allan', 'T. Tran', 'A. Ceroni', 'M. Georgescu', 'K. D. Naini', 'M. Fisichella',
                           'T. A. Tuan', 'S. Elbassuoni', 'N. Preda','G. Weikum','Y. Wang', 'M. Zhu', 'L. Qu', 'M. Spaniol', 'G. Weikum',
                           'G. Weikum', 'N. Ntarmos', 'M. Spaniol', 'P. Triantallou', 'A. A. Benczr',  'S. Kirkpatrick', 'P. Rigaux','M. Williamson',
                           'X. W. Zhao', 'Y. Guo', 'R. Yan', 'Y. He','X. Li']


print "There are %r authors" % len(p19pdf_authors)
print  # white space
print "There are %r author organizations" %len(p19pdf_author_organizations)
print 
print "There are %r author locations" % len(p19pdf_author_locations)
print  
print "There are %r authors in the references" %len(p19pdf_references_authors)

There are 6 authors

There are 3 author organizations

There are 7 author locations

There are 152 authors in the references


In [676]:
#p29.pdf

p29pdf_authors=['Laurent Amsaleg','Stéphane Girard','Oussama Chelly','Teddy Furon','Michael E. Houle','Ken-ichi Kawarabayashi',
               'Michael Nett']
p29pdf_author_organizations=['Equipe LINKMEDIA','Campus Universitaire de Beaulieu','CNRS/IRISA Rennes','National Institute of Informatics',
                             'Equipe MISTIS INRIA','Google']
p29pdf_author_locations=['Campus Universitaire de Beaulieu','35042 Rennes Cedex, France','France','-1-2 Hitotsubashi, Chiyoda-ku Tokyo 101-8430, Japan',
                        'Japan','6-10-1 Roppongi, Minato-ku Tokyo 106-6126','Inovallée, 655, Montbonnot 38334 Saint-Ismier Cedex','Tokyo']

p29pdf_references_authors =['A. A. Balkema','L. de Haan','N. Bingham', 'C. Goldie','J. Teugels','N. Boujemaa', 'J. Fauqueur', 'M. Ferecatu', 'F. Fleuret',
                            'V. Gouet', 'B. LeSaux','H. Sahbi','C. Bouveyron', 'G. Celeux', 'S. Girard','J. Bruske', 'G. Sommer',
                           'F. Camastra','A. Vinciarelli','S. Coles','J. Costa' ,'A. Hero','T. de Vries', 'S. Chawla','M. E. Houle',
                           'R. A. Fisher','L. H. C. Tippett','M. I. Fraga Alves', 'L. de Haan','T. Lin','M. I. Fraga Alves', 'M. I. Gomes','L. de Haan',
                           'B. V. Gnedenko',' A. Gupta', 'R. Krauthgamer','J. R. Lee','A. Gupta', 'R. Krauthgamer','J. R. Lee','M. Hein','J.-Y. Audibert',
                           'B. M. Hill','M. E. Houle','M. E. Houle','M. E. Houle','M. E. Houle', 'H. Kashima', 'M. Nett','M. E. Houle', 'X. Ma', 'M. Nett',
                            'V. Oria','M. E. Houle', 'X. Ma', 'V. Oria','J. Sun','M. E. Houle','M. Nett','H. Jegou', 'R. Tavenard', 'M. Douze','L. Amsaleg',
                           'I. Jollie','D. R. Karger','M. Ruhl','J. Karhunen','J. Joutsensalo','Y. LeCun', 'L. Bottou', 'Y. Bengio', 'P. Haner',
                           'J. Pickands, III','C. R. Rao','S. T. Roweis','L. K. Saul','A. Rozza', 'G. Lombardi', 'C. Ceruti', 'E. Casiraghi', 'P. Campadelli',
                           'B. Scholkopf', 'A. J. Smola','K.-R. Muller','U. Shaft','R. Ramakrishnan',' F. Takens','J. Tenenbaum', 'V. D. Silva','J. Langford',
                           'J. B. Tenenbaum', 'V. De Silva','J. C. Langford','J. B. Tenenbaum', 'V. De Silva','J. C. Langford','J. Venna','S. Kaski',
                           'P. Verveer','R. Duin','J. von Brunken', 'M. E. Houle', 'A. Zimek','J. von Brunken', 'M. E. Houle','A. Zimek']

print "There are %r authors" % len(p29pdf_authors)
print  # white space
print "There are %r author organizations" %len(p29pdf_author_organizations)
print 
print "There are %r author locations" % len(p29pdf_author_locations)
print  
print "There are %r authors in the references" %len(p29pdf_references_authors)

There are 7 authors

There are 6 author organizations

There are 8 author locations

There are 106 authors in the references


In [677]:
re.sub('[\s]'," ",document)[9300:10500]
#regexp.search(re.sub('[\s]'," ",document)).group(1)[4900:6000]

'in which we compare our estimators with state-of-the-art global and local esti- mators. We also show that the empirical variance and convergence rates of the MLE (Hill) and MoM estima- tors are superior to those of the other local estimators studied.   experiments showing that local estimators are more ro- bust than global ones in the presence of noise in non- linear manifolds. Our experiments show that our ap- proaches are very competitive in this regard with other methods, both local and global.   proles of several real-world data sets in terms of LID, illustrating the degree of variability of complexity from region to region within a dataset. The proles demon- strate that a single global ID value is in general not sucient to fully characterize the complexity of real- world data.  2. CONTINUOUS INTRINSIC DIMENSION LID [17] aims to quantify the local ID of a feature space exclusively in terms of the distribution of inter-point dis- tances. Formally, let (Rm, d) be a domain equipped w

In [678]:
extraction(references)

UnboundLocalError: local variable 'e' referenced before assignment

In [None]:



class entities(object):
  def __init__(self):
    self.persons = extraction(document)['persons']
    self.organizations = extraction(document)['organizations']

my_shape = entities()
print (my_shape.persons)
print(my_shape.organizations)



In [None]:
import emailextractor
from emailextractor import file_to_str, get_emails

In [None]:
tuple(get_emails(document))