In [1]:
cd ..

/home/jovyan/work


In [2]:
import pandas as pd
import re
import spacy
from tqdm import tqdm_notebook as tqdm
from collections import Counter

df = pd.read_csv('local_data/NoMoreSilence_ProjectDataV2.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Collection Title,Title,Local Identifier,Type,Date,Date Type,Publication/Origination Info,Creator 1 Name,Creator 1 NameType,...,Subject (Topic) 4 Heading Type,Subject (Topic) 4 Source,Subject (Topic) 5 Heading,Subject (Topic) 5 Heading Type,Subject (Topic) 5 Source,Subject (Topic) 6 Heading,Subject (Topic) 6 Heading Type,Subject (Topic) 6 Source,Ocr text,Corrected Text
0,0,"AIDS Legal Referral Panel Records, 2000-46, Bo...","""Prop 64: The AIDS Initiative in California""",glbths_200046_003_002,text,1986,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,...,topic,lcsh,,,,,,,PROPOSITION 64 The AIDSInitiativein California...,proposition 64 the aids initiative in califor...
1,1,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Management,glbths_200046_004_004,text,circa 1992,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,...,topic,lcsh,,,,,,,MAKING YOUR WILL California State Aids Legal S...,making your will california state aids legal ...
2,2,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Board Meetings,glbths_200046_009_005,text,1995-1996,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,...,topic,lcsh,,,,,,,"January 11, 1997 Community Liaison Committee c...","january 11,1997 community liaison committee c..."
3,3,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Correspondence,glbths_200046_001_0010,text,1985-1987,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,...,topic,lcsh,,,,,,,^ GREAT REPUBLIC IIMSURAIMCE COMPANY i 470 SOU...,great republic iimsuraimce company i 470 sout...
4,4,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Letters of support,glbths_200046_003_006,text,1993,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,...,topic,lcsh,,,,,,,"SANFRANCISCOAIDSFOUNDATION P.O.BOX 426182,SANF...",san francisco aids foundation p. o. box 42618...


# Names

In [4]:
# Case-insensitive literal search for "dinner" in the titles. na=False makes a
# missing title count as "no match" instead of raising on a non-string value.
df.Title[df.Title.str.contains('dinner', case=False, regex=False, na=False)]

297    Leadership Recognition Dinner brochure, San Fr...
377    Program for the San Francisco AIDS Foundation ...
Name: Title, dtype: object

In [5]:
dinner_indices = [297, 377]
# these have a large number of names

In [6]:
nlp = spacy.load('en_core_web_md')

In [7]:
sliced_text = df.loc[dinner_indices, 'Corrected Text'].tolist()
raw_text = df.loc[dinner_indices, 'Ocr text'].tolist()

In [8]:
def remove_bad_chars(a_str, keep_chars=r'\w\d#,\.\-\(\)=\*\s'):
    """Return ``a_str`` with every character outside ``keep_chars`` deleted.

    ``keep_chars`` is the body of a regex character class (escapes included).
    It is negated, and each character matching the negated class is replaced
    by the empty string. With the default, word characters, digits,
    whitespace and ``# , . - ( ) = *`` are kept.
    """
    unwanted = re.compile('[^{}]'.format(keep_chars))
    return unwanted.sub('', a_str)

raw1_obj = nlp(remove_bad_chars(raw_text[0]))

# find names checking noun chunks against english vocab
def find_names(spacy_obj, is_valid_name=lambda a: a not in nlp.vocab, keep_context=True):
    """Collect name candidates from the noun chunks of ``spacy_obj``.

    Every token of every noun chunk is tested by calling ``is_valid_name`` on
    the token's text; chunks with no accepted token are skipped. The default
    predicate accepts a text when it is not in ``nlp.vocab``.

    Returns a list with one entry per chunk that had at least one hit:
    ``[tokens, chunk_text, chunk_start, chunk_end, chunk_label]`` when
    ``keep_context`` is true, otherwise ``[tokens]``.
    """
    found = []
    for chunk in tqdm(spacy_obj.noun_chunks):
        hits = [token for token in chunk if is_valid_name(token.text)]
        if not hits:
            continue
        if keep_context:
            entry = [hits, chunk.text, chunk.start, chunk.end, chunk.label_]
        else:
            entry = [hits]
        found.append(entry)
    return found
candidates = find_names(raw1_obj)
print(len(candidates))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


653


In [9]:
[(i, i.label_) for i in raw1_obj.ents if i.label_ == "PERSON"][:10]
# use spacy's NER tagger

[(William D. Glenn, 'PERSON'),
 (Maureen S., 'PERSON'),
 (Barbara Cattolica, 'PERSON'),
 (James R. Harrison Eunice Azzani Evelyn, 'PERSON'),
 (Balancio Elizabeth C. Burrell, 'PERSON'),
 (Byron Carlota Del Portillo, 'PERSON'),
 (William D. Lang, 'PERSON'),
 (M. Protos, 'PERSON'),
 (Rankin M.D. Victor Schachter, 'PERSON'),
 (Andrew P. Small Merritt A. Smith, 'PERSON')]

In [10]:
len(raw1_obj.ents)

3242

In [11]:
Counter(i.label_ for i in raw1_obj.ents)

Counter({'FAC': 12,
         'DATE': 44,
         'ORDINAL': 11,
         'ORG': 205,
         'PERSON': 2794,
         'GPE': 95,
         'WORK_OF_ART': 10,
         'CARDINAL': 35,
         'EVENT': 5,
         'TIME': 6,
         'LOC': 10,
         'QUANTITY': 1,
         'LAW': 4,
         'NORP': 7,
         'PRODUCT': 3})

In [12]:
candidates[:10]

[[[pgNbr=2], 'pgNbr=2 San Francisco AIDS Foundation BOARD', 65, 71, 'NP'],
 [[OLeary, Azzani, Balancio],
  'OLeary Secretary Barbara Cattolica-Hudson Treasurer James R. Harrison Eunice Azzani Evelyn Balancio Elizabeth C. Burrell',
  81,
  98,
  'NP'],
 [[Carlota], 'Byron Carlota Del Portillo William D. Lang', 99, 106, 'NP'],
 [[Blomstedt],
  'James C. Anderson Lia Belli Herbert Blomstedt',
  143,
  150,
  'NP'],
 [[Coblentz, Friia],
  'The Reverend Amos C. Brown William C. Coblentz Mrs. Ralph K. Davies Belva Davis Vincent J. Friia Richard Gere Whoopi Goldberg Robert D. Haas Carole',
  150,
  175,
  'NP'],
 [[pgNbr=3],
  'pgNbr=3 San Francisco AIDS Foundation DINNER COMMITTEE',
  196,
  203,
  'NP'],
 [[Krevans], 'The Honorable Art Julius R. Krevans', 203, 209, 'NP'],
 [[Baack, JoVanna, Luqu],
  'Lawrence J. Baack JoVanna Luqu Mr.',
  228,
  234,
  'NP'],
 [[Moed], 'Amy McCombs Alison Moed', 252, 256, 'NP'],
 [[Denebeim], 'The Honorable Libby Denebeim', 262, 266, 'NP']]

In [13]:
raw_text[0][:500]

'LEADERSHIP RECOGNITION DINNER San Francisco AIDS Foundation Leaders in the Fight Against AIDS March 22, 1990 pgNbr=1 The San Francisco AIDS Foundation would like to thank the following sponsors for their very generous support of our third annual Leadership Recognition Dinner: AT&T Bank of America Chevron, U.S.A. Pacific Gas and Electric Company Pacific Telesis Group Chevron AT&T IB PACIFIC^ TELESIS Bank of America pgNbr=2 San Francisco AIDS Foundation BOARD OF DIRECTORS President William D. Glen'

In [14]:
# Load the reference list of names, one entry per line.
with open('shared_copy/names.txt') as f:
    # Strip only the line terminator. Slicing off the last character
    # (line[:-1]) would truncate the final entry whenever the file does not
    # end with a newline.
    name_list = [line.rstrip('\n') for line in f]

In [15]:
name_set = set(name_list)
names_from_list = find_names(raw1_obj, lambda a: a.upper() in name_set)
print(len(names_from_list))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


1777


In [16]:
names_from_list[:20]

[[[Francisco], 'San Francisco AIDS Foundation Leaders', 3, 8, 'NP'],
 [[Francisco], 'The San Francisco AIDS Foundation', 18, 23, 'NP'],
 [[America], 'America', 45, 46, 'NP'],
 [[IB],
  'Electric Company Pacific Telesis Group Chevron ATT IB PACIFIC TELESIS Bank',
  52,
  63,
  'NP'],
 [[America], 'America', 64, 65, 'NP'],
 [[Francisco], 'pgNbr=2 San Francisco AIDS Foundation BOARD', 65, 71, 'NP'],
 [[William, Glenn, Maureen],
  'President William D. Glenn Vice President Maureen S.',
  73,
  81,
  'NP'],
 [[Barbara, Hudson, James, Harrison, Eunice, Evelyn, Elizabeth],
  'OLeary Secretary Barbara Cattolica-Hudson Treasurer James R. Harrison Eunice Azzani Evelyn Balancio Elizabeth C. Burrell',
  81,
  98,
  'NP'],
 [[Byron, Carlota, William],
  'Byron Carlota Del Portillo William D. Lang',
  99,
  106,
  'NP'],
 [[Curt, Henri, Norris],
  'M.D. Curt H. Mueller Henri E. Norris',
  107,
  114,
  'NP'],
 [[George, Mike, Victor, Jude, Andrew, Merritt, Smith],
  'George M. Protos Mike Rankin M.D

# Addresses

In [17]:
df['Local Identifier '][df['Local Identifier '].map(lambda a: 'mss95-04_001_071' in  a)]

598    ucsf_mss95-04_001_071
Name: Local Identifier , dtype: object

In [18]:
df['Local Identifier '][df['Local Identifier '].map(lambda a: 'mss95-04_001_047' in  a)]

631    ucsf_mss95-04_001_047
Name: Local Identifier , dtype: object

In [19]:
df['Local Identifier '][df['Local Identifier '].map(lambda a: 'glbths_200046_001_0010' in  a)]

3    glbths_200046_001_0010
Name: Local Identifier , dtype: object

In [20]:
print('nchars: {}'.format(len(df.loc[3,'Corrected Text'])))
print('npages: {}'.format(len(re.findall('pgNbr',df.loc[3, 'Ocr text']))))

nchars: 377092
npages: 232


In [21]:
raw = df.loc[3,'Ocr text']
start = raw.find('pgNbr=127')
end = raw.find('pgNbr=134')
print(start, end)
print(raw[start:end])
test1 = raw[start:end]

234386 240173
pgNbr=127 CLIENT COMMENTS FORM When your case is over, please fill out this form and mail to: Mr. Clint Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San Francisco, CA 94103 Your Name Your Address: Your Telephone Number: Message Telephone Number: Name of your volunteer attorney: Date you first saw the attorney: MONTH YEAR Description of your legal problem: What happened in your case? Were you satisfied with your volunteer attorney? Do you have any comments on our volunteer attorney program pgNbr=128 Dan Chesir Gary Wood (Co-Chair) 769-14th Street 350 California Street San Francisco, California 94114 Suite 2290 645-3112 (W) 441-8116(H) San Francisco, CA 94104 982-9211(W) Barry A. Graynor 360 Guerrero Street #202 San Francisco, California 94103 362'-2375 (W) 864-1780(H) Frederick Hertz 7045 Chabot Road Oakland, California 94618 957-1031(W) 428-2252(H) Clink Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San 

In [22]:
corr = df.loc[3,'Corrected Text']
start = corr.find('client comments form')
end = corr.find('attached please find unofficial minutes of a meeting')
print(start, end)
print(corr[start:end])

231198 235724
client comments form when your case is over, please fill out this form and mail to: mr. clint hock en berry administrator aids legal referral panel 1663 mission st., suite 400 san francisco, ca 94103 your name your address: your telephone number: message telephone number: name of your volunteer attorney: date you first saw the attorney: month year description of your legal problem: what happened in your case? were you satisfied with your volunteer attorney? do you have any comments on our volunteer attorney program dan che sir gary wood co hair 76914 th street 350 california street san francisco, california 94114 suite 22906453112 w 4418116 h san francisco, ca 941049829211 w barry a. gray nor 360 guerre ro street # 202 san francisco, california 941033622375 w 8641780 h frederick hertz 7045 chabot road oakland, california 94618 9571031 w 4282252 h c link hock en berry administrator aids legal referral panel 1663 mission st., suite 400 san francisco, california 941038648186

In [23]:
df.loc[3,:]

Unnamed: 0                                                                        3
Collection Title                  AIDS Legal Referral Panel Records, 2000-46, Bo...
Title                                                                Correspondence
Local Identifier                                             glbths_200046_001_0010
Type                                                                           text
Date                                                                      1985-1987
Date Type                                                                   created
Publication/Origination Info      Digital resource published by the Regents of t...
Creator 1 Name                                            AIDS Legal Referral Panel
Creator 1 NameType                                                         corpname
Creator 1 Source                                                                naf
Creator 2 Name                                                              

In [50]:
%%time
raw_obj = nlp(raw)
corr_obj = nlp(df.loc[3,'Corrected Text'])

CPU times: user 23.6 s, sys: 3.67 s, total: 27.3 s
Wall time: 27.3 s


In [51]:
%%time
# pickling entire Spacy NLP objects can take a long time (>2mins)
# this cell is only meant to be a checkpoint, subsequent cells do not need it to be run.
import pickle

with open("aids-legal-referral-panel-addresses_raw.pickle",'wb') as f:
    pickle.dump(raw_obj, f)
with open("aids-legal-referral-panel-addresses_corrected.pickle", 'wb') as f:
    pickle.dump(corr_obj, f)
    
# objects can be re-instantiated with pickle.load()

CPU times: user 55.2 s, sys: 1min 12s, total: 2min 7s
Wall time: 2min 55s


In [52]:
raw = df.loc[3,'Ocr text']
start = raw.find('pgNbr=127')
end = raw.find('pgNbr=134')
# print(start, end)
# print(raw[start:end])
test1 = raw[start:end]

# Regex that anchors on a 5-digit ZIP code and captures the text leading up to it.
#   streetAddress: an (optionally '#'-prefixed) number, or a spelled-out number
#                  one..nineteen, followed by 1-10 whitespace-separated tokens
#   state:         optional 'ca' / 'california' (used with re.IGNORECASE below)
#   zipCode:       five digits
# Raw string literals are used so the regex escapes (\d, \s, \w, \., \,) do not
# rely on Python passing invalid string escapes through, which is deprecated.
# NOTE: exactly one whitespace character is required on each side of the
# optional state, so an address without a state only matches when two
# whitespace characters precede the ZIP code.
anchor_on_zips = (
    r'(?P<streetAddress>(?:#?\d+|'
    r'(?:one|two|three|four|five|six|seven|eight|nine|'
    r'ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|'
    r'seventeen|eighteen|nineteen))(?:\s+#?[\w\d]+\.?\,?){1,10})'
    r'\s(?P<state>ca|california)?\s(?P<zipCode>\d{5})'
)

# anchor_on_city_name # todo

In [53]:
test_regex = anchor_on_zips
re.findall(test_regex, remove_bad_chars(test1), re.IGNORECASE)[:10]

[('1663 Mission St., Suite 400 San Francisco,', 'CA', '94103'),
 ('350 California Street San Francisco,', 'California', '94114'),
 ('360 Guerrero Street #202 San Francisco,', 'California', '94103'),
 ('7045 Chabot Road Oakland,', 'California', '94618'),
 ('1663 Mission St., Suite 400 San Francisco,', 'California', '94103'),
 ('1975 Diamond Blvd., Suite C210 Concord,', 'California', '94520'),
 ('1712 Vine Street Berkeley,', 'California', '94703'),
 ('2801 Turk Street, #305 San Francisco,', 'California', '94118'),
 ('400 California Street San Francisco,', 'California', '94104'),
 ('707 Haight Street San Francisco,', 'CA', '94117')]

In [54]:
print(test1[:1000])
remove_bad_chars(test1[:1000])

pgNbr=127 CLIENT COMMENTS FORM When your case is over, please fill out this form and mail to: Mr. Clint Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San Francisco, CA 94103 Your Name Your Address: Your Telephone Number: Message Telephone Number: Name of your volunteer attorney: Date you first saw the attorney: MONTH YEAR Description of your legal problem: What happened in your case? Were you satisfied with your volunteer attorney? Do you have any comments on our volunteer attorney program pgNbr=128 Dan Chesir Gary Wood (Co-Chair) 769-14th Street 350 California Street San Francisco, California 94114 Suite 2290 645-3112 (W) 441-8116(H) San Francisco, CA 94104 982-9211(W) Barry A. Graynor 360 Guerrero Street #202 San Francisco, California 94103 362'-2375 (W) 864-1780(H) Frederick Hertz 7045 Chabot Road Oakland, California 94618 957-1031(W) 428-2252(H) Clink Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San Francisco, Cal

'pgNbr=127 CLIENT COMMENTS FORM When your case is over, please fill out this form and mail to Mr. Clint Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San Francisco, CA 94103 Your Name Your Address Your Telephone Number Message Telephone Number Name of your volunteer attorney Date you first saw the attorney MONTH YEAR Description of your legal problem What happened in your case Were you satisfied with your volunteer attorney Do you have any comments on our volunteer attorney program pgNbr=128 Dan Chesir Gary Wood (Co-Chair) 769-14th Street 350 California Street San Francisco, California 94114 Suite 2290 645-3112 (W) 441-8116(H) San Francisco, CA 94104 982-9211(W) Barry A. Graynor 360 Guerrero Street #202 San Francisco, California 94103 362-2375 (W) 864-1780(H) Frederick Hertz 7045 Chabot Road Oakland, California 94618 957-1031(W) 428-2252(H) Clink Hockenberry Administrator AIDS Legal Referral Panel 1663 Mission St., Suite 400 San Francisco, Cal'

# Find all addresses in the document

In [55]:
addr = re.findall(test_regex, remove_bad_chars(raw), re.IGNORECASE)
print(len(addr))
addr[:10]

140


[('1986 1663 Mission Street Suite 400 San Francisco,', 'CA', '94103'),
 ('8186 Skaggs Foundation 1330 Broadway Oakland,', 'CA', '94512'),
 ('1663 Mission Street August 6, 1986 Suite 400 San Francisco,',
  'CA',
  '94103'),
 ('8186 Public Affairs Coordinator 890 Hayes Street San Francisco,',
  'California',
  '94117'),
 ('1986 1663 Mission Street Suite 400 Dear Attorney San Francisco,',
  'CA',
  '94103'),
 ('1663 Mission Street NEW ATTORNEY FORM Suite 400 San Francisco,',
  'CA',
  '94103'),
 ('400 CALENDAR OF ATTORNEY TRAINING SESSIONS San Francisco,', 'CA', '94103'),
 ('1663 Misswn Street Suite 400 REGISTRATION San Francisco,', 'CA', '94103'),
 ('220 Bush St., 21sf Floor, Mills Tower, Son Francisco,', 'CA', '94104'),
 ('400 The Management Committee of the San BALIFNEFIR AIDS Francisco,',
  'CA',
  '94103')]

# Find addresses in all documents

In [57]:
# Gather every address match found in the raw OCR text of each document.
addr = []
for ocr_text in tqdm(df['Ocr text']):
    cleaned_text = remove_bad_chars(ocr_text)
    addr += re.findall(test_regex, cleaned_text, re.IGNORECASE)


HBox(children=(IntProgress(value=0, max=735), HTML(value='')))




In [60]:
print(len(addr))
print(addr[:50])

3382
[('205 SACRAMENTO.', 'CALIFORNIA', '95814'), ('3543 18TH STREET SUITE 11 SAN FRANCISCO,', 'CA', '94110'), ('4996 SF,', 'CA', '94103'), ('3785 SF,', 'CA', '94143'), ('4060 Suite 3150 f SF,', 'CA', '94103'), ('3999 SF,', 'CA', '94117'), ('1135 SF,', 'CA', '94102'), ('2147 SF,', 'CA', '94110'), ('4964 SF,', 'CA', '94102'), ('5472 SF,', 'CA', '94104'), ('1180 SF,', 'CA', '94110'), ('1180 SF,', 'CA', '94110'), ('3543 18th Street, Suite 11 San Francisco,', 'CA', '94110'), ('3543 Eighteenth Street, Suite Eleven San Francisco', 'CA', '94110'), ('3543 18tb Street, Suite 11. San Francisco,', 'CA', '94110'), ('25 Van Ness Avenue, Room 130 San Francisco,', 'CA', '94102'), ('25 Van Ness Avenue, Room 130 San Francisco,', 'CA', '94102'), ('199 Moulton Street SF,', 'CA', '94123'), ('3543 18TH STREET, #11, SAN FRANCISCO,', 'CA', '94110'), ('120 584 Castro San Street, #321 Francisco,', 'CA', '94103'), ('474 Valencia Street, Suite 120 San Francisco,', 'CA', '94103'), ('13 San Francisco,', 'CA', '941

In [61]:
# Same extraction as for the raw OCR text, run on the corrected text column.
corrected_addrs = []
for corrected_text in tqdm(df['Corrected Text']):
    cleaned_text = remove_bad_chars(corrected_text)
    corrected_addrs += re.findall(test_regex, cleaned_text, re.IGNORECASE)
print(len(corrected_addrs))
print(corrected_addrs[:25])

HBox(children=(IntProgress(value=0, max=735), HTML(value='')))


5755
[('205 sacramento.', 'california', '95814'), ('354318 th street suite 11 san francisco,', 'ca', '94110'), ('101 h 8644996 ff,', 'ca', '94103'), ('5106586930 carol dawson 529 holly park circle h 2822670 ff,', 'ca', '94110'), ('1312 w 4763785 ff,', 'ca', '94143'), ('5979213 suite 600 email cynthia gomez quick mail. use. edu ff,', 'ca', '94105'), ('20513 th street w 8614060 suite 3150 f ff,', 'ca', '94103'), ('1062 megan shif let page street h 5523999 ff,', 'ca', '94117'), ('785 fulton st., apt a h 6261135 ff,', 'ca', '94102'), ('4878662 street survival pr j. f', '', '48786'), ('515 cor land ave. h 6216364 w 2062147 ff,', 'ca', '94110'), ('1748 market st. h 8644626 w 8614964 ff,', 'ca', '94102'), ('912 h 6215620 w 2915472 ff,', 'ca', '94104'), ('11 w 5751180 ff,', 'ca', '94110'), ('11 w 5751180 ff,', 'ca', '94110'), ('354318 th street, suite 11 san francisco,', 'ca', '94110'), ('3543 eighteenth street, suite eleven san francisco', 'ca', '94110'), ('354318 tm street, suite 11. san fr

In [62]:
# Save the matches from the corrected text, one match per line, with the
# captured fields of each match joined by '|'.
with open('addresses_from_corrected.txt', 'w') as out_file:
    out_file.writelines('|'.join(fields) + '\n' for fields in corrected_addrs)

In [63]:
# Save the matches from the raw OCR text, one match per line, with the
# captured fields of each match joined by '|'.
with open('addresses_from_raw.txt', 'w') as out_file:
    out_file.writelines('|'.join(fields) + '\n' for fields in addr)