# Clean NSF-OCE award data set
Created by Ivan Lima on Tue Mar  3 2020 20:37:14 -0500

In this notebook we:

- Clean award titles of typos and misspellings and make them more consistent.
- Clean award abstracts by removing award numbers, PI names, postscript code & other irrelevant information

**NOTE: After running this notebook, the resulting CSV file is edited by hand to remove remaining award numbers, titles and PI names, and postscript code from abstracts**

In [1]:
import pandas as pd
import numpy as np
import datetime, re
from tqdm import tnrange, notebook
pd.options.display.max_columns = 50
print('Last updated on {}'.format(datetime.datetime.now().ctime()))

Last updated on Mon Mar 16 14:17:14 2020


## Read NSF-OCE data

In [2]:
# Load the raw award export, keep the columns of interest, and index it by
# award number. Names defined here that the next cells read: awards, n_total.
datadir = '/home/ivan/Projects/NSF_OCE_topics/Gender/'
awards = pd.read_csv(datadir + 'data/nsf-oce_awards_1985-2020.csv', parse_dates=[4,5,11],
                     encoding="ISO-8859-1")

# rename some columns
mapper = {'Program(s)':'Programs',
          'PrincipalInvestigator':'PI',
          'AwardInstrument':'Instrument',
          'AwardedAmountToDate':'AwardedAmount',
          'Co-PIName(s)':'Co-PIs',
          'ARRAAmount':'ARRA_Amount'}
awards = awards.rename(mapper, axis='columns')

# select columns
awards = awards[['AwardNumber','Title','Programs','StartDate','EndDate','PI','Co-PIs',
                 'Organization','State','Instrument','AwardedAmount','ARRA_Amount','Abstract']]

# change type to string (one astype call returns a new frame, so nothing is
# assigned column-by-column into the column selection made above)
text_cols = ['Title','Programs','PI','Co-PIs','Organization','State','Instrument','Abstract']
awards = awards.astype({col: 'string' for col in text_cols})

# use only awards from 1987 to 2019
awards = awards.loc[awards.StartDate.dt.year > 1986]
awards = awards.loc[awards.StartDate.dt.year < 2020]
n_total = len(awards)  # record count before duplicates are dropped

# drop duplicate records
awards = awards.drop_duplicates('AwardNumber')

# use award number as index
awards = awards.set_index('AwardNumber')
awards.loc[9421772,'Abstract'] = awards.loc[9302890,'Abstract'] # renewal award

# set missing PI names; na=False makes rows with a missing PI count as
# "no match" directly instead of filling the boolean mask with an integer
awards.loc[awards.PI.str.contains('DATA NOT AVAILABLE', na=False),'PI'] = np.nan

awards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15207 entries, 8814229 to 1535728
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Title          15207 non-null  string        
 1   Programs       15177 non-null  string        
 2   StartDate      15207 non-null  datetime64[ns]
 3   EndDate        15207 non-null  datetime64[ns]
 4   PI             15173 non-null  string        
 5   Co-PIs         5067 non-null   string        
 6   Organization   15207 non-null  string        
 7   State          15027 non-null  string        
 8   Instrument     15207 non-null  string        
 9   AwardedAmount  15207 non-null  object        
 10  ARRA_Amount    15207 non-null  object        
 11  Abstract       14709 non-null  string        
dtypes: datetime64[ns](2), object(2), string(8)
memory usage: 2.1+ MB


## Number of awards with valid abstracts per year

In [3]:
# Per start year: number of awards, number with an abstract, and their ratio.
start_year = awards.StartDate.dt.year
with_abstract = awards.loc[awards.Abstract.notnull(), 'StartDate']
df = pd.DataFrame({
    'total': awards.StartDate.groupby(start_year).count(),
    'valid': with_abstract.groupby(start_year).count(),
})
df['valid/total'] = df['valid'] / df['total']
df

Unnamed: 0_level_0,total,valid,valid/total
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1987,462,266,0.575758
1988,352,341,0.96875
1989,368,355,0.964674
1990,457,446,0.97593
1991,572,562,0.982517
1992,460,430,0.934783
1993,444,426,0.959459
1994,476,462,0.970588
1995,376,356,0.946809
1996,422,408,0.966825


In [4]:
# n_total was taken before duplicates were dropped; awards is now de-duplicated.
n_unique = len(awards)
n_valid = len(awards.loc[awards.Abstract.notnull()])

for template, value in [
        ('Total number of records: {}', n_total),
        ('Number of unique records: {}', n_unique),
        ('Number of duplicate records: {}', n_total - n_unique),
        ('Number of records with valid abstracts: {}', n_valid),
        ('Number of records with missing abstracts: {}', n_unique - n_valid)]:
    print(template.format(value))

Total number of records: 15421
Number of unique records: 15207
Number of duplicate records: 214
Number of records with valid abstracts: 14709
Number of records with missing abstracts: 498


## Clean award titles

Remove typos, misspellings, inconsistencies and other errors from award Titles.

In [5]:
def _tidy_title(raw):
    """Return `raw` lower-cased with 'U.S.' and 'n-D' compacted, the listed
    punctuation turned into spaces, and runs of whitespace collapsed."""
    text = raw.replace('U.S.','US ')
    for dim in ['1-D','2-D','3-D','4-D']:
        # done before '-' is blanked out below, so '3-D' becomes '3D' rather than '3 D'
        text = text.replace(dim, dim.replace('-',''))
    for mark in [':', '-', ';', ',', '.', '(', ')', '[',']','"']:
        text = text.replace(mark,' ')
    return ' '.join(text.split()).lower()

awards['Title'] = [_tidy_title(s) for s in awards.Title]

# Make titles that start with "collaborative <word>" start with
# "collaborative research" instead. Only the first occurrence of <word> is
# rewritten (count=1): a plain str.replace would also rewrite later uses of the
# word and words containing it (e.g. "project" inside "projections").
for word in ['proposal','project','investigation','researchl','reserach']:
    lead = 'collaborative ' + word
    awards.loc[awards.Title.str.startswith(lead),'Title'] = [
        s.replace(word,'research',1) for s in awards.Title if s.startswith(lead)]

# Drop a programme tag that precedes "collaborative" at the start of a title.
# Again only the leading occurrence is removed.
for tag in ['rui ','crui ','biocomplexity ']:
    lead = tag + 'collaborative'
    awards.loc[awards.Title.str.startswith(lead),'Title'] = [
        s.replace(tag,'',1) for s in awards.Title if s.startswith(lead)]

def replace(df, old_str, new_str, row, col='Title'):
    """Replace every occurrence of `old_str` with `new_str` in one cell of `df`.

    The cell is addressed by index label `row` and column `col`; `df` is
    modified in place and nothing is returned. The substitution is a plain
    substring replacement (str.replace), not a whole-word or regex match.
    """
    # operate on the frame that was passed in rather than on a global
    df.loc[row,col] = df.loc[row,col].replace(old_str,new_str)
    
# One-off title corrections, one award number per call. Each call swaps every
# occurrence of the first string for the second in that award's (already
# lower-cased) title. Some entries fix typos; others reword a title,
# presumably so it matches sibling awards -- confirm against the raw data.
replace(awards,'eddy','ring',324688)
replace(awards,'h20','h2o',1821)
replace(awards,'collaorative','collaborative',1821)
replace(awards,'collaorative','collaborative',2527)
# NOTE(review): plain substring match -- would also turn an existing 'h2o' into 'h2oo'
replace(awards,'h2','h2o',1825)
replace(awards,'survey','study',99161)
replace(awards,'dorvilleidea','dorvilleidae',425317)
replace(awards,'climate change','expanding oxygen minimum zones',1459243)
replace(awards,'fom','from',9911418)
replace(awards,'reearch','research',9813645)
replace(awards,'resarch','research',452780)
replace(awards,'reseach','research',1270)
replace(awards,'reseach','research',9907305)
replace(awards,'research/rui','research',314523)
replace(awards,'u s globec','us globec',9813645)
replace(awards,'aghulas','agulhas',95572)
replace(awards,'turbulence','turbulent',1756264)
replace(awards,'measurement','measurements',9523541)
replace(awards,'form','from',9731304)
replace(awards,'hightemperature','high temperature',526072)
replace(awards,'constantflux','constant flux',9711870)
replace(awards,'accfetionaly','accretionary',9731157)
replace(awards,'formuation','formation',9731463)
replace(awards,'sselected','selected',752063)
replace(awards,'meoscale','mesoscale',734)

# Corrections applied to every title. str.replace leaves a title untouched
# when the misspelling is absent, so a whole-column pass is sufficient.
for typo in ['collborative','collaorative','collabortive','collaboratice','colloborative']:
    awards['Title'] = [t.replace(typo,'collaborative') for t in awards.Title]

for variant in ['u s globec','usglobec']:
    awards['Title'] = [t.replace(variant,'us globec') for t in awards.Title]

# "u s <program>" -> "us <program>"
for program in ['geotraces','jgofs','arctic']:
    awards['Title'] = [t.replace('u s '+program,'us '+program) for t in awards.Title]

# Hand-curated title harmonisation: the award on the left receives the exact
# title of the award on the right. Statements run top to bottom, so a title
# copied here reflects whatever corrections were applied to the source award
# before this point.
awards.loc[327872,'Title'] = awards.loc[327885,'Title']
awards.loc[9310963,'Title'] = awards.loc[9304219,'Title']
awards.loc[9730971,'Title'] = awards.loc[9730813,'Title']
awards.loc[220504,'Title'] = awards.loc[220745,'Title']

# expand the abbreviated "collaborative res" prefix
for n in [1131016,1130217]:
    replace(awards,'collaborative res','collaborative research',n)

# each loop below gives several awards the title of one reference award
for n in [527075,527053,526120,525894]:
    awards.loc[n,'Title'] = awards.loc[525872,'Title']

for n in [9907884,9908072,9910898,9910896,9912361,624544]:
    awards.loc[n,'Title'] = awards.loc[9910897,'Title']
    
for n in [9806465,9806594]:
    awards.loc[n,'Title'] = awards.loc[9813645,'Title']

for n in [238745,238957,238653,238849]:
    awards.loc[n,'Title'] = awards.loc[240187,'Title']

for n in [95972]:
    awards.loc[n,'Title'] = awards.loc[95913,'Title']

for n in [430463,426428]:
    awards.loc[n,'Title'] = awards.loc[426408,'Title']

for n in [214365,318081]:
    awards.loc[n,'Title'] = awards.loc[318371,'Title']

for n in [138933,138547]:
    awards.loc[n,'Title'] = awards.loc[138544,'Title']

for n in [9811530,9811535,9811536]:
    awards.loc[n,'Title'] = awards.loc[9811034,'Title']

for n in [327644,327693]:
    awards.loc[n,'Title'] = awards.loc[327721,'Title']

for n in [9911286,296087]:
    awards.loc[n,'Title'] = awards.loc[1270,'Title']

for n in [2527,2628]:
    awards.loc[n,'Title'] = awards.loc[2501,'Title']

for n in [814397,814652,814494]:
    awards.loc[n,'Title'] = awards.loc[814584,'Title']

for n in [336834,611530,611521,611486,611572,611579]:
    awards.loc[n,'Title'] = awards.loc[336768,'Title']

awards.loc[1435862,'Title'] = awards.loc[1436019,'Title']
awards.loc[1434998,'Title'] = awards.loc[1434805,'Title']
awards.loc[1435871,'Title'] = awards.loc[1437266,'Title']
awards.loc[1438047,'Title'] = awards.loc[1437266,'Title']
awards.loc[1132480,'Title'] = awards.loc[928191,'Title']
awards.loc[9819615,'Title'] = awards.loc[9900943,'Title']
awards.loc[327273,'Title'] = awards.loc[328119,'Title']
awards.loc[351345,'Title'] = awards.loc[351778,'Title']
awards.loc[1551640,'Title'] = awards.loc[1551657,'Title']
awards.loc[453105,'Title'] = awards.loc[452262,'Title']
awards.loc[9815186,'Title'] = awards.loc[9811471,'Title']
awards.loc[9634593,'Title'] = awards.loc[9529953,'Title']
awards.loc[452664,'Title'] = awards.loc[452727,'Title']
awards.loc[2375,'Title'] = awards.loc[2885,'Title']
awards.loc[99109,'Title'] = awards.loc[99078,'Title']
awards.loc[327209,'Title'] = awards.loc[326734,'Title']
awards.loc[122213,'Title'] = awards.loc[79383,'Title']
awards.loc[1558605,'Title'] = awards.loc[1559080,'Title']
awards.loc[1038803,'Title'] = awards.loc[1038809,'Title']
awards.loc[452800,'Title'] = awards.loc[453117,'Title']
awards.loc[95048,'Title'] = awards.loc[95064,'Title']
awards.loc[424786,'Title'] = awards.loc[424967,'Title']
awards.loc[425363,'Title'] = awards.loc[425583,'Title']
awards.loc[528702,'Title'] = awards.loc[707385,'Title']
awards.loc[1657801,'Title'] = awards.loc[1658135,'Title']
awards.loc[337893,'Title'] = awards.loc[337888,'Title']
awards.loc[1131772,'Title'] = awards.loc[1131455,'Title']
awards.loc[623358,'Title'] = awards.loc[623633,'Title']
awards.loc[425404,'Title'] = awards.loc[425197,'Title']
awards.loc[1354167,'Title'] = awards.loc[1355436,'Title']
awards.loc[1639588,'Title'] = awards.loc[1639614,'Title']
awards.loc[9810223,'Title'] = awards.loc[9808173,'Title']
awards.loc[1332915,'Title'] = awards.loc[1334052,'Title']
awards.loc[9115923,'Title'] = awards.loc[9116028,'Title']
awards.loc[1061189,'Title'] = awards.loc[1059924,'Title']
# keep only the first nine words of these two titles
awards.loc[136098,'Title'] = ' '.join(awards.loc[136098,'Title'].split()[:9])
awards.loc[136215,'Title'] = ' '.join(awards.loc[136215,'Title'].split()[:9])
awards.loc[9714898,'Title'] = awards.loc[9711870,'Title']
awards.loc[9811380,'Title'] = awards.loc[9818479,'Title']
awards.loc[1633936,'Title'] = awards.loc[1634044,'Title']
awards.loc[2385,'Title'] = awards.loc[2223,'Title']
awards.loc[99154,'Title'] = awards.loc[96360,'Title']
awards.loc[1459513,'Title'] = awards.loc[1458936,'Title']
awards.loc[1132752,'Title'] = awards.loc[927558,'Title']
awards.loc[104622,'Title'] = awards.loc[106139,'Title']
awards.loc[898,'Title'] = awards.loc[734,'Title']
awards.loc[352256,'Title'] = awards.loc[352229,'Title']
awards.loc[137110,'Title'] = awards.loc[137365,'Title']
awards.loc[317934,'Title'] = awards.loc[316179,'Title']
awards.loc[9730790,'Title'] = awards.loc[9730569,'Title']
# awards.loc[1924140,'Title'] = awards.loc[1923892,'Title']
awards.loc[9317733,'Title'] = awards.loc[9314470,'Title']
awards.loc[9911339,'Title'] = awards.loc[351,'Title']

for n in [9820105,9905466]:
    awards.loc[n,'Title'] = awards.loc[9901563,'Title']
    
awards.loc[136298,'Title'] = awards.loc[136977,'Title']
awards.loc[115177,'Title'] = awards.loc[116236,'Title']
awards.loc[241681,'Title'] = awards.loc[241052,'Title']

for n in [9633814,9634132]:
    awards.loc[n,'Title'] = awards.loc[9633264,'Title']
    
for n in [9303524,9411771]:
    awards.loc[n,'Title'] = awards.loc[9300275,'Title']

for n in [1824,1826]:
    awards.loc[n,'Title'] = awards.loc[1902,'Title']

for n in [9402324,9402622]:
    awards.loc[n,'Title'] = awards.loc[9406937,'Title']

for n in [2903,2598]:
    awards.loc[n,'Title'] = awards.loc[2758,'Title']

for n in [215399,215416]:
    awards.loc[n,'Title'] = awards.loc[215433,'Title']

# Correct the reference title of 9806712 first, then copy it to its siblings.
replace(awards,'cods','cod',9806712)
# NOTE(review): substring match -- 'for' inside a longer word would also be rewritten
replace(awards,'for','from',9806712)

for n in [9813649,218299]:
    awards.loc[n,'Title'] = awards.loc[9806712,'Title']
    # awards.loc[n,'Abstract'] = awards.loc[9806712,'Abstract']

for n in [9505249,9505528,9503817]:
    awards.loc[n,'Title'] = awards.loc[9503931,'Title']

for n in [9732389,9729765]:
    awards.loc[n,'Title'] = awards.loc[9730322,'Title']

# More one-off substitutions in individual titles (substring, all occurrences).
# Several change singular/plural or word form rather than fix a typo,
# presumably to match related awards -- confirm against the raw data.
replace(awards,'interpretationof','interpretation of',9317611)
replace(awards,'upturing','rupturing',305731)
replace(awards,'lithosph','lithosp',305731)
replace(awards,'interdiscipinary','interdisciplinary',83134)
replace(awards,'geomicrobiology','geomicrobiological',433729)
replace(awards,'prosseses','processes',433869)
replace(awards,'phsysiological','physiological',350359)
replace(awards,'geostropic','geostrophic',351905)
replace(awards,'langrangian','lagrangian',238564)
replace(awards,'verus','versus',242321)
replace(awards,'investigations','investigation',1456710)
replace(awards,'investigations','investigation',1751099)
replace(awards,'deliver','delivery',1456710)
replace(awards,'deliver','delivery',1751099)
replace(awards,'gas','gases',9302313)
replace(awards,'margins','margin',505075)
replace(awards,'communites','communities',528017)
replace(awards,'nitrogen','n2',9981252)
replace(awards,'measurements','experiments',2674)
replace(awards,'millenial','millennial',350497)
replace(awards,'coral','corals',551481)
replace(awards,'paleoventiltion','paleoventilation',425266)
replace(awards,'minerals','mineral',9711735)
replace(awards,'ntirate','nitrate',1635099)
replace(awards,'sub polar','subpolar',9531879)
replace(awards,'/fluorescence','/ fluorescence',220379)
replace(awards,'responses','response',1103508)
replace(awards,'chemoautotrohic','chemoautotrophic',95904)
replace(awards,'b p','bp',9807745)
replace(awards,'ne excretion','n excretion',95404)
replace(awards,'roles','role',9401990)
replace(awards,'convection','convections',137347)
replace(awards,'speices','species',3035)
replace(awards,'hawaii','hawaiian',1737243)
replace(awards,'responses','response',96205)
replace(awards,'responses','response',9901146)
# NOTE(review): appends an 's' each time the cell is re-run on the same frame
replace(awards,'transformation','transformations',118410)
replace(awards,'transports','transport',117346)
replace(awards,'ridge','range',95262)
replace(awards,'emiliana','emiliania',9102767)
replace(awards,'organiz','organic',962362)
replace(awards,'coal','coral',9907869)
replace(awards,'svedrup','sverdrup',2886)
replace(awards,'sverdup','sverdrup',2473)
replace(awards,'suducting','subducting',2031)
replace(awards,'suducting','subducting',1892)
replace(awards,'investigationof','investigation of',9714368)
replace(awards,'processess','processes',850714)
replace(awards,'collaborative','collaborative research',2816)
replace(awards,'determing','determining',9911962)
replace(awards,'system','systems',9730923)
replace(awards,'anaysis','analysis',351307)
replace(awards,'exhange','exchange',961810)
replace(awards,'exhange','exchange',962391)
replace(awards,'microorganosms','microorganisms',9981437)

# strip stray '>' characters from this one abstract
awards.loc[8916800,'Abstract'] = awards.loc[8916800,'Abstract'].replace('>','')

# Further title harmonisation: left-hand award takes the right-hand award's
# title. A few lines copy an Abstract instead of a Title.
awards.loc[9201567,'Title'] = awards.loc[9201332,'Title']
awards.loc[350647,'Title'] = awards.loc[350543,'Title']
awards.loc[9225051,'Title'] = awards.loc[9218511,'Title']
awards.loc[1658132,'Title'] = awards.loc[1658218,'Title']
awards.loc[350409,'Title'] = awards.loc[350359,'Title']
awards.loc[9402690,'Title'] = awards.loc[9409795,'Title']
awards.loc[9018337,'Title'] = awards.loc[9314206,'Title']
awards.loc[9310590,'Title'] = awards.loc[9310591,'Title']
awards.loc[9312695,'Title'] = awards.loc[9500601,'Title']
awards.loc[221008,'Title'] = awards.loc[220680,'Title']
awards.loc[95936,'Title'] = awards.loc[99316,'Title']
awards.loc[9906657,'Title'] = awards.loc[9910446,'Title']
awards.loc[9302496,'Title'] = awards.loc[9302614,'Title']
awards.loc[9314512,'Title'] = awards.loc[9315085,'Title']
awards.loc[96612,'Title'] = awards.loc[96668,'Title']
# abstract (not title) copied from 96612
awards.loc[222285,'Abstract'] = awards.loc[96612,'Abstract']
awards.loc[943387,'Title'] = awards.loc[943430,'Title']
awards.loc[425830,'Title'] = awards.loc[425728,'Title']
awards.loc[9812237,'Title'] = awards.loc[9811575,'Title']
awards.loc[1658318,'Title'] = awards.loc[1657944,'Title']
awards.loc[2715,'Title'] = awards.loc[1876,'Title']
awards.loc[1356822,'Title'] = awards.loc[1357150,'Title']
awards.loc[9403668,'Title'] = awards.loc[9403409,'Title']
awards.loc[9618473,'Title'] = awards.loc[9618203,'Title']
awards.loc[1029290,'Title'] = awards.loc[1031140,'Title']
awards.loc[9712549,'Title'] = awards.loc[9712164,'Title']
awards.loc[352984,'Title'] = awards.loc[352754,'Title']
awards.loc[118036,'Title'] = awards.loc[117796,'Title']
awards.loc[1903650,'Title'] = awards.loc[1903148,'Title']
awards.loc[9730695,'Title'] = awards.loc[9813542,'Title']
awards.loc[1436003,'Title'] = awards.loc[1435515,'Title']
awards.loc[9810669,'Title'] = awards.loc[9815130,'Title']
awards.loc[330808,'Title'] = awards.loc[329308,'Title']
awards.loc[9115357,'Title'] = awards.loc[9101154,'Title']
awards.loc[220213,'Title'] = awards.loc[220379,'Title']
awards.loc[9905136,'Title'] = awards.loc[9819151,'Title']
awards.loc[628372,'Title'] = awards.loc[628491,'Title']
awards.loc[1445723,'Title'] = awards.loc[1445719,'Title']
awards.loc[96909,'Title'] = awards.loc[96814,'Title']
awards.loc[434023,'Title'] = awards.loc[434019,'Title']
awards.loc[9819506,'Title'] = awards.loc[9818464,'Title']
awards.loc[526365,'Title'] = awards.loc[526188,'Title']
awards.loc[242160,'Title'] = awards.loc[241363,'Title']
awards.loc[1459827,'Title'] = awards.loc[1459706,'Title']
awards.loc[1832286,'Title'] = awards.loc[1831415,'Title']
awards.loc[1634467,'Title'] = awards.loc[1634432,'Title']
awards.loc[851205,'Title'] = awards.loc[851128,'Title']
awards.loc[9416583,'Title'] = awards.loc[9417414,'Title']
awards.loc[327327,'Title'] = awards.loc[327382,'Title']
awards.loc[221404,'Title'] = awards.loc[221250,'Title']
awards.loc[9910608,'Title'] = awards.loc[9906223,'Title']
awards.loc[741481,'Title'] = awards.loc[741554,'Title']
awards.loc[116940,'Title'] = awards.loc[117149,'Title']
awards.loc[2186,'Title'] = awards.loc[9986306,'Title']
awards.loc[9102767,'Title'] = awards.loc[8916800,'Title']
awards.loc[9633415,'Title'] = awards.loc[9633688,'Title']
awards.loc[2729,'Title'] = awards.loc[2464,'Title']
awards.loc[1600237,'Title'] = awards.loc[1600131,'Title']
awards.loc[9300508,'Title'] = awards.loc[9307295,'Title']
awards.loc[1737207,'Title'] = awards.loc[1737170,'Title']
awards.loc[1755574,'Title'] = awards.loc[1756613,'Title']
awards.loc[1946072,'Title'] = awards.loc[1756613,'Title']
awards.loc[95219,'Title'] = awards.loc[95171,'Title']
awards.loc[309064,'Title'] = awards.loc[95171,'Title']
awards.loc[95254,'Title'] = awards.loc[95171,'Title']
awards.loc[1060855,'Title'] = awards.loc[1061074,'Title']
awards.loc[326175,'Title'] = awards.loc[326268,'Title']
awards.loc[9311711,'Title'] = awards.loc[9301793,'Title']
awards.loc[9712174,'Title'] = awards.loc[9712135,'Title']
awards.loc[452528,'Title'] = awards.loc[451983,'Title']
awards.loc[9400751,'Title'] = awards.loc[9314517,'Title']
awards.loc[1334641,'Title'] = awards.loc[1334325,'Title']
awards.loc[118033,'Title'] = awards.loc[118707,'Title']
awards.loc[9618442,'Title'] = awards.loc[9725157,'Title']
awards.loc[222537,'Title'] = awards.loc[222752,'Title']
awards.loc[526800,'Title'] = awards.loc[526704,'Title']
awards.loc[9420649,'Title'] = awards.loc[9419323,'Title']
awards.loc[9501580,'Title'] = awards.loc[9503256,'Title']
# NOTE(review): exact repeat of the previous statement (harmless, redundant)
awards.loc[9501580,'Title'] = awards.loc[9503256,'Title']
awards.loc[9731408,'Title'] = awards.loc[9800047,'Title']
awards.loc[9812453,'Title'] = awards.loc[9902048,'Title']
awards.loc[9812463,'Title'] = awards.loc[9902048,'Title']
awards.loc[9904396,'Title'] = awards.loc[9902048,'Title']
awards.loc[242091,'Title'] = awards.loc[242034,'Title']
awards.loc[137453,'Title'] = awards.loc[136855,'Title']
awards.loc[350834,'Title'] = awards.loc[350970,'Title']
awards.loc[351398,'Title'] = awards.loc[350970,'Title']
awards.loc[9314360,'Title'] = awards.loc[9411377,'Title']
awards.loc[9806566,'Title'] = awards.loc[9806506,'Title']
awards.loc[351498,'Title'] = awards.loc[351169,'Title']
awards.loc[1061881,'Title'] = awards.loc[1061863,'Title']
awards.loc[1061881,'Abstract'] = awards.loc[1061863,'Abstract']
# Prefix the title with "collaborative research" and drop a trailing
# " cooperative program". str.rstrip() treats its argument as a set of
# characters rather than a suffix, so it would also strip trailing letters of
# the preceding word; an anchored regex removes only the exact suffix.
awards.loc[9802264,'Title'] = ('collaborative research ' +
                               re.sub(r' cooperative program$', '', awards.loc[9802264,'Title']))
# these two take the title of 9802264 as rebuilt just above
awards.loc[9802295,'Title'] = awards.loc[9802264,'Title']
awards.loc[9730637,'Title'] = awards.loc[9802264,'Title']
awards.loc[9819612,'Title'] = awards.loc[9907254,'Title']
awards.loc[9416711,'Title'] = awards.loc[9416630,'Title']
awards.loc[1325518,'Title'] = awards.loc[1325489,'Title']
awards.loc[424921,'Title'] = awards.loc[424744,'Title']
awards.loc[2634,'Title'] = awards.loc[2816,'Title']
# add the "collaborative research" prefix to 9814211, then share that title
awards.loc[9814211,'Title'] = 'collaborative research ' + awards.loc[9814211,'Title']
awards.loc[9810724,'Title'] = awards.loc[9814211,'Title']
awards.loc[9809459,'Title'] = awards.loc[9814211,'Title']
awards.loc[215497,'Title'] = awards.loc[215506,'Title']

for n in [296103,9911877,9912007]:
    awards.loc[n,'Title'] = awards.loc[9911962,'Title']
    
# abstract (not title) copied from 9911962
awards.loc[296103,'Abstract'] = awards.loc[9911962,'Abstract']
awards.loc[9314483,'Title'] = awards.loc[9314647,'Title']
awards.loc[9906990,'Title'] = awards.loc[9907205,'Title']
awards.loc[136644,'Title'] = awards.loc[136768,'Title']
awards.loc[9521098,'Title'] = awards.loc[9527777,'Title']
awards.loc[2488,'Title'] = awards.loc[2551,'Title']

for n in [9818632,9818886,478]:
    awards.loc[n,'Title'] = awards.loc[2189,'Title']

awards.loc[453019,'Title'] = awards.loc[453029,'Title']
awards.loc[1129119,'Title'] = awards.loc[1129270,'Title']
awards.loc[551438,'Title'] = awards.loc[551436,'Title']
awards.loc[9908091,'Title'] = awards.loc[9906926,'Title']
awards.loc[1803933,'Title'] = awards.loc[1803803,'Title']
# abstract (not title) copied from 961810
awards.loc[961689,'Abstract'] = awards.loc[961810,'Abstract']
awards.loc[536345,'Title'] = awards.loc[536326,'Title']
awards.loc[1735436,'Title'] = awards.loc[1735846,'Title']

# Another batch of one-off substitutions in individual titles
# (substring match, every occurrence within that title).
replace(awards,'centers','center',215225)
replace(awards,'cycle','cycling',526522)
replace(awards,'endeavor','endeavour',1038126)
replace(awards,'acitivity','activity',1043549)
replace(awards,'acitivity','activity',1043403)
replace(awards,'climatic','climate',318296)
replace(awards,'quantitiative','quantitative',305607)
replace(awards,'papa','papua',305607)
replace(awards,'distribution','distributions',9815179)
replace(awards,'examing','examining',1313867)
replace(awards,'refinement&application','refinement and application',926986)
replace(awards,'accretionaly','accretionary',9811975)
replace(awards,'&','and',1436748)
replace(awards,'geochemistry','geochemical',9310364)
replace(awards,'seimogenic','seismogenic',1944)
# NOTE(review): looks like cleanup for extra 's' characters accumulated by the
# earlier 'transformation' -> 'transformations' call on this award when the
# cell is executed repeatedly -- confirm
replace(awards,'transformationssss','transformations',118410)
replace(awards,'on zooplankton','of zooplankton',3273)
replace(awards,'modelling','modeling',9711431)
replace(awards,'proposal','research',1045079)
replace(awards,'proposal','research',1801945)
replace(awards,'hot spot','hotspot',2312)

# Final batch of harmonisation: each target award receives the Title (or,
# where stated, the Abstract) of a reference award. Order matters where a
# reference was itself rewritten earlier in this cell.
for n in [9907854,9907919]:
    awards.loc[n,'Title'] = awards.loc[9907953,'Title']
    
for n in [117342,336716,117313]:
    awards.loc[n,'Title'] = awards.loc[117582,'Title']
    
awards.loc[1829992,'Title'] = awards.loc[1829921,'Title']
awards.loc[221889,'Title'] = awards.loc[118111,'Title']
awards.loc[99283,'Title'] = awards.loc[95389,'Title']
awards.loc[9302175,'Title'] = awards.loc[9302058,'Title']
awards.loc[1638164,'Title'] = awards.loc[1638168,'Title']
awards.loc[428483,'Title'] = awards.loc[427974,'Title']

for n in [97308,335589]:
    awards.loc[n,'Title'] = awards.loc[97327,'Title']
    
awards.loc[9402469,'Title'] = awards.loc[9402984,'Title']
awards.loc[520833,'Title'] = awards.loc[519602,'Title']
awards.loc[83082,'Title'] = awards.loc[83120,'Title']

for n in [452904,453023]:
    awards.loc[n,'Title'] = awards.loc[451419,'Title']
    
for n in [926485,1211106]:
    awards.loc[n,'Title'] = awards.loc[926986,'Title']
    
# abstracts copied from a reference award
for n in [196427,196141]:
    awards.loc[n,'Abstract'] = awards.loc[9981371,'Abstract']
    
for n in [9196174,9196143]:
    awards.loc[n,'Abstract'] = awards.loc[8911939,'Abstract']
    
for n in [326419,324666]:
    awards.loc[n,'Title'] = awards.loc[326616,'Title']
    
for n in [9905469,9910350]:
    awards.loc[n,'Title'] = awards.loc[9910609,'Title']
    
for n in [9907458,9818947]:
    awards.loc[n,'Title'] = awards.loc[2128,'Title']
    
# title taken from 9714302, abstract taken from 9729784
for n in [9996174,9729784]:
    awards.loc[n,'Title'] = awards.loc[9714302,'Title']
for n in [9714302,9996174]:
    awards.loc[n,'Abstract'] = awards.loc[9729784,'Abstract']
    
awards.loc[9317737,'Title'] = awards.loc[9315554,'Title']
awards.loc[9415991,'Title'] = awards.loc[9416595,'Title']
awards.loc[9912333,'Title'] = awards.loc[9981218,'Title']
awards.loc[9301554,'Title'] = awards.loc[9302907,'Title']
awards.loc[1155205,'Title'] = awards.loc[1155676,'Title']
awards.loc[9811925,'Title'] = awards.loc[9814172,'Title']
awards.loc[1635950,'Title'] = awards.loc[1661683,'Title']
awards.loc[525385,'Title'] = awards.loc[526644,'Title']
awards.loc[2543,'Title'] = awards.loc[2529,'Title']
awards.loc[1658214,'Title'] = awards.loc[1658491,'Title']
awards.loc[81770,'Title'] = awards.loc[81175,'Title']
awards.loc[196344,'Abstract'] = awards.loc[81175,'Abstract']
awards.loc[119019,'Title'] = awards.loc[224767,'Title']
awards.loc[118101,'Title'] = awards.loc[118071,'Title']
awards.loc[9503557,'Title'] = awards.loc[9503670,'Title']
awards.loc[1103508,'Title'] = awards.loc[1103519,'Title']
awards.loc[82300,'Title'] = awards.loc[81826,'Title']
awards.loc[9901351,'Title'] = awards.loc[9871982,'Title']
awards.loc[9618661,'Title'] = awards.loc[9618356,'Title']
awards.loc[1138944,'Title'] = awards.loc[1139036,'Title']
awards.loc[600624,'Title'] = awards.loc[601098,'Title']
awards.loc[9523159,'Title'] = awards.loc[9531847,'Title']
awards.loc[9811054,'Title'] = awards.loc[9811163,'Title']
awards.loc[8996266,'Abstract'] = awards.loc[8810932,'Abstract']

# abstract flows 9819779 -> 9819260; title flows 9819260 -> the other two
awards.loc[9819260,'Abstract'] = awards.loc[9819779,'Abstract']
for n in [9819779,244975]:
    awards.loc[n,'Title'] = awards.loc[9819260,'Title']
    
awards.loc[2600,'Title'] = awards.loc[2488,'Title']
awards.loc[238564,'Title'] = awards.loc[238957,'Title']
awards.loc[2459,'Title'] = awards.loc[2758,'Title']
awards.loc[9303669,'Title'] = awards.loc[9302162,'Title']
awards.loc[9596148,'Title'] = awards.loc[9302162,'Title']

## Estimate number of Collaborative Research awards

In [6]:
# Awards whose cleaned title mentions "collaborative", counted per distinct title.
is_collab = awards.Title.str.contains('collaborative')
colres = awards[is_collab]
grouped = colres.reset_index().groupby('Title')[['AwardNumber']].count()
print('{} records grouped into {} Collaborative Research awards'.format(len(colres),len(grouped)))

4724 records grouped into 1882 Collaborative Research awards


## Compute number of Collaborative Research titles that appear on only a single award

In [7]:
# Titles that occur on exactly one award record.
is_single = grouped['AwardNumber'] == 1
ones = grouped.loc[is_single]
share = len(ones)/len(grouped)*100
print('{} / {} = {:.1f}%'.format(len(ones),len(grouped),share))
ones.to_csv('colres_one.csv') # save to CSV file

67 / 1882 = 3.6%


## Clean award Abstracts

Remove award numbers, PI names, and other extraneous words and terms from award abstracts.

In [8]:
# Remove a trailing run of "<br/>" tags and "*" characters from each abstract.
# str.rstrip() takes a *set of characters*, so rstrip('<br/>***<br/>') would
# also strip trailing letters such as 'b' or 'r' from real text; the anchored
# regex removes only whole "<br/>" tokens and asterisks. Missing abstracts stay
# missing.
awards['Abstract'] = awards.Abstract.str.replace(r'(?:<br/>|\*)+$', '', regex=True)

arra_str = (
    '"This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5)."')

def _strip_boilerplate(text):
    """Blank out header words and the ARRA funding sentence in `text`, then
    collapse repeated whitespace to single spaces."""
    for word in ['ABSTRACT','PROPOSAL','Abstract',arra_str]:
        text = text.replace(word,' ')
    # remove repeating empty spaces
    return ' '.join(text.split())

# only rows that actually have an abstract are rewritten
has_text = awards.Abstract.notnull()
awards.loc[has_text,'Abstract'] = [_strip_boilerplate(s) for s in awards.loc[has_text,'Abstract']]

# Hand-tuned clean-ups for individual abstracts. The regex literals are raw
# strings so that '\s' and '\.' reach the regex engine without depending on
# Python's handling of unrecognised string escapes; the pattern text itself
# is unchanged.
# NOTE(review): '.+' is greedy, so the '^.+:' and '^.+<br/>' patterns delete
# everything up to the LAST ':' / '<br/>' in the abstract -- presumably
# verified by hand for these awards; confirm.
awards.loc[111623,'Abstract'] = re.sub(r'^.+:\s*','',awards.loc[111623,'Abstract'])
awards.loc[9711326,'Abstract'] = re.sub(r'^-','',awards.loc[9711326,'Abstract'])
# NOTE(review): lstrip() removes any leading run of the characters '.', ' ',
# 'J', 'H' (a character set, not the literal prefix) -- confirm it does not
# eat the first letter of the abstract proper.
awards.loc[9016894,'Abstract'] = awards.loc[9016894,'Abstract'].lstrip('. J H.')
awards.loc[9807686,'Abstract'] = re.sub(r'^\..+<br/>The PIs','The PIs',awards.loc[9807686,'Abstract'])

# drop everything from a leading '.' through "Project Summary<br/>"
for i in [536431,327294]:
    awards.loc[i,'Abstract'] = re.sub(r'^\..+Project Summary<br/>','',awards.loc[i,'Abstract'])
    
# drop everything through the last "<br/>"
for i in [2543,2529,85447]:
    awards.loc[i,'Abstract'] = re.sub(r'^.+<br/>','',awards.loc[i,'Abstract'])

# remove double-quote characters
for i in [1822721,136861,8911430,1025204]:
    awards.loc[i,'Abstract'] = awards.loc[i,'Abstract'].replace('"','')

def _swap_text(column, old, new):
    """Replace every occurrence of the literal `old` with `new` in
    awards[column], touching only rows that contain `old`.

    Missing values count as non-matches (na=False), so the row mask is a plain
    boolean mask with no integer fill-in, and `old` is matched literally
    (regex=False). Nothing is assigned when no row matches.
    """
    hits = awards[column].str.contains(old, regex=False, na=False)
    if hits.any():
        awards.loc[hits, column] = [s.replace(old, new) for s in awards.loc[hits, column]]

# write multi-word surnames in a single-token form inside abstracts
for old_name, new_name in [('de Szoeke','DeSzoeke'),
                           ('Hickey Vargas','Hickey-Vargas'),
                           ('Muller Karger','Muller-Karger')]:
    _swap_text('Abstract', old_name, new_name)

# mis-decoded characters replaced by a space
for char in ['Â°','Âº']:
    _swap_text('Abstract', char, ' ')
    _swap_text('Title', char, ' ')

# mis-decoded characters replaced by 'n'
for char in ['Ã±','Â¹']:
    _swap_text('Abstract', char, 'n')
    _swap_text('Title', char, 'n')

_swap_text('Abstract', 'Ni¹o', 'Nino')
_swap_text('Title', 'Ni¹o', 'Nino')

awards.loc[9320572,'Abstract'] = np.nan # abstract is postscript garble

# Strip leading header material (award number, PI surname, "OCE-<number>",
# "<br/>" tags) from the start of each abstract. Every pattern is anchored at
# the start of the text and the patterns are tried in order, each one against
# the result left by the previous ones. indlist records the award number once
# per pattern that matched.
indlist = []
for i in notebook.tqdm(awards[awards.Abstract.notnull() & awards.PI.notnull()].index, desc='Awards:'):
    surname = awards.loc[i,'PI'].split()[-1]   # last whitespace-separated token of the PI field
    # Escape the name so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    name = re.escape(surname)
    name_upper = re.escape(surname.upper())
    p1 = r'^{:07d}\s*(<br/>)+\s*{}\s*(<br/>)+\s*'.format(i,name)
    p2 = r'^{:07d}\s*{}\s*(<br/>)+\s*'.format(i,name)
    p3 = r'^{:07d}\s*{}\s*'.format(i,name)
    p4 = r'^{:07d}\s*{}\s*'.format(i,name_upper)
    p5 = r'^(<br/>)+\s*OCE\s*-\s*{:07d}\s*(<br/>)+\s*'.format(i)
    p6 = r'^(<br/>)+\s*OCE\s*{:07d}\s*(<br/>)+\s*'.format(i)
    # p7-p9 contain the regex quantifier {7,}, so they avoid str.format
    p7 = r'^(<br/>)+\s*OCE\s*-\s*\d{7,}\s*(<br/>)+\s*'
    p8 = r'^(<br/>)+\s*OCE\s*-\s*%07d\s*/\s*OCE\s*-\s*\d{7,}\s*(<br/>)+\s*'%(i)
    p9 = r'^(<br/>)+\s*OCE\s*-\s*\d{7,}\s*/\s*OCE\s*-\s*%07d\s*(<br/>)+\s*'%(i)
    p10 = r'^{}\s*{:07d}\s*(<br/>)*\s*'.format(name,i)
    p11 = r'^{}\s*OCE\s*-\s*{:07d}\s*(<br/>)*\s*'.format(name,i)
    p12 = r'^{}\s*(<br/>)*\s*OCE\s*-\s*{:07d}\s*(<br/>)*\s*'.format(name,i)
    p13 = r'^{}\s*(<br/>)*\s*{:07d}\s*(<br/>)*\s*'.format(name,i)
    p14 = r'^{}\s*(<br/>)+\s*'.format(name)
    p15 = r'^{}\s*(<br/>)*\s*OCE\s*{:07d}\s*(<br/>)*\s*'.format(name,i)
    p16 = r'^{:07d}\s*/\s*{}\s*(<br/>)*\s*'.format(i,name)
    p17 = r'^{:07d}\s*(<br/>)*\s*{}\s*/\s*[a-zA-Z/&\s-]+\s*(<br/>)*\s*'.format(i,name)
    text = awards.loc[i,'Abstract']
    for p in [p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17]:
        if re.search(p, text):
            text = re.sub(p,'',text)
            awards.loc[i,'Abstract'] = text
            indlist.append(i)

# Remove the section headings "Intellectual Merit(s)" / "Broader Impact(s)" together with
# surrounding whitespace, <br/> tags and trailing ':' or '.' characters. Each match is
# replaced by a single space so the neighbouring words are not fused together.
heading_template = r'\s*(<br/>)*\s*{}\s*[:.]*\s*(<br/>)*\s*'
# Plural forms come before their singular prefixes so a trailing "s" is never left behind.
heading_patterns = [heading_template.format(heading) for heading in (
    'Intellectual Merits', 'Broader Impacts',
    'Intellectual Merit', 'Broader Impact',
    'Intellectual merits', 'Broader impacts',
    'Intellectual merit', 'Broader impact',
)]
for i in notebook.tqdm(awards[awards.Abstract.notnull()].index, desc='Awards:'):
    text = awards.loc[i,'Abstract']
    n_hits = 0
    for pat in heading_patterns:
        text, n_subs = re.subn(pat, ' ', text)
        if n_subs:
            n_hits += 1
    if n_hits:
        awards.loc[i,'Abstract'] = text
        # one entry per pattern that matched, as tracked for the final count
        indlist.extend([i] * n_hits)

# Remove whitespace at the beginning of abstracts (including spaces inserted by the
# substitutions above), then any leading run of periods and the whitespace after it.
leading_space = r'^\s+'
leading_dots = r'^\.+\s*'
for i in notebook.tqdm(awards[awards.Abstract.notnull()].index, desc='Awards:'):
    before = awards.loc[i,'Abstract']
    after = before
    for pat in (leading_space, leading_dots):
        after = re.sub(pat, '', after)
    if after != before:
        awards.loc[i,'Abstract'] = after

# indlist may hold an award several times; count each modified award once
n_modified = len(set(indlist))
print('Number of modified abstracts: {}'.format(n_modified))

HBox(children=(FloatProgress(value=0.0, description='Awards:', max=14686.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Awards:', max=14718.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Awards:', max=14718.0, style=ProgressStyle(description_wi…


Number of modified abstracts: 5366


In [9]:
# ilist = []
# for i in awards[awards.Abstract.notnull()].index:
# #     p = r'\s*(<br/>)*\s*Intellectual Merits\s*(<br/>)*\s*'
# #     p = r'^\s*(<br/>)*\s*Intellectual Merit\s*(:)*\s*(<br/>)*\s*'
# #     p = r'(<br/>)*\s*Broader Impacts\s*(<br/>)*\s*'
# #     p = r'(<br/>)*\s*Broader Impact\s*(<br/>)*\s*'
# #     p = r'\s*(<br/>)*\s*Intellectual merit\s*[:.]*\s*(<br/>)*\s*'
# #     p = r'\s*(<br/>)*\s*Broader impact\s*[:.]*\s*(<br/>)*\s*'
# #     p = r'BIOCOMPLEXITY'
# #     p = r'COLLABORATIVE'
# #     p = r'American Recovery and Reinvestment Act of 2009'
#     p = 'Broader Significance'
#     if re.search(p, awards.loc[i,'Abstract']):
#         ilist.append(i)
# #         print(awards.loc[i,'Abstract'] + '\n')
# #         print(re.sub(p,' ',awards.loc[i,'Abstract'] + '\n'+113*'='))

# len(ilist)

## Save cleaned data to CSV file

In [10]:
# Write the cleaned awards to CSV, ordered by abstract text. The path is relative to the
# notebook's working directory. 'utf-8-sig' writes a UTF-8 byte-order mark at the start of
# the file - presumably to help the manual editing step; confirm.
awards_by_abstract = awards.sort_values('Abstract')
awards_by_abstract.to_csv('data/awards_1985-2020_clean.csv', encoding='utf-8-sig')

## Abstracts with Postscript code

In [11]:
# for term in ['LaserJet','Microsoft Word','Û','Â','Times New Roman','Courier',r'\*\*']:
#     print(awards.loc[awards.Abstract.str.contains(term).fillna(0),'Abstract'])

# awards.loc[awards.Abstract.str.contains('LaserJet').fillna(0),'Abstract']
# awards.loc[awards.Abstract.str.contains('Microsoft Word').fillna(0),'Abstract']
# awards.loc[awards.Abstract.str.contains('Û').fillna(0),'Abstract']
# awards.loc[awards.Abstract.str.contains('À').fillna(0),'Abstract']
# awards.loc[awards.Abstract.str.contains('Times New Roman').fillna(0),'Abstract']
# awards.loc[awards.Abstract.str.contains('Courier').fillna(0),'Abstract']

In [12]:
# List the abstracts that still contain printer / word-processor residue so they can be
# inspected. Each term is a regular expression (note the escaped asterisks).
garble_terms = ['LaserJet','Microsoft Word','Û','Â','Times New Roman','Courier',r'\*\*','Edited 6/3/04']
inds = pd.Index([])
for term in garble_terms:
    # na=False treats missing abstracts as non-matches, giving a clean boolean mask
    matches = awards.Abstract.str.contains(term, na=False)
    inds = inds.append(awards.loc[matches,'Abstract'].index)

# an abstract can match several terms; keep the first occurrence only
inds = inds.drop_duplicates()
print(len(inds))
for i in inds:
    print('{}\n{}\n'.format(i, awards.loc[i,'Abstract']))

29
9313865

9313671
WPC4 2 B V P Z Courier 10cpi ? x x x , { x 6 X @ 8 ; X @ HP LaserJet III-Change HPIIILAS.PRS x @ , \ , ||X @Courier 10cpi 2 @ Z F P #| x x @ 8 ; X @ HP LaserJet IIID hange HPLASIII.PRS x @ , \ , MX @ #| x 2 2 9313671 Beardsley Present evidence indicates that the recruitment and survival of commerically important fish larvae within the Georges Bank region depends critically on the development of stratification during the spring and summer period. Predicted climate change scenarios are likely to significantly influence the evolution of stratification and thus impact larval survival over Georges Bank. As part of U.S GLOBEC Georges Bank Study, Dr. Beardsley and collaborators will conduct an intensive field project during January August, 1995 aimed at better understanding the development of stratification over Georges Bank and its influence of larval fish survival. Preliminary studies will be initiated in 1994. The project will have a moored instrument component will inv