Connect to the database and define the loading function

In [2]:
from pymongo import MongoClient
# Client built with pymongo's default connection settings.
client = MongoClient()
# `local` is the `reporter` database on the MongoDB server at localhost:27017.
local = MongoClient("mongodb://localhost:27017")["reporter"]
#mlab = MongoClient("mongodb://jv:pw@ds025379.mlab.com:25379/nihreporter").reporter

In [3]:
def loadDF(df, collectionName, db, dropFirst=True):
    """Load every row of a DataFrame into a MongoDB collection.

    Parameters
    ----------
    df : pandas.DataFrame
        Converted with ``df.to_dict(orient='records')``, i.e. one document per row.
    collectionName : str
        Name of the target collection, looked up as ``db[collectionName]``.
    db : object supporting ``db[name]`` (e.g. a pymongo database)
        Database that owns the collection.
    dropFirst : bool, default True
        When true, the collection is dropped before the rows are inserted.
    """
    collection = db[collectionName]
    if dropFirst:
        collection.drop()
    records = df.to_dict(orient='records')
    # insert_many refuses an empty document list, so an empty frame is a no-op insert.
    if records:
        collection.insert_many(records)
    # Single pre-formatted argument: prints the same text under Python 2 and Python 3.
    print('Successfully loaded: %s' % collectionName)

Prepare the DataFrames for transformation and loading

In [4]:
import pandas as pd

In [99]:
# Read the link, publication and grant spreadsheets.
df15links = pd.read_excel('RePORTER_PUBLNK_C_2015.xlsx')
df15pubs = pd.read_excel('RePORTER_PUB_C_2015.xlsx')
df15grants = pd.read_excel('RePORTER_PRJ_C_FY2015.xlsx')
# The two cost-breakdown columns are not carried forward.
df15grants = df15grants.drop(['DIRECT_COST_AMT', 'INDIRECT_COST_AMT'], axis=1)

# Work on copies so the frames as read stay available.
dfLinks, dfPubs, df = df15links.copy(), df15pubs.copy(), df15grants.copy()

<div style="background: lightgreen">
<h3>Grants model: Non-referential, compound doc approach</h3>
</div>

In [113]:
# loading PMID relationships
# One row per PROJECT_NUMBER; every remaining link column is collapsed to a list.
grantsLinkTable = (
    dfLinks
    .groupby('PROJECT_NUMBER')
    .agg(lambda values: values.tolist())
)
# Left join keeps every grant row; grants without links get NaN in the list columns.
df = df.merge(
    grantsLinkTable,
    left_on='CORE_PROJECT_NUM',
    right_index=True,
    how='left'
)

In [115]:
# Source column name -> field name used in the grant documents.
# Passed to DataFrame.rename below; APPLICATION_ID becomes the document `_id`
# and the merged PMID list becomes `publicationIds`.
cols = {
 'APPLICATION_ID': '_id',
 'ACTIVITY': 'activity',
 'ADMINISTERING_IC': 'administeringIc',
 'APPLICATION_TYPE': 'applicationType',
 'AWARD_NOTICE_DATE': 'awardNoticeDate',
 'BUDGET_START':'budgetStart',
 'BUDGET_END': 'budgetEnd',
 'CORE_PROJECT_NUM': 'coreProjectNum',
 'ED_INST_TYPE': 'edInstType',
 'FUNDING_ICs': 'fundingIcs',
 'FUNDING_MECHANISM': 'fundingMechanism',
 'FY': 'fy',
 'IC_NAME': 'icName',
 'NIH_SPENDING_CATS': 'nihSpendingCats',
 'ORG_CITY': 'orgCity',
 'ORG_COUNTRY': 'orgCountry',
 'ORG_DEPT': 'orgDept',
 'ORG_DISTRICT': 'orgDistrict',
 'ORG_DUNS': 'orgDuns',
 'ORG_FIPS': 'orgFips',
 'ORG_NAME': 'orgName',
 'ORG_STATE': 'orgState',
 'ORG_ZIPCODE': 'orgZip',
 'PHR': 'phr',
 'PI_IDS': 'piIds',
 'PI_NAMEs': 'piNames',
 'PROGRAM_OFFICER_NAME': 'programOfficerName',
 'PROJECT_START': 'projectStart',
 'PROJECT_END': 'projectEnd',
 'PROJECT_TERMS': 'projectTerms',
 'PROJECT_TITLE': 'projectTitle',
 'STUDY_SECTION': 'studySection',
 'STUDY_SECTION_NAME': 'studySectionName',
 'SUPPORT_YEAR': 'supportYear',
 'TOTAL_COST': 'totalCost',
 'PMID': 'publicationIds'
}
# Source columns removed before loading (passed to DataFrame.drop below).
dropCols = [
 'ARRA_FUNDED',
 'CFDA_CODE',
 'FOA_NUMBER',
 'FULL_PROJECT_NUM',
 'SERIAL_NUMBER',
 'SUFFIX',
 'SUBPROJECT_ID',
 'TOTAL_COST_SUB_PROJECT'
]

In [116]:
# Pipeline, in order:
#   1. keep only rows that have a TOTAL_COST (drops subprojects)
#   2. rename columns to JSONAPI-style names
#   3. drop the ID-like columns listed in dropCols
#   4. replace missing values (NaN/NaT) with '' since mongo can't handle NaT
df = (
    df[df['TOTAL_COST'].notnull()]
    .rename(columns=cols)
    .drop(dropCols, axis=1)
    .fillna('')
)

In [117]:
# activityType = first character of the activity code ('' when the code is empty)
df['activityType'] = df['activity'].map(
    lambda code: str(code)[0] if code else ''
)

In [118]:
# parse the IC names and costs into an array
def _parseFundingCost(rawCost):
    """Convert a cost string to int (or float); return 0 when blank or not numeric."""
    text = rawCost.strip()
    for convert in (int, float):
        try:
            value = convert(text)
        except ValueError:
            continue
        # float('nan') parses but is not a usable amount.
        return value if value == value else 0
    return 0


def fundingSplitter(icArray):
    """Parse a backslash-separated ``IC:cost`` string into a list of dicts.

    Parameters
    ----------
    icArray : str, None or NaN
        Text such as ``'NCI:1000\\NIA:250\\'`` (entries separated by a single
        backslash, name and cost separated by ``:``).

    Returns
    -------
    list of dict
        One ``{'ic': name, 'cost': number}`` per non-empty entry. ``cost`` is
        always numeric: 0 when the entry has no cost part or it is not a number
        (the original stored a str when present and int 0 otherwise).
        None/NaN input and the empty string give ``[]``.
    """
    # Missing values carry no funding information; str(nan) would otherwise
    # produce a bogus {'ic': 'nan'} entry.
    if icArray is None or (isinstance(icArray, float) and icArray != icArray):
        return []
    extractedICs = []
    for entry in str(icArray).split('\\'):
        entry = entry.strip()
        if not entry:
            continue
        parts = entry.split(':')
        rawCost = parts[1] if len(parts) > 1 else ''
        extractedICs.append({'ic': parts[0].strip(), 'cost': _parseFundingCost(rawCost)})
    return extractedICs
# Replace each raw fundingIcs string with the list returned by fundingSplitter.
df['fundingIcs'] = df['fundingIcs'].apply(fundingSplitter)

In [119]:
def termSplitter(arr):
    """Split a ';'-delimited value into a list of lower-cased, trimmed terms.

    Parameters
    ----------
    arr : str, bytes, None or NaN
        The delimited text. Null values (per ``pd.isnull``) give ``[]``.

    Returns
    -------
    list of str
        Non-empty terms only, so ``''`` gives ``[]`` and a trailing ``;`` does
        not add an empty term.
    """
    # On Python 3, str(b'x') would yield "b'x'"; decode real bytes first.
    # On Python 2 bytes is str, so this branch is skipped and nothing changes.
    if isinstance(arr, bytes) and not isinstance(arr, str):
        arr = arr.decode('utf-8')
    if pd.isnull(arr):
        return []
    pieces = (piece.strip().lower() for piece in str(arr).split(';'))
    return [piece for piece in pieces if piece]
# Turn the ';'-delimited text columns into lists of terms.
df['nihSpendingCats'] = df['nihSpendingCats'].map(termSplitter)
df['projectTerms'] = df['projectTerms'].map(termSplitter)
# PI fields are UTF-8 encoded before being split.
df['piIds'] = df['piIds'].map(lambda ids: termSplitter(ids.encode('utf-8')))
df['piNames'] = df['piNames'].map(lambda names: termSplitter(names.encode('utf-8')))

In [120]:
# fix timestamp issues w pymongo serializing improperly:
# every value in the date columns is stored as its str() form
dateCols = ['awardNoticeDate', 'budgetStart', 'budgetEnd', 'projectStart', 'projectEnd']
df[dateCols] = df[dateCols].apply(lambda column: column.apply(str))

In [132]:
# Missing publication ids are '' after the earlier fillna(''); store [] instead.
def reMapEmptyPubIds(pubIds):
    """Return [] when pubIds is the empty string, otherwise pubIds unchanged."""
    return [] if pubIds == "" else pubIds
# Normalise publicationIds so empty-string placeholders become empty lists.
df['publicationIds'] = df['publicationIds'].map(reMapEmptyPubIds)

In [133]:
# Write the grants frame to the local `grant` collection (dropped first by default).
loadDF(df, collectionName='grant', db=local)

Successfully loaded: grant


#### To load into mlab:
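Convert the date strings to `ISODate` values by running the following in the mongo shell (it writes the converted documents to `grant_new`):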
```
db.grant.find({}).forEach(function(doc) {
    // The notebook stores these fields as strings; missing dates are "".
    var dateFields = ["awardNoticeDate", "projectStart", "projectEnd", "budgetStart", "budgetEnd"];
    dateFields.forEach(function(field) {
        // ISODate("") throws, so only non-empty strings are converted;
        // empty values are copied through unchanged.
        if (doc[field]) {
            doc[field] = new ISODate(doc[field]);
        }
    });
    db.grant_new.save(doc);
})
```

Then create the weighted text index with:
```
// Wildcard text index over all string fields, with per-field relevance weights.
db.grant_new.createIndex(
    { "$**": "text" },
    {
        "weights": {
            projectTitle: 6,
            icName: 5,
            orgName: 4,
            projectTerms: 3,
            nihSpendingCats: 3,
            administeringIc: 2,
            'fundingIcs.ic': 2,
            phr: 2
        }
    }
)
```

In [134]:
# Replace `grant` with `grant_new` in a single rename. dropTarget=True makes the
# server drop the existing `grant` as part of the rename, so `grant` is not lost
# when `grant_new` is missing (a separate drop() would already have removed it).
local.grant_new.rename('grant', dropTarget=True)

<div style="background: lightgreen">
<h3>Pubs model: Non-referential, compound doc approach</h3>
</div>

In [121]:
# For each coreProjectNum, the list of grant _ids that share it.
dfGrantLink = (
    df
    .groupby('coreProjectNum')['_id']
    .agg(lambda ids: ids.tolist())
)

In [122]:
# Attach to every link row the list of grant _ids whose coreProjectNum equals its
# PROJECT_NUMBER (NaN when no grant matches, because this is a left join).
# The aggregated column is labelled 0 when pandas loses the Series name and
# '_id' when it keeps it, so both labels are mapped to 'projects'.
dfLinks = (
    dfLinks
    .merge(dfGrantLink.reset_index(), how='left', left_on='PROJECT_NUMBER', right_on='coreProjectNum')
    .drop('coreProjectNum', axis=1)
    .rename(columns={0: 'projects', '_id': 'projects'})
)

In [123]:
def projectArrayMaker(arrays):
    """Flatten the list-valued items of ``arrays`` into one list.

    Parameters
    ----------
    arrays : iterable
        Items that are lists are concatenated in order; anything else (for
        example a NaN placeholder) is skipped.

    Returns
    -------
    list
        The concatenation of all list items; ``[]`` when there are none.
    """
    flattened = []
    for item in arrays:
        if isinstance(item, list):
            flattened.extend(item)
    return flattened
# One flattened list of grant _ids per PMID.
pubsLinkTable = (
    dfLinks
    .groupby('PMID')['projects']
    .agg(projectArrayMaker)
)

In [124]:
# Left join keeps every publication; those without links get NaN in the merged column.
dfPubs = dfPubs.merge(
    pubsLinkTable.reset_index(),
    on='PMID',
    how='left'
)

In [125]:
# Source column name -> field name used in the publication documents.
# Passed to DataFrame.rename below; PMID becomes the document `_id`.
pubCols = {
    'AFFILIATION': 'affiliation',
    'AUTHOR_LIST': 'authors',
    'COUNTRY': 'country',
    'ISSN': 'issn',
    'JOURNAL_ISSUE': 'journalIssue',
    'JOURNAL_TITLE': 'journal',
    'JOURNAL_TITLE_ABBR': 'journalAbbr',
    'JOURNAL_VOLUME': 'journalVol',
    'LANG': 'lang',
    'PAGE_NUMBER': 'page',
    'PMC_ID': 'pmcId',
    'PMID': '_id',
    'PUB_DATE': 'pubDate',
    'PUB_TITLE': 'title',
    'PUB_YEAR': 'pubYear',
}

In [126]:
# Rename columns via pubCols, then replace missing values with ''.
dfPubs = (
    dfPubs
    .rename(columns=pubCols)
    .fillna('')
)

In [127]:
# UTF-8 encode each author string, then split it into a list with termSplitter.
dfPubs['authors'] = dfPubs['authors'].map(
    lambda names: termSplitter(names.encode('utf-8'))
)

In [128]:
# Store issn and page as strings regardless of how the spreadsheet typed them.
dfPubs['issn'] = dfPubs['issn'].map(lambda value: str(value))
dfPubs['page'] = dfPubs['page'].map(lambda value: str(value))

In [98]:
# Write the publications frame to the local `publication` collection (dropped first by default).
loadDF(dfPubs, collectionName='publication', db=local)

Successfully loaded: publication


#### To load into mlab:

Create the weighted text index with:
```
// Wildcard text index over all string fields, with per-field relevance weights.
// The author list is stored under `authors`, so that is the key that must be weighted.
db.publication.createIndex(
    {"$**": "text"},
    {"weights": 
        {
            title: 6, journal: 5, journalAbbr: 4, authors: 3
        }
    }
)
```