In [1]:
import pandas as pd
import numpy as np

In [2]:
def fill_missing_master(master, match_cols):
    '''todo'''
    master = master.copy()
    
    for col, match_list in match_cols.items():
        if len(match_list)>0:
            print(col+': ',end='')
            master_missing = (master[col].isnull()) | (master[col].astype(str).str.strip()=='')
            print('Missing before = '+str(sum(master_missing)),end='')
            
            for row in master.index:
                overwrite=True

                for series in match_list:
                    if (type(series)==str) and (series=='IF_MISSING'):
                        # flag to switch from overwriting to only filling 
                        # missing values for all subsequent series
                        overwrite=False

                    else:
                        series_missing = (series.isnull()) | (series.astype(str).str.strip()=='')

                        if master_missing[row] | overwrite:
                            # no value in master or set to overwrite so try to fill
                            # value in master from series
                            if (row in series) and (not series_missing[row]):
                                master.loc[row,col] = series[row]
                                break

                        else:
                            # some value exists in master and set to not overwrite
                            # so do nothing
                            break
                            
            master_missing = (master[col].isnull()) | (master[col].astype(str).str.strip()=='')            
            print(', Missing after = '+str(sum(master_missing)))

    return master

In [3]:
master = pd.read_excel('data/Award Holders Master.xlsx',
                      index_col='Grant Ref Unique')

# remove duplicated, nan or bad indices
master = master[master.index.notna()]
master = master[~master.index.duplicated(keep='first')]
master = master[~((master.index.str.len()<2)|(master.index.str.len()>20))]

master.index.name = 'MasterID'
master.index = master.index.astype(str)

master.columns = master.columns.str.strip()

In [4]:
unrestricted = pd.to_numeric(master['UNRESTRICTED'],errors='coerce').fillna(0)
restricted = pd.to_numeric(master['RESTRICTED'],errors='coerce').fillna(0)

master['Fund Amount'] = unrestricted+restricted
master.drop(['UNRESTRICTED','RESTRICTED'],axis=1,inplace=True)

master['Fund Amount'].replace(0,np.nan,inplace=True)

In [5]:
rf = pd.read_excel('data/ResearchFish/ResearchFish_master.xlsx',
                   index_col='MasterID')
rf.index = rf.index.fillna(-1)
rf.index = rf.index.astype(str)

ccgt = pd.read_excel('data/CCGrantTracker/CCGT_processed.xlsx',
                     index_col='MasterID')
ccgt.index = ccgt.index.fillna(-1)
ccgt.index = ccgt.index.astype(str)

filing = pd.read_excel('data/FilingCabinet/FilingCabinet_processed.xlsx',
                       index_col='MasterID')
filing.index = filing.index.fillna(-1)
filing.index = filing.index.astype(str)

In [6]:
MATCH_COLS = {'Holder':[],
              'Email':[rf['PI Email'],ccgt['Lead Applicant Email']],
              'Institution':['IF_MISSING',rf['RO'],ccgt['Organisation']],
              'PhD Student':[],
              'Grant':[],
              'Project Title':[rf['Title'],ccgt['Title'],'IF_MISSING',filing['Title of Project']],
              'Abstract':[rf['Technical Abstract'],'IF_MISSING',ccgt['Lay Summary'],ccgt['Synopsis']],
              'Status':[],
              'Fund Amount':[rf['Funding Value'],'IF_MISSING',ccgt['Total Requested']],
              'Start date':[rf['Award Start Date'],ccgt['Start Date']],
              'End date':[rf['Award End Date'],ccgt['End Date']],
              'Fight for Sight Priority':[],
              'Disease Area':[],
              'PSP Condition':[],
              'PSP Priority':[],
              'AMRC grant type':[]}

In [7]:
master = master[list(MATCH_COLS.keys())]

In [8]:
master = fill_missing_master(master, MATCH_COLS)

Email: Missing before = 86, Missing after = 58
Institution: Missing before = 33, Missing after = 27
Project Title: Missing before = 125, Missing after = 26
Abstract: Missing before = 196, Missing after = 181
Fund Amount: Missing before = 93, Missing after = 36
Start date: Missing before = 66, Missing after = 52
End date: Missing before = 105, Missing after = 77


In [9]:
# fill grant type

In [10]:
# tidy up grant type
def get_grant_type(grant_str):
    
    # Grant missing
    if type(grant_str) is not str:
        return np.nan
    
    # PhD grants
    elif 'PhD' in grant_str:
        return 'PhD'
    
    # Project grants
    elif 'Project' in grant_str:
        return 'Project'
    elif 'Alan Bird' in grant_str:
        return 'Project'
    elif 'BUS' in grant_str:
        return 'Project'
    elif 'Birdshott Uveitis Society' in grant_str:
        return 'Project'
    elif 'British Ophthalmology Sureveilance Unit (BOSU)' in grant_str:
        return 'Project'
    elif 'Ellex Medical' in grant_str:
        return 'Project'
    elif 'GMBC Oth Education Foundation' in grant_str:
        return 'Project'
    elif 'Peng Khaw' in grant_str:
        return 'Project'
    elif 'QLT' in grant_str:
        return 'Project'
    elif 'Programme' in grant_str:
        return 'Project'
    elif 'Restricted Grant' in grant_str:
        return 'Project'
    elif 'Tommy Sailsbury Fund' in grant_str:
        return 'Project'
    elif 'Tony Bickford' in grant_str:
        return 'Project'
    elif 'Tony Moore' in grant_str:
        return 'Project'
    
    # Small Grants
    elif 'Small Grant' in grant_str:
        return 'Small Grant'
    elif 'Hirsch' in grant_str:
        return 'Small Grant'
    
    # Clinical Fellowships
    elif 'Clinical Fellowship' in grant_str:
        return 'Clinical Fellowship'
    elif 'CRTF' in grant_str:
        return 'Clinical Fellowship'
    elif 'Primer Fellowship' in grant_str:
        return 'Clinical Fellowship'
    
    # Basic Fellowships
    elif 'ECIA' in grant_str:
        return 'Basic Fellowship'
    
    # Other
    else:
        return 'Other'

master['Grant Type'] = master['Grant'].apply(get_grant_type)

In [11]:
grant_year = master['Grant'].str.findall('\d{4}')
start_year = master['Start date'].str.findall('\d{4}')

def flatten_list(row):
    if type(row)==list and len(row)>0:
        return row[0]
    else:
        return np.nan

start_year = start_year.apply(flatten_list)
print(sum(start_year.isnull()))

grant_year = grant_year.apply(flatten_list)
print(sum(grant_year.isnull()))

grant_year[grant_year.isnull()] = start_year[grant_year.isnull()]
print(sum(grant_year.isnull()))
master['Grant Year'] = grant_year

116
44
37


In [12]:
# tidy up disease area
disease_areas = pd.read_csv('data/FFS_Terms_to_Categories - DiseaseArea.csv',index_col='Original')
disease_areas = disease_areas.to_dict()['Final']
master['Disease Area'].replace(disease_areas,inplace=True)

In [13]:
print(master['Institution'].nunique())
# tidy up institution
institutions = {'Institute of Ophthalmology UCL':'UCL Institute of Ophthalmology',
                "King's College  London":"King's College London",
                'Moorfields Eye Hospital NHS & UCL Institute of Ophthalmology':'Moorfields Eye Hospital NHS Foundation Trust, UCL Institute of Ophthalmology',
                'UCL Institute of OPhthalmology':'UCL Institute of Ophthalmology',
                'University of Oxford':'University of Oxford'}

master['Institution'].replace(institutions,inplace=True)
print(master['Institution'].nunique())

74
71


In [14]:
# tidy up ffs priority
priorities = {1:'New Treatments',
              2:'Understanding of Causes',
              3:'Prevention',
              4:'Early Diagnosis',
              5:'Quality of Life',
              6:'Emerging Threats'}

master['Fight for Sight Priority'].replace(priorities,inplace=True)

In [15]:
master.info()
master.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 398 entries, 1673 to 2310/2312
Data columns (total 18 columns):
Holder                      391 non-null object
Email                       340 non-null object
Institution                 371 non-null object
PhD Student                 50 non-null object
Grant                       394 non-null object
Project Title               373 non-null object
Abstract                    217 non-null object
Status                      365 non-null object
Fund Amount                 362 non-null float64
Start date                  346 non-null object
End date                    321 non-null object
Fight for Sight Priority    232 non-null object
Disease Area                287 non-null object
PSP Condition               192 non-null object
PSP Priority                189 non-null object
AMRC grant type             268 non-null object
Grant Type                  394 non-null object
Grant Year                  361 non-null object
dtypes: float64(1), object(

Unnamed: 0_level_0,Holder,Email,Institution,PhD Student,Grant,Project Title,Abstract,Status,Fund Amount,Start date,End date,Fight for Sight Priority,Disease Area,PSP Condition,PSP Priority,AMRC grant type,Grant Type,Grant Year
MasterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1411/1412,Mr Scott Robbie,s.robbie@ucl.ac.uk,UCL Institute of Ophthalmology,,New Lecturers Award 2012,The development of novel therapeutic approache...,,Ended- Complete,14000.0,2013-01-01,2013-12-31,Prevention,Cornea/Corneal Dystrophies,,,Other,Other,2012
1982,Mr David Charteris,david.charteris@moorfields.nhs.uk,Moorfields Eye Hospital NHS Foundation Trust,,Small Grant 2010 (STMEH),,,Ended- Complete,18365.0,2011-03-01 00:00:00,,,Retinal Detachment,,,,Small Grant,2010
1393/1394,Professor Maria Balda,m.balda@ucl.ac.uk,UCL Institute of Ophthalmology,Clare Mills,Frankenburg PhD 2011,Inhibition of the Rho signalling activator GEF...,Pathological responses involving degeneration ...,Ended- Complete,100000.0,2012-10-01,2015-09-30,,,,,,PhD,2011
1775/1776,Dr Hannah Dunbar,,,,Clinical Fellowship 2008,Visual disability in diabetic eye disease and ...,,Ended- Complete,,2008-10-01 00:00:00,,,,,,,Clinical Fellowship,2008
24CEC12,Professor Shin-ichi Ohnuma,s.ohnuma@ucl.ac.uk,UCL Institute of Ophthalmology,,Small Grant Award 2012,Role of SLRP family members in malignant progr...,,Ended- Complete,14994.0,2013-01-01,2013-12-31,,Retinoblastoma,,,Other,Small Grant,2012


In [16]:
master.to_excel('data/combined.xlsx')