# Select Best Matches #
**Author:** Andrew Larkin <br>
**Date Created:** January 13, 2022 <br>
**Summary:** Match residences in the top and bottom quartile of wind exposures.  For each record in the top quartile, a list of nearby exposures in the bottom quartile has already been created.  This script attempts to find the best set of top:bottom quartile matches.  
**Note:** for more information on how the lists of nearby exposures were derived, see the python script 'deriveMatchParallel.py'<br>
**Note:** for more information on how these matched exposures were further transformed into the final dataset used for the wind epidemiological study, see the jupyter notebook 'collateWindEpiDataset.ipynb'.

## Part 1: load libraries and define global static variables ##

In [None]:
import pandas as ps
import os
import glob
from copy import deepcopy
import gConst as const # contains absolute filepaths.  Hidden so script can be shared without compromising sensitive storage locations
# NOTE(review): BUFFER_DISTANCE is not referenced by any of the cells shown in this notebook -- confirm whether it is still needed
BUFFER_DISTANCE = 500
# the same four values that the final cell iterates over as maxDistance
MAX_DIFF = [15,25,50,100]
# echo the run configuration read from the constants module
print(const.sampleNumber)
print(const.MATCH_FOLDER)
import numpy as np

## Part 2: helper functions used by the main script ##

### combine multiple csv files of candidate matches into a single pandas dataframe  ###
**INPUTS:**
 - matchFolder (str) - absolute filepath to folder containing csv files to combine
<br>

**OUTPUTS:**<br>
 - pandas dataframe containing the comprehensive set of candidate matches

In [None]:
def combineMatches(matchFolder):
    """Combine all candidate-match csv files in a folder and score each match.

    Every ``*.csv`` file directly inside ``matchFolder`` is loaded and
    concatenated.  The ``NEAR_DIST`` value stored in ``const.WIND_METRICS`` is
    attached to both the exposed and the control record of each candidate
    (candidates whose id is missing from that file, or whose ``NEAR_DIST`` is
    negative, are dropped), and a ``match_score`` is derived where a lower
    value means a closer match.

    INPUTS:
        matchFolder (str) - absolute filepath to folder containing csv files to combine
    OUTPUTS:
        pandas dataframe of candidate matches, sorted by ascending match_score
    RAISES:
        ValueError - if the folder contains no csv files
    """
    filesToCombine = glob.glob(matchFolder + "/*.csv")
    print("found %i match files to combine" %(len(filesToCombine)))
    if not filesToCombine:
        # ps.concat would raise a less informative ValueError on an empty list
        raise ValueError("no csv match files found in folder: %s" %(matchFolder))

    li = []
    for index, filename in enumerate(filesToCombine):
        li.append(ps.read_csv(filename))
        if(index %10000 == 0):
            print("completed loading %i files" %(index))
    combinedData = ps.concat(li)

    # here is where you define the variable names
    matchColumns = [
        'exp_id', 'ctrl_id', 'ctrl_year', 'ctrl_cat', 'exp_year', 'exp_cat',
        'exp_dist', 'ctrl_dist', 'cat_diff', 'dist_diff', 'year_diff',
        'abs_dist', 'abs_year'
    ]
    newDF = combinedData[matchColumns].copy()

    # attach NEAR_DIST once for the exposed residence and once for the control residence
    valsWithDist = ps.read_csv(const.WIND_METRICS)
    mergeVals = valsWithDist[['uniqueid','NEAR_DIST']]
    for idColumn, distColumn in [('exp_id','near_dist_exp'),('ctrl_id','near_dist_ctrl')]:
        lookup = mergeVals.rename(columns={'uniqueid':idColumn,'NEAR_DIST':distColumn})
        newDF = newDF.merge(lookup,how='inner',on=idColumn)
        # keep only non-negative distances; copy so later column assignments
        # do not write into a filtered slice
        newDF = newDF[newDF[distColumn]>=0].copy()

    newDF['near_dist_diff'] = newDF['near_dist_exp'] - newDF['near_dist_ctrl']
    newDF['gradient_match_score'] = newDF['near_dist_diff'].abs()

    # create a score that quantifies how good the matches are, and sort matches by the derived score
    # (each unit of abs_year is weighted 10x relative to the two distance terms)
    newDF['match_score'] = newDF['abs_dist'] + newDF['gradient_match_score'] + newDF['abs_year']*10
    newDF = newDF.sort_values(by=['match_score'])

    # len() counts rows directly; count()[0] relied on positional access to a
    # label-indexed Series, which newer pandas versions deprecate
    print("number of records in comprehensive match dataframe: %i " %(len(newDF)))
    return(newDF)

### select best exposed/control matches that sample controls without replacement ###
**INPUTS:**
 - candidateMatches (pandas DataFrame) - dataset of candidate matches
<br>

**OUTPUTS:**<br>
 - pandas dataframe containing the matches with the best match scores, without reusing any control

In [None]:
def getOriginalMatches(candidateMatches):
    """Greedily pick the best-scoring matches, using every exposed and control id at most once.

    Matches are chosen in rounds.  In each round the lowest-score row of every
    remaining exposed id is taken, and when several of those rows share a
    control only the lowest-score one is kept.  All rows that involve a chosen
    exposed or control id are then removed and the next round starts.

    INPUTS:
        candidateMatches (pandas DataFrame) - dataset of candidate matches.
            NOTE: this frame is sorted by match_score in place, as in the
            original implementation; getBestMatchesWithinDistance2 derives the
            still-unmatched candidates from the same frame after this call.
    OUTPUTS:
        pandas dataframe of the chosen matches (latest round first) with
        'gradient_match_score' exposed as 'gradient_score', plus the constant
        columns orig_ctrl = 1 and n_matches = 0.  An empty input returns an
        empty dataframe with the same columns instead of raising.
    """
    candidateMatches.sort_values(by=['match_score'],inplace=True)

    # iterate through candidate matches, selecting best score and removing other scores using the same control
    remainingMatches = candidateMatches.copy()
    rounds = []
    numProcessed = 0
    while(len(remainingMatches)>0):
        curMatch = remainingMatches.groupby('exp_id').head(1)
        print(len(curMatch))
        curMatch = curMatch.groupby('ctrl_id').head(1)
        print(len(curMatch))
        if curMatch.empty:
            # nothing selectable (e.g. only missing ids remain): stop rather than loop forever
            break
        rounds.append(curMatch)
        remainingMatches = remainingMatches[~remainingMatches['exp_id'].isin(curMatch['exp_id'])]
        remainingMatches = remainingMatches[~remainingMatches['ctrl_id'].isin(curMatch['ctrl_id'])]
        numProcessed += len(curMatch)
        if(numProcessed%250==0):
            print("num procesed: %i, num remaining: %i" %(numProcessed,len(remainingMatches)))

    # a single concat at the end; the most recent round comes first, matching
    # the row order produced by prepending each round to the previous result
    if rounds:
        chosenMatches = ps.concat(rounds[::-1],ignore_index=False)
    else:
        chosenMatches = candidateMatches.iloc[0:0]

    # convert the chosen matches into a fresh pandas DataFrame: (output name, source name)
    outputColumns = [
        ('exp_id','exp_id'), ('ctrl_id','ctrl_id'), ('ctrl_year','ctrl_year'),
        ('ctrl_cat','ctrl_cat'), ('exp_year','exp_year'), ('exp_cat','exp_cat'),
        ('exp_dist','exp_dist'), ('ctrl_dist','ctrl_dist'), ('cat_diff','cat_diff'),
        ('dist_diff','dist_diff'), ('year_diff','year_diff'), ('abs_dist','abs_dist'),
        ('abs_year','abs_year'), ('gradient_score','gradient_match_score'),
        ('match_score','match_score'), ('near_dist_exp','near_dist_exp'),
        ('near_dist_ctrl','near_dist_ctrl')
    ]
    origMatches = ps.DataFrame({
        outName:list(chosenMatches[srcName]) for outName,srcName in outputColumns
    })

    origMatches['orig_ctrl'] = 1
    origMatches['n_matches'] = 0

    print("created %i original matches " %(len(origMatches)))
    return(origMatches)

### select best exposed/control matches that sample controls with replacement ###
**INPUTS:**
 - unmatchedCandidates (pandas DataFrame) - dataset of candidate matches
 - resample (string) - whether to resample exposed or ctrl to get multiple matches
 - prevMatchPenalty (int) - adjust match score to reduce match quality for repeated sampling
 - maxAllowedMatches (int) - maximum number of times a control id can be repeatedly sampled
<br>

**OUTPUTS:**<br>
 - pandas dataframe containing matches with best match scores using repeat controls

In [None]:
def getUnoriginalMatches(unmatchedCandidates,resample='exp_id',prevMatchPenalty=10,maxAllowedMatches=4):
    """Greedily pick matches while letting one side of the pair be reused.

    Matches are chosen in rounds.  In each round the lowest-score row of every
    remaining exposed id is taken, and when several of those rows share a
    control only the lowest-score one is kept.  Rows whose non-resampled id was
    just matched are removed.  Remaining rows whose resampled id was just used
    get n_matches increased by one and match_score increased by
    prevMatchPenalty; rows that reach maxAllowedMatches are discarded.

    INPUTS:
        unmatchedCandidates (pandas DataFrame) - dataset of candidate matches;
            must already contain an 'n_matches' column.  It is not modified.
        resample (string) - 'ctrl_id' to reuse controls; any other value reuses exposed
        prevMatchPenalty (int) - amount added to match_score for each previous use
        maxAllowedMatches (int) - rows are only eligible while n_matches is below this value
    OUTPUTS:
        pandas dataframe of the chosen matches (latest round first) with
        'gradient_match_score' exposed as 'gradient_score' and
        orig_match_id = 1 where n_matches is 0, else 0.  An empty input
        returns an empty dataframe with the same columns instead of raising.
    """
    # define whether exposed or controls are resampled to get multiple matches
    if resample == 'ctrl_id':
        uniqueSample = 'exp_id'
    else:
        uniqueSample = 'ctrl_id'

    # The first round takes the first row per id, so the rows must be ordered
    # by score before the loop starts.  sort_values returns a new frame (the
    # caller's data is untouched) and the stable mergesort leaves an
    # already-sorted input in its existing order.
    remainingMatches = unmatchedCandidates.sort_values(by=['match_score'],kind='mergesort')

    # iterate through candidate matches, selecting best match and updating scores with new penalty
    rounds = []
    numProcessed = 0
    while(len(remainingMatches)>0):

        # get the best matches, using each record only once
        curMatch = remainingMatches.groupby('exp_id').head(1)
        curMatch = curMatch.groupby('ctrl_id').head(1)
        if curMatch.empty:
            # nothing selectable (e.g. only missing ids remain): stop rather than loop forever
            break
        rounds.append(curMatch)

        # drop rows whose non-resampled id has just been matched; copy so the
        # updates below do not write into a filtered slice
        stillAvailable = ~remainingMatches[uniqueSample].isin(curMatch[uniqueSample])
        remainingMatches = remainingMatches[stillAvailable].copy()

        # update match scores to add the penalty for previous matches
        reused = remainingMatches[resample].isin(curMatch[resample])
        remainingMatches['n_matches'] += reused
        remainingMatches['match_score'] += reused*prevMatchPenalty

        # remove all remaining matches with controls that have been used maxAllowedMatches or more times
        remainingMatches = remainingMatches[remainingMatches['n_matches']<maxAllowedMatches]
        remainingMatches = remainingMatches.sort_values(by=['match_score'])
        numProcessed += len(curMatch)
        if(numProcessed%500==0):
            print("num procesed: %i, num remaining: %i" %(numProcessed,len(remainingMatches)))

    # a single concat at the end; the most recent round comes first, matching
    # the row order produced by prepending each round to the previous result
    if rounds:
        chosenMatches = ps.concat(rounds[::-1],ignore_index=True)
    else:
        chosenMatches = unmatchedCandidates.iloc[0:0]

    # create new dataframe from matches: (output name, source name)
    outputColumns = [
        ('ctrl_id','ctrl_id'), ('exp_id','exp_id'), ('ctrl_year','ctrl_year'),
        ('ctrl_cat','ctrl_cat'), ('exp_year','exp_year'), ('exp_cat','exp_cat'),
        ('exp_dist','exp_dist'), ('ctrl_dist','ctrl_dist'), ('cat_diff','cat_diff'),
        ('dist_diff','dist_diff'), ('year_diff','year_diff'), ('abs_dist','abs_dist'),
        ('abs_year','abs_year'), ('match_score','match_score'),
        ('gradient_score','gradient_match_score'), ('n_matches','n_matches'),
        ('near_dist_exp','near_dist_exp'), ('near_dist_ctrl','near_dist_ctrl')
    ]
    secondMatches = ps.DataFrame({
        outName:list(chosenMatches[srcName]) for outName,srcName in outputColumns
    })

    secondMatches['orig_match_id'] = 1*(secondMatches['n_matches']==0)
    print("found %i matches that used previously matched controls " %(len(secondMatches)))

    return(secondMatches)

### select best exposed/control matches: first sample controls without replacement, then, for exposed records that are still unmatched, sample controls with replacement ###
**INPUTS:**
 - candidateMatches (pandas DataFrame) - dataset of candidate matches
 - outputFilepath (str) absolute filepath where selected matches will be stored
 - maxDistance (int) - maximum allowable difference in distance to road between exposed and matched control
 - numMatches (int) - maximum number of times any maternal residence can be matched to another
<br>

In [None]:
def getBestMatchesWithinDistance2(candidateMatches,outputFilepath,maxDistance=100,numMatches=4):
    """Screen candidate matches, select the best ones and write them to a csv file.

    Candidates are kept when abs_dist and gradient_match_score are both below
    maxDistance and abs_year is below 4.  One-to-one matches are selected with
    getOriginalMatches.  When numMatches is greater than 1, exposed records that
    remain unmatched are matched against already-used controls with
    getUnoriginalMatches.

    INPUTS:
        candidateMatches (pandas DataFrame) - dataset of candidate matches (not modified)
        outputFilepath (str) - absolute filepath where selected matches will be stored
        maxDistance (int) - exclusive upper limit applied to abs_dist and gradient_match_score
        numMatches (int) - maximum number of times any maternal residence can be matched to another
    OUTPUTS:
        none; the selected matches are written to outputFilepath without the index.
        NOTE: the one-to-one matches carry an 'orig_ctrl' column while the
        resampled matches carry 'orig_match_id', so in the combined file each
        of these columns is empty for rows coming from the other step.
    """
    # restrict candidate matches based on distance and year inclusion criteria
    withinLimits = (
        (candidateMatches['abs_dist']<maxDistance) &
        (candidateMatches['gradient_match_score']<maxDistance) &
        (candidateMatches['abs_year']<4)
    )
    # copy: getOriginalMatches sorts the frame it receives in place
    screenedCandidates = candidateMatches[withinLimits].copy()
    print("%i candidate matches were found within %i distance" %(len(screenedCandidates),maxDistance))

    # get original matches (each residence can only be matched once)
    origMatches = getOriginalMatches(screenedCandidates)
    if(numMatches>1):
        print("matching more than once")

        # identify exposed that have not yet been matched and find best matches using previously matched controls
        stillUnmatched = ~screenedCandidates['exp_id'].isin(origMatches['exp_id'])
        # copy so the column updates below do not write into a filtered slice
        unmatchedCandidates = screenedCandidates[stillUnmatched].copy()

        # every control offered here counts as already used once, so each row
        # starts with n_matches = 1 and a fixed penalty of 10 on its score
        unmatchedCandidates['match_score'] += 10
        unmatchedCandidates['n_matches'] = 1

        unorigMatches = getUnoriginalMatches(unmatchedCandidates,'ctrl_id',maxAllowedMatches=numMatches-1)
        allMatches = ps.concat([origMatches,unorigMatches])
    else:
        print("only selecting original matches")
        allMatches = origMatches

    print("found %i matches total for max distance %i" %(len(allMatches),maxDistance))
    allMatches.to_csv(outputFilepath,index=False)
    print("saved all matches to designated filepath (see constants file for absolute filepath)")

## Part 3: main script ##

### load match data and vital statistics ###

In [None]:
# load matches
candidateMatches = combineMatches(const.MATCH_FOLDER)

# load the most up to date cohort records
vitalStats = ps.read_csv(const.VITAL_STATS_FILEPATH)
vitalStatsIds = list(set(vitalStats['uniqueid']))
print("%i unique records found in vital stats file" %(len(vitalStatsIds)))

# remove records not included with the latest cohort inclusion criteria:
# both the exposed and the control id must be present in the vital stats file
inCohort = (
    candidateMatches['exp_id'].isin(vitalStatsIds) &
    candidateMatches['ctrl_id'].isin(vitalStatsIds)
)
candidateMatches = candidateMatches[inCohort]
# len() counts rows directly; count()[0] relied on positional access to a
# label-indexed Series, which newer pandas versions deprecate
print("%i candidate matches left after screening with vital stats" %(len(candidateMatches)))

### create a copy of match and vital statistics data restricted to 37 to 42 weeks

In [None]:
# for analyzing term birth weight and low term birth weight, restrict matches to those where the exposed and control both have
# estimated gestational ages between 37 and 42 weeks (both limits inclusive)
vitalStats_37_42 = vitalStats[vitalStats['b_es_ges'].between(37,42)]
vitalStats_37_42IDs = list(set(vitalStats_37_42['uniqueid']))
# report the number of unique ids (not rows) and say which subset is being counted,
# so this line cannot be confused with the count printed for the full vital stats file
print("%i unique records found in vital stats file with gestational age 37-42 weeks" %(len(vitalStats_37_42IDs)))
inTermCohort = (
    candidateMatches['exp_id'].isin(vitalStats_37_42IDs) &
    candidateMatches['ctrl_id'].isin(vitalStats_37_42IDs)
)
candidateMatches_37_42 = candidateMatches[inTermCohort]
# len() counts rows directly; count()[0] relied on positional access to a
# label-indexed Series, which newer pandas versions deprecate
print("%i candidate matches left after restricting to 37-42 weeks" %(len(candidateMatches_37_42)))

### find best matches for all match distances and save results ###

In [None]:
# For every distance threshold and every repeat-match limit, select matches for
# the full cohort ("all") and for the 37-42 week subset ("37to42").  A
# combination whose output file already exists is skipped, so delete the file
# to force it to be recomputed.
matchSets = [("all",candidateMatches),("37to42",candidateMatches_37_42)]
for maxDistance in MAX_DIFF:  # thresholds come from the constant defined in Part 1
    for numMatches in [1,10]:
        for label,matches in matchSets:
            outputFilepath = const.SELECTED_MATCH_FOLDER + "selected_matches_" + label + "_" + str(numMatches) + "_" + str(maxDistance) + ".csv"
            if not(os.path.exists(outputFilepath)):
                getBestMatchesWithinDistance2(matches,outputFilepath,maxDistance,numMatches)