In [45]:
import pandas as pd
import numpy as np
import os
import jellyfish


In [46]:
# All data files are located here. Set the JUDGE_DATA_DIR environment variable
# to point somewhere else without editing the notebook; when it is unset the
# original Dropbox location is used.
dataDir = os.environ.get('JUDGE_DATA_DIR', r'C:\Users\Ian\Dropbox\ECON 407 - US Courts & Judges')


In [47]:
# Read the raw judge attribute table from the data directory.
raw_attribute_path = os.path.join(dataDir, 'Judge Attribute Data.csv')
judge_att_data = pd.read_csv(raw_attribute_path)


In [48]:
# Drop unnecessary columns, rename necessary columns

# Raw columns that are discarded. Each label is listed exactly once (the
# earlier version of this list named 'liable' twice).
columns_to_drop = [
    'name_original', '___l', '___j', '___char', 'elevate', 'dcother',
    'liable', 'dummy', 'religion', 'circuit',
    'songer_code', 'amon', 'crossl', 'pred', 'appt', 'temp',
    'trans', 'abamin', 'dsenate', 'rsenate', 'dhouse',
    'rhouse', 'fhouse', 'fsenate', 'drhouse', 'drsenate',
    'whouse', 'wsenate', 'nrhouse', 'nrsenate', 'dsens', 'rsens',
    'yeari', 'yearc', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'congresi',
    'unity', 'e7', 'e8', 'yearo', 'congreso', 'unityo', 'cityb',
    'badeg', 'bastate', 'bastatus', 'jddeg', 'jdstate', 'jdstatus',
    'grad1', 'grad2', 'tperm', 'fsens', 'drsens', 'wsens', 'nrsens',
    'osens', 'agego', 'service', 'csb', 'ba', 'bast', 'bapp', 'ls',
    'lsst', 'jdpp', 'graddeg1', 'graddeg2', 'statecab', 'state2',
    'recdate', 'ageon',
]

# Raw column name -> readable column name for the columns that are kept.
column_renames = {
    'name': 'Name',
    'circuit_original': 'Circuit',
    'id': 'ID',
    'pres': 'Appointing President',
    'yearl': 'Year of Departure',
    'yearb': 'Year of Birth',
    'yeard': 'Year of Death',
    'pleft': 'President when Departed',
    'left': 'Reason for Departing',
    'party': 'Judge Party',
    'district': 'District',
    'state': 'State',
    'city': 'City',
    'gender': 'Gender',
    'race': 'Race',
    'ayear': 'Year of Appointment',
    'crossa': 'Cross Appointment',
    'recess': 'Recess Appointment',
    'aba': 'ABA Rating',
    'assets': 'Assets',
    'congress': 'Congress',
    'unityi': 'Unity',
    'hdem': 'House Democrats',
    'hrep': 'House Republicans',
    'sdem': 'Senate Democrats',
    'srep': 'Senate Republicans',
    'hother': 'House Independents',
    'sother': 'Senate Independents',
    'networth': 'Net Worth',
}

# drop() raises KeyError if any listed column is absent, so a typo in the list
# (or re-running this cell on an already-cleaned frame) fails loudly.
judge_att_data = (
    judge_att_data
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
)



In [49]:
# Replace zero values with missing for net worth and assets
def replace_zero_with_na(x):
    """Return ``np.nan`` when ``x`` compares equal to 0, otherwise return ``x`` unchanged."""
    return np.nan if x == 0 else x
# Apply the zero -> missing conversion to both monetary columns.
for money_column in ('Assets', 'Net Worth'):
    judge_att_data[money_column] = judge_att_data[money_column].apply(replace_zero_with_na)


In [50]:
# Turn the position indicator columns into dummies and rename
def turn_into_dummy(val):
    """Return 0 when ``val`` is missing and 1 otherwise.

    ``pd.isna`` is used instead of ``np.isnan`` because ``np.isnan`` raises
    ``TypeError`` for non-numeric cells (for example strings or ``None``),
    whereas ``pd.isna`` accepts any scalar. Results for float inputs are the
    same as before.
    """
    return 0 if pd.isna(val) else 1

# Columns whose name begins with a lowercase 'p' are treated as position indicators.
position_columns = [col for col in judge_att_data.columns if col[0] == 'p']

# Convert each indicator column to a 0/1 dummy.
for col in position_columns:
    judge_att_data[col] = judge_att_data[col].apply(turn_into_dummy)

# Rename all indicator columns in one pass: strip the leading 'p' and add a prefix.
judge_att_data = judge_att_data.rename(
    columns={col: 'Previous Position - ' + col[1:] for col in position_columns}
)


In [51]:
# Read the ideology workbook, keep only the name and score columns, and give
# them the same readable names used in the attribute table.
ideology_path = os.path.join(dataDir, 'Judge Ideology Scores.xlsx')
judge_ideo_score = (
    pd.read_excel(ideology_path)
    .loc[:, ['judgename', 'ideology_score']]
    .rename(columns={'judgename': 'Name', 'ideology_score': 'Ideology Score'})
)



In [52]:
def get_best_name_match_from_ideo_data(name, threshold=0.89):
    """Return the ideology-data name most similar to ``name``.

    Every entry of ``judge_ideo_score['Name']`` is scored against ``name``
    with Jaro-Winkler similarity. The highest-scoring entry is returned,
    provided its score is strictly greater than ``threshold``; on ties the
    first entry encountered wins.

    Parameters
    ----------
    name : str
        Judge name to look up. A non-string value (for example a missing
        name read as NaN) cannot be scored and yields ``""``.
    threshold : float, default 0.89
        Minimum similarity (exclusive) a candidate must exceed to count as
        a match.

    Returns
    -------
    str
        The best matching name, or ``""`` when no candidate exceeds
        ``threshold``.
    """
    if not isinstance(name, str):
        return ""

    # Newer jellyfish releases expose the metric as `jaro_winkler_similarity`;
    # older ones only have `jaro_winkler`. Use whichever is available.
    similarity = getattr(jellyfish, 'jaro_winkler_similarity', None)
    if similarity is None:
        similarity = jellyfish.jaro_winkler

    best_match = ""
    # Starting at the threshold means a single comparison enforces both
    # "better than the current best" and "above the threshold".
    highest_jw = threshold

    for ideo_name in judge_ideo_score['Name']:
        if not isinstance(ideo_name, str):
            # Missing/non-text candidates cannot be scored; skip them.
            continue
        current_score = similarity(name, ideo_name)
        if current_score > highest_jw:
            highest_jw = current_score
            best_match = ideo_name

    return best_match

# Attach the fuzzy-matched ideology name to each judge, then left-join the
# ideology scores on it. The merge suffixes the two 'Name' columns as
# 'Name_x' (attribute data) and 'Name_y' (ideology data); only the former is kept.
judge_att_data['Closest Name'] = judge_att_data['Name'].apply(get_best_name_match_from_ideo_data)
judge_att_data = (
    judge_att_data
    .merge(judge_ideo_score, how='left', left_on='Closest Name', right_on='Name')
    .drop(columns=['Name_y', 'Closest Name'])
    .rename(columns={'Name_x': 'Name'})
)


In [53]:
# Write the combined attribute + ideology table next to the input files, without the index column.
output_path = os.path.join(dataDir, 'Judge Attribute and Ideology.csv')
judge_att_data.to_csv(output_path, index=False)