# Load the data and categorize the features

In [3]:
import pandas as pd
import numpy as np

import os
import datetime

from sklearn import preprocessing
from matplotlib import pyplot as plt


In [4]:
# The source CSV is large; expect this read to take roughly 30 seconds.
raw_data_path = os.path.join(os.getcwd(), 'asylum_clean_chicago_eoirstrictconsec3_1.csv')
raw_data = pd.read_csv(raw_data_path, low_memory=False)

In [7]:

# Per-column count of missing values, then the number of columns with at least one.
missing = raw_data.isnull().sum()
int((missing > 0).sum())

166

In [4]:
# Number of columns in the raw data.
raw_data.shape[1]

264

In [7]:
# Number of distinct values in the year column.
len(pd.unique(raw_data["year"]))


40

In [83]:
# Load the pre-computed imputation tables and the country-level table from the working directory.
working_dir = os.getcwd()
imputedL1 = pd.read_csv(os.path.join(working_dir, 'L1grant2_imputed.csv'))
imputedL2 = pd.read_csv(os.path.join(working_dir, 'L2grant2_imputed.csv'))
countries = pd.read_csv(os.path.join(working_dir, 'country_data_frame.csv'))

In [55]:
# (Re)load the country-level table from the working directory.
countries = pd.read_csv(os.path.join(os.getcwd(), 'country_data_frame.csv'))

In [68]:
# Column count of the country table, minus one.
countries.shape[1] - 1

47

In [70]:
# Fraction of rows where numinfamily equals 1.
(raw_data["numinfamily"] == 1).mean()

0.86848097905810362

In [73]:
# Lagged history features computed over the previous 5 through 20 decisions.
# Their names are collected here so the whole family can be dropped in one go.
nums = range(5,21)
dropcolnames = []
for template in ("numcourtgrant_prev%d", "numcourtgrantself_prev%d",
                 "numcourtdecideself_prev%d", "courtprev%d_dayslapse"):
    dropcolnames.extend(template % window for window in nums)

# A second family of lagged features only spans windows 5 through 10; these are kept.
nums = range(5,11)
keepcolnames = []
for template in ("prev%d_dayslapse", "numcourtgrantother_prev%d",
                 "courtprevother%d_dayslapse", "numgrant_prev%d"):
    keepcolnames.extend(template % window for window in nums)

In [74]:
# Features that average over a subset of decisions (per judge, judge x nationality, ...).
# The "lo" (leave-one-out) means exclude the current decision; plain meangrant includes it.
# Every prefix in `keep` is retained and every prefix in `drop` is discarded, for each subset.
avgnames = ["judge", "judgenat", "judgedef", "judgenatdef", "judgelawyer"]
keep = ["lomeangrantraw_", "numdecisionsraw_", "numdecisions_"]
drop = ["meangrant_", "lomeangrant_", "meangrantraw_"]

keepnames = []
for prefix in keep:
    for group in avgnames:
        keepnames.append(prefix + group)

dropnames = []
for prefix in drop:
    for group in avgnames:
        dropnames.append(prefix + group)

In [75]:
# Per-year counterparts: decision counts and leave-one-out means, overall / by nationality / by nat+def.
avgbyyear = ["", "nat", "natdef"]
yearnames = []
for template in ("judgenumdec%syear", "lojudgemean%syear"):
    for suffix in avgbyyear:
        yearnames.append(template % suffix)

In [76]:
# DateofAppointment appears in several string formats ("%d%b%Y", "%b-%y", "%b %y").
# Parse each distinct string once and write the resulting date to every row carrying that string.
raw_data["DateofAppointment_formatted"] = raw_data["DateofAppointment"]
appointment_known = raw_data["DateofAppointment"].notnull()
for datestring in pd.unique(raw_data.loc[appointment_known, "DateofAppointment"]):
    try:
        date = datetime.datetime.strptime(datestring, "%d%b%Y").date()
    except ValueError:
        # Month-year strings use either a dash or a space after the 3-letter month.
        # Slicing (rather than indexing) keeps very short strings from raising IndexError;
        # they fall through to strptime, which reports the bad value in a ValueError.
        if datestring[3:4] == "-":
            date = datetime.datetime.strptime(datestring, "%b-%y").date()
        else:
            date = datetime.datetime.strptime(datestring, "%b %y").date()
    # Use .loc so the write lands in raw_data itself; chained indexing
    # (raw_data[col][mask] = value) can end up writing to a temporary copy.
    raw_data.loc[raw_data["DateofAppointment"] == datestring, "DateofAppointment_formatted"] = date

# comp_date is given in number of days since January 1, 1960, so convert it to a date as well.
# Only rows with a known comp_date are converted; assigning through the mask keeps the values
# aligned with their rows (a bare list only fits when no comp_date is missing).
comp_date_known = raw_data["comp_date"].notnull()
dates_available = raw_data.loc[comp_date_known, "comp_date"]
raw_data["comp_date_formatted"] = raw_data["comp_date"].astype(object)
raw_data.loc[comp_date_known, "comp_date_formatted"] = [
    datetime.date(1960, 1, 1) + datetime.timedelta(i) for i in dates_available
]

In [8]:
# import datetime
# i = 12724
# date = datetime.date(1960, 1, 1) + datetime.timedelta(i)
# print date.year
# print date.month
# print date.day
# print date

In [105]:
# Distinct ij_code values among rows with no LastName, as a share of 426.
# NOTE(review): 426 is presumably the total number of distinct judges - confirm.
judges_without_name = pd.unique(raw_data["ij_code"][raw_data["LastName"].isnull()])
len(judges_without_name)/426.

0.13145539906103287

In [107]:
# Shape of data2 after removing every row that has any missing value.
data2.dropna(axis=0, how="any").shape

(312577, 480)

In [108]:
# Share of data2 rows that are fully observed (312577 is the row count shown by the dropna cell above).
312577. / data2.shape[0]

0.6238401925544803

In [77]:
# Numeric feature: days from the judge's appointment to the current decision.
# Rows missing either date stay NaN.
has_both_dates = (raw_data["comp_date_formatted"].notnull()
                  & raw_data["DateofAppointment_formatted"].notnull())
subset = raw_data[has_both_dates]
timediff = [(a-b).days for (a,b) in zip(subset["comp_date_formatted"], subset["DateofAppointment_formatted"])]
raw_data["TimeSinceAppointment"] = np.nan
# .loc writes into raw_data directly; the chained form raw_data[col][mask] = values
# may assign into a temporary copy and leave the frame unchanged.
raw_data.loc[has_both_dates, "TimeSinceAppointment"] = timediff

In [78]:
# Column groups of raw_data. The drop cell further down concatenates identifiers, duplicates,
# alternative_targets, unhelpful, interactions and dropcolnames and removes them in one call.

# These are mostly unique identifiers and we wouldn't want to include them in the model
identifiers = ["idncase", "idnproceeding", "eoirattyid", "alienattyid", "hearing_loc_code", 
              "LastName", "FirstName", "IJ_NAME", "Judge_name_SLR", "famcode",
              "judge_name_caps", "ij_code"]
# Need to transform to sets of indicator variables with sklearn.preprocessing
# NOTE(review): the encoding cell below actually uses pd.get_dummies for these columns.
categorical = ["natid", "courtid", "comp_dow", "President_SLR"]
# Note that we've taken certain judge-level identifiers including ij_code, "FirstUndergrad", "LawSchool",
# "Bar", "Court_SLR" out of the data because they are categorical variables with too many levels, 
# and because we would like to be able to make predictions on new judges not in the data set

# Variables related to the biographical info we have on each judge, missing for about 5% of cases
# Overall I am a little skeptical about the quality of this data
# NOTE: judge_vars is rebuilt from data3 column positions in a later cell, replacing this list.
judge_vars = ["Male_judge", "TimeSinceAppointment",
             "Year_Appointed_SLR", "Year_College_SLR", "Year_Law_school_SLR", "President_SLR", 
             "Government_Years_SLR", "Govt_nonINS_SLR", "INS_Years_SLR", "Military_Years_SLR", "NGO_Years_SLR",
             "Privateprac_Years_SLR", "Academia_Years_SLR", "experience", "experience8", "log_experience", 
             "log_gov_experience", "log_INS_experience", "log_military_experience", "log_private_experience",
             "log_academic_experience", "govD", "INSD", "militaryD", "privateD", "academicD", "democrat"]
# These are mostly interactions of already included categorical variables, so we don't want to include them
interactions = ["ij_court_code", "natcourtcode", "natdefcode", "natdefcourtcode"]
# These essentially duplicate other variables already contained within the model, often based on less complete
# information, (i.e. based on grant rather than grantraw)
# dropnames (the discarded mean-grant columns built above) is appended to this group.
duplicates = ["orderwithinday","L1grant", "L2grant", "moderategrant3070", "judgemeanyear", "judgemeannatyear", 
              "judgemeannatdefyear", "grantgrant", "grantdeny", "denygrant", "denydeny", "Gender", 
             "republican", "DateofAppointment", "comp_date_formatted", "DateofAppointment_formatted", 
             "YearofFirstUndergradGraduatio", "INS_Every5Years_SLR", "afternoon"] + dropnames
# Information about average grant rate for particular subsets of the data
averages = keepnames + yearnames
# Important to exclude these or we will have major leakage
alternative_targets = ["grant2", "grant"]
# grantraw differs from grant in that grant is NA when orderwithinday is unknown (see flag_unknownorderwithinday)
target = "grantraw"
# Columns judged not useful for modelling; dropped together with the groups above.
unhelpful = ["order_raw", "min_osc_date", "max_osc_date", "min_input_date", "max_input_date", 
             "negoutliermeanyear", "negoutliermeannatyear", "negoutliermeannatdefyear", "OtherLocationsMentioned",
            "JudgeUndergradLocation", "JudgeLawSchoolLocation", "FirstUndergrad", "LawSchool",
            "Bar", "Court_SLR", "adj_time_start"]
# Data-quality flag columns; this list is not part of the drop call.
flag_variables = ["flag_decisionerror_strdes", "flag_decisionerror_idncaseproc", "flag_earlystarttime", 
                  "flag_mismatch_base_city", "flag_mismatch_hearing", "flag_multiple_proceedings",
                 "flag_notfirstproceeding", "flag_notfirstproceeding2", "flag_multiple_proceedings2",
                 "flag_prevprocgrant", "flag_prevprocdeny", "flag_unknowntime", "flag_unknownorderwithinday"]
# Lagged-decision features: the four named columns plus the 5-10 window columns collected in keepcolnames.
time_series = ["L1grant_sameday", "L2grant_sameday", "L1grant2", "L2grant2"] + keepcolnames

In [27]:
# Total number of column names across the groups that get dropped.
sum(len(group) for group in (identifiers, duplicates, alternative_targets,
                             unhelpful, interactions, dropcolnames))

132

In [67]:
# Fraction of rows flagged as having an unknown order within the day.
flagged_rows = sum(data["flag_unknownorderwithinday"])
flagged_rows / float(len(data))

0.17332298180032851

# Drop unnecessary variables and transform categorical variables

In [79]:
# Remove every column group that should not reach the model.
columns_to_drop = identifiers + duplicates + alternative_targets + unhelpful + interactions + dropcolnames
data = raw_data.drop(columns_to_drop, axis=1)

# Time-of-day indicators are undefined when the start hour is unknown.
# .loc is used for all writes: chained indexing (data[col][mask] = value) may modify a
# temporary copy instead of `data`.
start_unknown = data["hour_start"].isnull()
data.loc[start_unknown, "morning"] = np.nan
data.loc[start_unknown, "lunchtime"] = np.nan

# Judges appointed in 1990 are assigned to "Bush I" and marked as non-democrat appointees.
appointed_1990 = data.Year_Appointed_SLR == 1990
data.loc[appointed_1990, "President_SLR"] = "Bush I"
data.loc[appointed_1990, "democrat"] = 0

In [47]:
# Number of columns that take exactly two distinct values (NaN counts as a value for pd.unique).
sum(1 for column in data.columns if len(pd.unique(data[column])) == 2)

24

In [48]:
# Distinct values per categorical column, printed one per line in this order.
for column in ("President_SLR", "comp_dow", "natid", "courtid"):
    print(len(pd.unique(data[column])))

10
7
227
54


In [49]:
# Sum of the four level counts printed by the previous cell (President_SLR, comp_dow, natid, courtid).
10+7+227+54

298

In [50]:
# Number of columns left after the drop.
data.shape[1]

135

In [80]:
# One-hot encode each categorical column (with an explicit NaN indicator) and append the
# indicator columns to a copy of `data`; the source columns are still present afterwards.
data2 = data.copy()
indicator_frames = [pd.get_dummies(data2[feature], prefix=feature, dummy_na=True)
                    for feature in categorical]
data2 = pd.concat([data2] + indicator_frames, axis=1)

In [52]:
# Column count after adding the indicator columns.
data2.shape[1]

435

In [81]:
# The original categorical columns are now redundant with their indicator columns.
data2 = data2.drop(labels=categorical, axis=1)

In [84]:
# Discard the first two columns of the L1 imputation table, then swap the frame's own
# L1grant2 column for the columns of the imputation table.
imputedL1 = imputedL1.drop(list(imputedL1.columns[:2]), axis=1)
data2 = pd.concat([data2.drop("L1grant2", axis=1), imputedL1], axis=1)

In [85]:
# Same swap for L2: drop the table's "index" column and replace the frame's L2grant2 column
# with the columns of the imputation table.
imputedL2 = imputedL2.drop("index", axis=1)
data2 = pd.concat([data2.drop("L2grant2", axis=1), imputedL2], axis=1)

In [90]:
# Show which columns remain in the L1 imputation table.
imputedL1.columns

Index([u'L1grant2', u'L1imputed'], dtype='object')

In [86]:
# Append the country-of-origin columns, after discarding the first column of the country table.
countries = countries.drop(countries.columns[0], axis=1)
data2 = pd.concat([data2, countries], axis=1)

# Deal with missing data in non-time series features

In [18]:
# Take the two lag-grant columns out of the time-series group. Filtering into a new list
# (instead of list.remove) keeps the cell re-runnable: a second run no longer raises ValueError.
time_series = [name for name in time_series if name not in ("L1grant2", "L2grant2")]

# data3 holds everything except the remaining time-series columns.
data3 = data2.drop(time_series, axis=1)

# Missing-value count per column, largest first (legacy in-place Series.sort).
missing = np.sum(pd.isnull(data3))
missing.sort(ascending=False)

In [93]:
# Fraction of missing values per column of data2, and how many columns have any.
missing = data2.isnull().mean()
int((missing > 0).sum())

73

In [94]:
# Order the per-column missing fractions from largest to smallest.
# NOTE(review): Series.sort is the legacy in-place API - confirm the pinned pandas version provides it.
missing.sort(ascending=False)

In [96]:
# Median missing fraction among the columns that have any missing values.
nonzero_missing = missing[missing > 0]
np.median(nonzero_missing)

0.050900802909073492

In [110]:
# Number of columns whose missing fraction equals the median value shown above.
at_median = missing == 0.050900802909073492
len(missing[at_median])

18

In [19]:
# This code block takes too long to run

#subset = raw_data[["ij_code", "year", "natid"]][data3["lojudgemeannatyear"].isnull()]
#judge_year = subset.groupby(["ij_code", "year", "natid"])
#judge_year = judge_year.groups

#imputed_values = {}
#for judge, year, nat in judge_year.keys():
#    mean = np.mean(raw_data["lojudgemeannatyear"][(raw_data["ij_code"] != judge) 
#                                                  & (raw_data["year"] == year) & (raw_data["natid"] == nat)])
#    for row in judge_year[(judge, year, nat)]:
#        imputed_values[row] = mean 

In [20]:
# Where the exact appointment date is unknown but the appointment year is known,
# approximate the tenure as (decision year - appointment year) * 365 days.
needs_estimate = data3.TimeSinceAppointment.isnull() & data3.Year_Appointed_SLR.notnull()
subset = data3[needs_estimate].copy()
subset.TimeSinceAppointment = (subset.year - subset.Year_Appointed_SLR) * 365.
# .loc writes into data3 directly; data3[col].loc[rows] = ... is chained indexing and
# may update a temporary copy instead.
data3.loc[subset.index, "TimeSinceAppointment"] = subset["TimeSinceAppointment"]

# Remember which rows had no recorded experience before filling them in.
data3["experience_missing"] = data3.experience.isnull()

# Fall back to the sum of the individual career components; the assignment aligns on the
# row index, so only the rows selected by the mask receive a value.
experience_from_parts = (data3.Govt_nonINS_SLR + data3.INS_Years_SLR + data3.Military_Years_SLR +
                         data3.NGO_Years_SLR + data3.Privateprac_Years_SLR + data3.Academia_Years_SLR)
data3.loc[data3.experience.isnull(), "experience"] = experience_from_parts

In [21]:
# Rebuild judge_vars from column positions: everything from Male_judge up to (not including)
# the first natid indicator, plus the President_SLR indicator columns through President_SLR_nan.
start = data3.columns.get_loc('Male_judge')
stop = data3.columns.get_loc('natid_1.0')
start2 = data3.columns.get_loc('President_SLR_Bush I')
stop2 = data3.columns.get_loc('President_SLR_nan') + 1
judge_vars = list(data3.columns[start:stop]) + list(data3.columns[start2:stop2])
# These two are recomputed from `experience` after imputation, so they are not imputed themselves.
for derived in ("experience8", "log_experience"):
    judge_vars.remove(derived)

In [22]:
# Imputer with default settings, reused by the later imputation cells.
mi = preprocessing.Imputer()

# Candidate columns for imputation: deviation/outlier features, time-of-day features,
# the average-grant features and the judge biography features.
candidate_vars = ["difmeannatdefyear", "outliermeannatdefyear", "absdifmeannatdefyear",
                  "difmeannatyear", "outliermeannatyear", "absdifmeannatyear",
                  "difmeanyear", "outliermeanyear", "absdifmeanyear", "defensive",
                  "hour_start", "morning", "lunchtime"]
missing_vars = candidate_vars + averages + judge_vars

# Keep only the candidates that actually contain missing values.
missing = data3[missing_vars].isnull().sum()
missing_vars = list(missing[missing > 0].index)

In [23]:
# Sanity check: sizes of the judge feature list and of the list of columns to impute.
for count in (len(judge_vars), len(missing_vars), len(missing[missing > 0])):
    print(count)

34
45
45


In [24]:
# Work on a copy of data3; the imputed values are produced as a plain array (data4)
# and written back into the copy in the next cell.
data5 = data3.copy()
data4 = mi.fit_transform(data3[missing_vars])
print(data4.shape)

(501053, 45)


In [25]:
# Write the imputed values back, then rebuild the features derived from `experience`.
data5[missing_vars] = data4
data5["experience8"] = 1 * (data5.experience >= 8)
# .loc instead of data5.experience8[mask] = ...: the chained form may write to a temporary copy.
data5.loc[data5.experience.isnull(), "experience8"] = np.nan
# np.log yields -inf where experience is 0.
data5["log_experience"] = np.log(data5.experience)

In [26]:
# Missingness indicators: each new data5 column records whether the paired data3 column
# (i.e. the value before imputation) was null.
indicator_sources = [
    ("judge_mean_nat_missing", "lojudgemeannatyear"),
    ("judge_mean_nat__def_missing", "lojudgemeannatdefyear"),
    ("judge_mean_year_missing", "lojudgemeanyear"),
    ("judgedef_missing", "lomeangrantraw_judgedef"),
    ("judgemean_missing", "lomeangrantraw_judge"),
    ("judgelawyer_missing", "lomeangrantraw_judgelawyer"),
    ("judgenat_missing", "lomeangrantraw_judgenat"),
    ("judge_edu_missing", "Year_Law_school_SLR"),
    ("defensive_missing", "defensive"),
    ("judge_bio_missing", "experience"),
]
for indicator, source in indicator_sources:
    data5[indicator] = data3[source].isnull()

In [27]:
# Columns of data5 that still contain missing values, largest count first.
missing = data5.isnull().sum()
missing.sort(ascending=False)
missing[missing > 0]

L2grant2    2423
L1grant2     200
dtype: int64

# Now let's try and fill in some of the time series data

In [28]:
# Re-attach the time-series columns (taken from data2) to the imputed frame.
time_series_columns = data2[time_series]
data4 = pd.concat([data5, time_series_columns], axis=1)

In [29]:
# Missing-value count per column of data4 and the number of columns with any missing values.
missing = data4.isnull().sum()
missing.sort(ascending=False)
int((missing > 0).sum())

26

In [30]:
# Leave the same-day lag columns out of the list of time-series columns to impute.
# Filtering into a new list keeps the cell re-runnable; list.remove raises ValueError on a second run.
time_series = [name for name in time_series if name not in ("L1grant_sameday", "L2grant_sameday")]

In [31]:
# Distinct missing counts, followed by the per-column listing.
nonzero_missing = missing[missing > 0]
print(pd.unique(nonzero_missing))
nonzero_missing

[40061 31831 25427 20634 17046 14506 14283 13459 12352 11196  9954  8635
  2423   200]


courtprevother10_dayslapse    40061
numcourtgrantother_prev10     40061
courtprevother9_dayslapse     31831
numcourtgrantother_prev9      31831
courtprevother8_dayslapse     25427
numcourtgrantother_prev8      25427
numcourtgrantother_prev7      20634
courtprevother7_dayslapse     20634
numcourtgrantother_prev6      17046
courtprevother6_dayslapse     17046
numgrant_prev10               14506
prev10_dayslapse              14506
numcourtgrantother_prev5      14283
courtprevother5_dayslapse     14283
numgrant_prev9                13459
prev9_dayslapse               13459
numgrant_prev8                12352
prev8_dayslapse               12352
numgrant_prev7                11196
prev7_dayslapse               11196
prev6_dayslapse                9954
numgrant_prev6                 9954
numgrant_prev5                 8635
prev5_dayslapse                8635
L2grant2                       2423
L1grant2                        200
dtype: int64

In [32]:
# Impute the remaining time-series columns with the same imputer (refit on these columns)
# and store the result in a copy of data4.
missing_vars = time_series
data6 = data4.copy()
data5 = mi.fit_transform(data4[missing_vars])
data6[missing_vars] = data5

In [33]:
# Null mask of the imputed time-series columns.
imputed_columns = data6[missing_vars]
imputed_columns.isnull()

Unnamed: 0,prev5_dayslapse,prev6_dayslapse,prev7_dayslapse,prev8_dayslapse,prev9_dayslapse,prev10_dayslapse,numcourtgrantother_prev5,numcourtgrantother_prev6,numcourtgrantother_prev7,numcourtgrantother_prev8,...,courtprevother7_dayslapse,courtprevother8_dayslapse,courtprevother9_dayslapse,courtprevother10_dayslapse,numgrant_prev5,numgrant_prev6,numgrant_prev7,numgrant_prev8,numgrant_prev9,numgrant_prev10
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Alternative data set: drop every row of data4 that still has a missing value,
# and report the row counts before and after.
data_dropped = data4.dropna()
for frame in (data4, data_dropped):
    print(len(frame))

501053
445145


In [35]:
# Columns of data6 that still contain missing values, largest count first.
missing = data6.isnull().sum()
missing.sort(ascending=False)
print(missing[missing > 0])

L2grant2    2423
L1grant2     200
dtype: int64


In [36]:
# Set the same-day lag flags to 1 for imputed lags when the case is not the first
# (resp. not among the first two) of the day. The masks come from data4, which shares its index with data6.
# .loc is required here: data6.col[mask] = 1 is chained indexing and may not modify data6.
l1_rows = (data4.orderwithindayraw > 1) & (data4.L1imputed == 1)
l2_rows = (data4.orderwithindayraw > 2) & (data4.L2imputed == 1)
data6.loc[l1_rows, "L1grant_sameday"] = 1
data6.loc[l2_rows, "L2grant_sameday"] = 1

In [37]:
# Impute the last two columns with missing values and mark the affected rows as imputed.
missing_vars = ["L1grant2", "L2grant2"]
# .loc writes into data6 itself; the chained form data6.col[mask] = 1 may hit a temporary copy.
data6.loc[data6.L1grant2.isnull(), "L1imputed"] = 1
data6.loc[data6.L2grant2.isnull(), "L2imputed"] = 1
data4 = mi.fit_transform(data6[missing_vars])
data_final = data6.copy()
data_final[missing_vars] = data4

In [38]:
# Persist the fully imputed data set (row index not written).
post_processing_path = "refugee_data_post_processing.csv"
data_final.to_csv(post_processing_path, index=False)

In [39]:
# Confirm that no column of the final data set has missing values.
missing = data_final.isnull().sum()
missing.sort(ascending=False)
int((missing > 0).sum())

0

In [40]:
# Number of columns in the final data set.
data_final.shape[1]

491

In [41]:
# Rule of thumb: the column count should stay below sqrt(n), with n the size of an 80% training split.
n_rows, n_columns = data_final.shape
n_columns < np.sqrt(n_rows * .80)

True

In [42]:
# Same sqrt(n) rule of thumb, applied to the data set with incomplete rows dropped.
n_rows, n_columns = data_dropped.shape
n_columns < np.sqrt(n_rows * .80)

True

In [43]:
# Apply the same same-day flag correction to the dropped-rows data set.
# data_dropped comes from dropna(), so chained assignment (data_dropped.col[mask] = 1) is
# especially prone to writing into a copy; .loc targets the frame directly.
l1_rows = (data_dropped.orderwithindayraw > 1) & (data_dropped.L1imputed == 1)
l2_rows = (data_dropped.orderwithindayraw > 2) & (data_dropped.L2imputed == 1)
data_dropped.loc[l1_rows, "L1grant_sameday"] = 1
data_dropped.loc[l2_rows, "L2grant_sameday"] = 1

In [44]:
# Persist the data set without the incomplete rows (row index not written).
dropped_path = "refugee_data_with_missing_time_dropped.csv"
data_dropped.to_csv(dropped_path, index=False)

In [45]:
# Flag the rows of data_final whose index label also survives in data_dropped.
data_final["no_time_missing"] = data_final.index.isin(data_dropped.index)

In [46]:
# Store the flag as 0.0 / 1.0 rather than booleans.
data_final["no_time_missing"] = data_final["no_time_missing"].astype(float)

In [47]:
# Number of flagged rows; should match the row count of data_dropped.
data_final["no_time_missing"].sum()

445145.0

In [48]:
# Persist the final data set including the no_time_missing flag (row index not written).
post_processing_v2_path = "refugee_data_post_processing_v2.csv"
data_final.to_csv(post_processing_v2_path, index=False)

In [49]:
# (rows, columns) of the data set with incomplete rows dropped.
data_dropped.shape

(445145, 491)

In [50]:
# (rows, columns) of the final data set; it carries one extra column (no_time_missing) added above.
data_final.shape

(501053, 492)

In [1]:
# Scratch check of the progress-message format.
judge = "AA"
print("judge {0} complete".format(judge))

judge AA complete


In [112]:
# Reload the processed data set from disk; this rebinds `data`.
processed_path = "refugee_data_post_processing_v2.csv"
data = pd.read_csv(processed_path)

In [122]:
# (rows, columns) of the baseline frame.
# NOTE(review): within the visible notebook, data_baseline is first assigned in a cell further
# down (data_baseline = data.copy()); on a fresh top-to-bottom run this line would raise NameError.
data_baseline.shape

(501053, 424)

In [114]:
# Independent copy of the processed data for the baseline model.
data_baseline = data.copy(deep=True)

In [116]:
# List the country-level feature columns.
countries.columns

Index([u'Percent_Christian', u'Percent_Muslim', u'Percent_Unaffiliated', u'Percent_Hindu', u'Percent_Buddhist', u'Percent_Jewish', u'World_High_income_OECD', u'World_High_income_nonOECD', u'World_Low_income', u'World_Lower_middle_income', u'World_Upper_middle_income', u'World_nan', u'Subregion_Antarctica', u'Subregion_Australia_and NewZealand', u'Subregion_Caribbean', u'Subregion_Central_Asia', u'Subregion_Central_Europe', u'Subregion_Eastern_Africa', u'Subregion_Eastern_Asia', u'Subregion_Eastern_Europe', u'Subregion_Melanesia', u'Subregion_Mexico_and_Central_America', u'Subregion_Micronesia', u'Subregion_Middle_Africa', u'Subregion_North_Africa', u'Subregion_Northern_America', u'Subregion_Northern_Europe', u'Subregion_Polynesia', u'Subregion_South_America', u'Subregion_Southeastern_Asia', u'Subregion_Southeastern_Europe', u'Subregion_Southern_Africa', u'Subregion_Southern_Asia', u'Subregion_Southern_Europe', u'Subregion_The_Middle_East', u'Subregion_Western_Africa', u'Subregion_Weste

In [121]:
# The baseline model excludes the average-grant features.
data_baseline = data_baseline.drop(averages, axis=1)

In [131]:
# (rows, columns) of the baseline frame.
data_baseline.shape

(501053, 424)

In [125]:
# Import here so this cell does not depend on the import cell that follows it in the notebook.
from sklearn import linear_model

# Baseline classifier: logistic regression with an L2 penalty.
baseline = linear_model.LogisticRegression(penalty='l2')

In [124]:
from sklearn import linear_model

In [133]:
# Persist the baseline feature set (row index not written); split_train_test reads this file.
baseline_path = "baseline.csv"
data_baseline.to_csv(baseline_path, index=False)

In [140]:
# Load the train/test splits written by split_train_test.
train, test = [pd.read_csv(path) for path in ("baseline_train.csv", "baseline_test.csv")]

In [139]:
# Write the judge-level train/test split of baseline.csv to disk.
# split_train_test is defined in the cell below; that cell must have been run first.
split_train_test(train_out_loc="baseline_train.csv",
                 test_out_loc="baseline_test.csv",
                 processed_code_file="baseline.csv")

In [138]:
def split_train_test(train_out_loc, test_out_loc, processed_code_file,
                     judge_code_file = 'asylum_clean_chicago_eoirstrictconsec3_1.csv',
                     train_percentage = .80, rnd_seed = 4850):
    """Split the processed data into train and test CSV files at the judge level.

    All rows belonging to one judge (one ``ij_code`` value) end up on the same side of
    the split. Judges are visited in a seeded random order and added to the training set
    until the next judge would push it past ``train_percentage`` of the rows; every
    remaining row goes to the test set. Rows whose ``ij_code`` is missing are not part
    of any group and therefore always land in the test set.

    Parameters
    ----------
    train_out_loc : str
        Path of the CSV file to write the training rows to.
    test_out_loc : str
        Path of the CSV file to write the test rows to.
    processed_code_file : str
        Path of the processed CSV to split.
    judge_code_file : str
        Path of the CSV that provides the ``ij_code`` column. Its row labels are used to
        select rows of the processed file, so both files are assumed to list the same
        cases in the same order.
    train_percentage : float
        Upper bound on the share of rows placed in the training set.
    rnd_seed : int
        Seed for the judge shuffle; the same seed and inputs always give the same split.

    Returns
    -------
    None
        The two CSV files are written as a side effect (without the row index).
    """
    processed = pd.read_csv(processed_code_file)
    judge_code_data = pd.read_csv(judge_code_file, low_memory=False)

    # The shuffle below is a NumPy shuffle, so the seed has to go to NumPy. A local
    # RandomState keeps the split reproducible without touching global random state.
    rng = np.random.RandomState(rnd_seed)

    train_size = int(train_percentage * len(processed))
    judges = judge_code_data.groupby("ij_code")
    # Sort before shuffling so the outcome depends only on the seed, not on dict ordering.
    all_judges = sorted(judges.groups.keys())
    rng.shuffle(all_judges)

    train_length = 0
    train_cases = []
    # Iterating over the list (instead of indexing with a counter) cannot run past the
    # last judge when every judge fits into the training budget.
    for judge in all_judges:
        judge_rows = list(judges.groups[judge])
        if train_length + len(judge_rows) > train_size:
            # Stop at the first judge that would overshoot the target size.
            break
        train_length += len(judge_rows)
        train_cases.extend(judge_rows)

    # Keep the test rows in their original order so the output file is deterministic.
    train_lookup = set(train_cases)
    test_cases = [row for row in processed.index if row not in train_lookup]

    train_data = processed.loc[train_cases]
    test_data = processed.loc[test_cases]

    train_data.to_csv(train_out_loc, index = False)
    test_data.to_csv(test_out_loc, index = False)

In [135]:
import random

In [143]:
# Replace -inf (e.g. from log of zero) with a large negative finite value in both splits, in place.
for frame in (train, test):
    frame.replace(-np.inf, -1000., inplace=True)

In [8]:
# Fit the baseline model: every column except grantraw is a feature, grantraw is the label.
train_features = train.drop("grantraw", axis=1)
train_labels = train["grantraw"]
baseline.fit(train_features, train_labels)

NameError: name 'baseline' is not defined

In [9]:
# Reload the train/test splits from disk.
train, test = [pd.read_csv(path) for path in ("baseline_train.csv", "baseline_test.csv")]

In [12]:
# Count the distinct ij_code values of raw_data at the row labels given by train.index.
# NOTE(review): train was just read back with pd.read_csv and the split files are written with
# index=False, so train.index is presumably a fresh 0..len(train)-1 range rather than the original
# row labels - verify that this really counts the judges in the training split.
len(pd.unique(raw_data["ij_code"][train.index]))

328