In [12]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('criminal_main.csv',encoding='latin-1')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,File,CaseName,CivilKriminal,Court,County,Judge,DistrictAttorney,ADA,Keywords,...,Defense,First_Date,Appeal_Date,FirstD_dow,AppealD_dow,FirstD_weekday,AppealD_weekday,FirstD_month,AppealD_month,Y
0,0,2009_08295.htm.txt,,K,County Court,Niagara County,"Peter L. Broderick, Sr., J.",Michael J. Violante,Thomas H. Brandt,affirmed,...,,11/9/07,11/13/09,Friday,Friday,1,1,11.0,11,0
1,1,2009_08296.htm.txt,,K,County Court,Oneida County,"Michael L. Dwyer, J.",Scott D. McNamara,Steven G. Cox,affirmed;interest of justice,...,,11/6/06,11/13/09,Monday,Friday,1,1,11.0,11,0
2,2,2009_08297.htm.txt,,K,County Court,Onondaga County,"Joseph E. Fahey, J.",William J. Fitzpatrick,Victoria M. White,affirmed;modified;interest of justice,...,Legal Aid Society,10/18/06,11/13/09,Wednesday,Friday,1,1,10.0,11,1
3,3,2009_08298.htm.txt,,K,County Court,Niagara County,"Angelo J. Morinello, A.J.",,,affirmed,...,,11/2/07,11/13/09,Friday,Friday,1,1,11.0,11,0
4,4,2009_08311.htm.txt,,K,County Court,Onondaga County,"Joseph E. Fahey, J.",William J. Fitzpatrick,James P. Maxwell,affirmed;dismissed;interest of justice,...,Legal Aid Society,4/21/05,11/13/09,Thursday,Friday,1,1,4.0,11,1


#### Part I. GroundsForAppeal column

Assuming that there was more than merely harmless error, there are four basic grounds for appeal:

- The lower court made a serious error of law (plain error);
- The weight of the evidence does not support the verdict;
- The lower court abused its discretion in making an errant ruling;
- The claim of Ineffective Assistance of Counsel under the Sixth Amendment.

In [41]:
grounds = df.GroundsForAppeal

In [5]:
num_rows = grounds.shape[0]

In [6]:
num_missing = grounds.isnull().sum()

In [58]:
share_missing = num_missing/num_rows
print('Share of missing values:', str(share_missing))

Share of missing values: 0.472076884948


In [7]:
unique  = grounds.unique()

In [8]:
grounds_list = []

for line in grounds:
    if isinstance(line, float): #skip nan
        continue
    words = line.split(';')
    grounds_list.extend(words)

In [50]:
unique_grounds = set(grounds_list)

In [10]:
unique_grounds

{'coerce',
 'coercion',
 'double jeopardy',
 'incapac',
 'ineffective counsel',
 'instructions',
 'juror',
 'mental',
 'resentenc',
 'sever',
 'speedy',
 'sufficient',
 'suppress',
 'youth'}

In [66]:
#Count number of unique occurrences per category - this does
for ground in unique_grounds:
    print(ground, grounds_list.count(ground))

instructions 1556
suppress 5042
mental 3119
coercion 365
ineffective counsel 3
youth 740
sufficient 9569
coerce 552
sever 5128
double jeopardy 382
speedy 500
resentenc 3096
incapac 153
juror 2038


In [67]:
len(grounds)

36314

#### Conclusion: about half of the data in the column is missing. However, we could clean and featurize existing categories.

#### Part II. Mode of Conviction column

In [36]:
mode = df.ModeOfConviction

In [49]:
unique_modes = mode.unique()

In [35]:
num_rows = mode.shape[0]
num_missing = mode.isnull().sum()

In [36]:
share_missing_mode = num_missing/num_rows
print("Share of missing mode of conviction:",str(share_missing_mode))

Share of missing mode of conviction: 0.461998127444


In [52]:
#Count number of unique occurrences per category
mode_list = mode.tolist()
for ex_mode in unique_modes:
    if isinstance(ex_mode,float):
        continue #ignore nan
    print(ex_mode, mode_list.count(ex_mode))

jury verdict 7852
plea of guilty 10690
nonjury trial 995


#### Conclusion: only 46% of values are present. Could featurize if needed

### Featurizing columns

In [39]:
#Create dummy variable for mode
mode_dummies = pd.get_dummies(mode)

In [40]:
mode_dummies.head()

Unnamed: 0,jury verdict,nonjury trial,plea of guilty
0,0,0,0
1,1,0,0
2,1,0,0
3,0,0,0
4,1,0,0


In [63]:
#Featurize grounds

shape = grounds.shape[0]
num_grounds = len(unique_grounds)
unique_grounds = list(unique_grounds)

output = np.zeros((shape,num_grounds))
for g in range(num_grounds):
    for v in range(shape):
        
        val = grounds.iloc[v]
        
        if pd.isnull(val):
            continue
        else:    
            val_list = val.split(';')
            if unique_grounds[g] in val_list:
                output[v,g] = 1
            else:
                output[v,g] = 0

In [76]:
names=list(map(lambda x: str(x), unique_grounds))
names

['sever',
 'suppress',
 'sufficient',
 'instructions',
 'coerce',
 'ineffective counsel',
 'double jeopardy',
 'speedy',
 'coercion',
 'youth',
 'incapac',
 'resentenc',
 'juror',
 'mental']

In [81]:
grounds_dummies = pd.DataFrame(output, index = df.File, columns=names)

In [79]:
mode_dummies.index = df.File

In [85]:
grounds_dummies.head()

Unnamed: 0_level_0,sever,suppress,sufficient,instructions,coerce,ineffective counsel,double jeopardy,speedy,coercion,youth,incapac,resentenc,juror,mental
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009_08295.htm.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009_08296.htm.txt,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009_08297.htm.txt,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2009_08298.htm.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009_08311.htm.txt,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
mode_grounds_df = mode_dummies.merge(grounds_dummies, left_index=True, right_index=True)

In [90]:
mode_grounds_df.head()

Unnamed: 0_level_0,jury verdict,nonjury trial,plea of guilty,sever,suppress,sufficient,instructions,coerce,ineffective counsel,double jeopardy,speedy,coercion,youth,incapac,resentenc,juror,mental
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2003_18059.htm.txt,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003_18060.htm.txt,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003_18061.htm.txt,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003_18062.htm.txt,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2003_18063.htm.txt,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
mode_grounds_df.drop('ineffective counsel',axis=1)

In [92]:
mode_grounds_df.to_csv('mode_grounds_dummies.csv')