# I Fought the Law:
### Contesting Charges in Virginia's District Courts

<img src="internet.png">

## Data Collection
Data for this project was pulled from Ben Schoenfeld's repository of scraped Virginia court cases. I used the district criminal court records for 2017, which amounted to 2.1 million rows.

## Feature Engineering
To answer the problem statement, I needed to engineer several features:
1. Positive Outcome for the Defense: final dispositions that were 'Not Guilty', 'Dismissed', 'Nolle Prosequi', 'Not Guilty Due to Insanity', and 'Amended Charge'.
2. Type of Defense: differentiate hearings by whether the defendant had a Private Lawyer, a Public Defender, or no hired defense (defended themselves).
3. Types of Charges: the original dataset had 66,000+ unique charge descriptions and over 5,000 charge codes associated with them. To quickly bucket these into more manageable categories, I used topic modeling to cluster charges into 59 charge categories.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

import ipywidgets as widgets
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

import plotly.plotly as py
import plotly.tools as pytools
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as pyo
import cufflinks as cf

from IPython.display import HTML

import warnings
warnings.filterwarnings('ignore')

pytools.set_credentials_file(username='katerdowdy', api_key='hBCWsR3iY9a1feRSpU2A')

%matplotlib inline

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the county summary and the per-charge aggregates (each sorted by the
# column that feeds a dropdown), plus the full 2017 table.
df_summary = pd.read_csv('./summary_county_data.csv').sort_values(by='Court')
df_agg = pd.read_csv('./aggregate_charge_data.csv').sort_values(by='ChargeType')
df_full = pd.read_csv('./2017_full.csv')

# Unique courts and charge types, in sorted order, used as widget options.
counties = list(df_summary['Court'].unique())
charges = list(df_agg['ChargeType'].unique())

In [3]:
def chloropleth(selection):
    """Draw a Virginia county choropleth of one ``df_summary`` column.

    Parameters
    ----------
    selection : str
        Name of the ``df_summary`` column to map. It is also used as the
        legend title and as the plotly filename.

    Returns
    -------
    Whatever ``py.iplot`` returns for the generated figure.

    Raises
    ------
    KeyError
        If ``selection`` (or ``full_fips``) is not a column of ``df_summary``.
    """
    # Look the data up outside the try block so that a bad column name
    # surfaces as a KeyError instead of a NameError in the fallback path.
    fips = df_summary['full_fips']
    values = df_summary[selection]

    mean_value = values.mean()
    max_value = max(values)
    below_step = mean_value / 5
    above_step = (max_value - mean_value) / 5
    even_step = max_value / 10

    # Preferred bins: four steps below the mean, the mean itself, and four
    # steps between the mean and the maximum, each rounded to the nearest 1000.
    num_endpoints = (
        [np.round(mean_value - below_step * k, -3) for k in (4, 3, 2, 1)]
        + [np.round(mean_value, -3)]
        + [np.round(max_value - above_step * k, -3) for k in (4, 3, 2, 1)]
    )
    # Fallback bins: nine evenly spaced, unrounded cut points up to the maximum.
    even_endpoints = [even_step * k for k in range(1, 10)]

    colorscale = ["#eafcfd", "#b7e0e4", "#85c5d3", "#60a7c7", "#4989bc",
                  "#3e6ab0", "#3d4b94", "#323268", "#1d1d3b", "#030512"]

    def build_figure(endpoints):
        # Same figure settings for both binning strategies.
        return ff.create_choropleth(
            fips=fips,
            values=values,
            scope=['VA'],
            county_outline={'color': 'rgb(169,169,169)', 'width': 1},
            exponent_format=True,
            binning_endpoints=endpoints,
            colorscale=colorscale,
            legend_title=selection,
        )

    try:
        fig = build_figure(num_endpoints)
    except Exception:
        # Rounding to thousands collapses the endpoints to identical values for
        # small-valued columns; presumably that is what plotly rejects and why
        # the even bins are used instead -- TODO confirm the exception type and
        # narrow this handler.
        fig = build_figure(even_endpoints)

    return py.iplot(fig, filename=selection)

In [4]:
# Summary metrics offered for the county map (the same names are passed to
# ``chloropleth`` as ``selection``).
options = [
    'county_hearings',
    'county_fines_charged',
    'county_sentencing',
    'county_probation',
    'defense_win_rate',
    'Population',
    'fines_per_capita',
    'hearings_per_capita',
]

## Trends Across Virginia
#### Hearings by County
The number of hearings (corresponding to the number of tickets/arrests) seems to be higher along the I-95 and I-81 corridors and in high-density population areas (Northern Virginia, Richmond, and Virginia Beach/Norfolk).

In [5]:
# Map the `county_hearings` column by county.
chloropleth(selection='county_hearings')

#### Where Defenses Are Won

In [6]:
# Map the `defense_win_rate` column by county.
chloropleth(selection='defense_win_rate')

In [7]:
# Interactive county map: the list becomes the set of choices for `selection`.
# NOTE(review): the widget is created here but not passed to display() in this
# cell.
chloro = interactive(
    chloropleth,
    selection=[
        'county_hearings',
        'county_fines_charged',
        'county_sentencing',
        'county_probation',
        'defense_win_rate',
        'Population',
        'fines_per_capita',
        'hearings_per_capita',
    ],
)

In [8]:
def plot_agg_charges(casetype):
    """Plot a grouped bar chart of charge counts for one case type.

    For every ``ChargeType`` row of ``df_agg`` whose ``CaseType`` equals
    ``casetype`` three bars are drawn: ``agg_charge``, ``agg_charge`` scaled by
    ``agg_contested_rate``, and ``agg_charge`` scaled by
    ``agg_charge_overturn_rate``.

    Parameters
    ----------
    casetype : str
        Value of the ``CaseType`` column to plot.

    Returns
    -------
    Whatever ``py.iplot`` returns for the generated figure.
    """
    # Filter once instead of repeating the same boolean mask for every series.
    subset = df_agg[df_agg['CaseType'] == casetype]
    charge_names = subset['ChargeType']
    charge_counts = subset['agg_charge']

    data = [
        go.Bar(
            x=charge_names,
            y=charge_counts,
            name='All Charges',
        ),
        go.Bar(
            x=charge_names,
            y=charge_counts * subset['agg_contested_rate'],
            name='Defendants Went to Court',
        ),
        go.Bar(
            x=charge_names,
            y=charge_counts * subset['agg_charge_overturn_rate'],
            name='Charges Dismissed/Overturned/Amended',
        ),
    ]

    layout = go.Layout(
        autosize=False,
        width=800,
        height=800,
        barmode='group',
        xaxis=dict(
            title='CHARGES',
            titlefont=dict(
                family='Arial, sans-serif',
                size=18,
                color='lightgrey',
            ),
            showticklabels=True,
            automargin=True,
            tickangle=45,
            tickfont=dict(
                family='Arial, sans-serif',
                size=14,
                color='black',
            ),
            exponentformat='e',
            showexponent='all',
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    # Use one remote filename per case type: with a single shared name every
    # call overwrites the chart uploaded by the previous call.
    filename = 'grouped-bar-{}'.format(casetype.lower().replace(' ', '-'))
    return py.iplot(fig, filename=filename)

## Which Charges are Contested / Defeated Most?
### Infractions

In [9]:
# Grouped bar chart for rows whose CaseType is 'Infraction'.
plot_agg_charges(casetype='Infraction')

### Misdemeanors

In [10]:
# Grouped bar chart for rows whose CaseType is 'Misdemeanor'.
plot_agg_charges(casetype='Misdemeanor')

### Felonies

In [11]:
# Grouped bar chart for rows whose CaseType is 'Felony'.
plot_agg_charges(casetype='Felony')

### Civil Violations

In [12]:
# Grouped bar chart for rows whose CaseType is 'Civil Violation'.
plot_agg_charges(casetype='Civil Violation')

## How Well Can We Predict the Outcome of a Case?

In [13]:
# Columns removed from the case table before it is used as the model's
# feature matrix (see `.drop(columns=drop_features)` below).
drop_features = [
    'Unnamed: 0',
    'level_0',
    'index',
    'HearingDate',
    'HearingResult',
    # 'HearingPlea' is deliberately not listed here, so it is not dropped.
    'HearingContinuanceCode',
    'HearingType',
    'HearingCourtroom',
    'fips',
    'FiledDate',
    'Locality',
    'Status',
    'Address',
    'Gender',
    'Race',
    'Charge',
    'CodeSection',
    'Contested',
    'CaseType',
    'Class',
    'OffenseDate',
    'ArrestDate',
    'AmendedCharge',
    'AmendedCode',
    'AmendedCaseType',
    'FinalDisposition',
    'ProbationTime',
    'ProbationStarts',
    'SentenceTime',
    'SentenceSuspendedTime',
    'ProbationType',
    'OperatorLicenseSuspensionTime',
    'RestrictionEffectiveDate',
    'RestrictionEndDate',
    'OperatorLicenseRestrictionCodes',
    'Fine',
    'Costs',
    'FineCostsDue',
    'FineCostsPaid',
    'FineCostsPaidDate',
    'VASAP',
    'FineCostsPastDue',
    'person_id',
    'person_id_freq',
    'full_fips',
    'Outcome_Positive',
    'Amended',
    'Total_Positive',
    'ChargeType',
    'Court',
]

In [14]:
# Replace missing attorney / complainant / plea values with the placeholder 0.
# The filled columns are assigned back instead of calling
# ``fillna(..., inplace=True)`` on a column selection: under pandas
# Copy-on-Write the chained in-place form leaves ``df_full`` unchanged.
_fill_cols = ['DefenseAttorney', 'Complainant', 'HearingPlea']
df_full[_fill_cols] = df_full[_fill_cols].fillna(0)

def log_odds(x):
    """Return ``e ** x`` (element-wise for array-like input).

    Despite its name, this does not compute log-odds: it exponentiates its
    argument, which turns a logistic-regression coefficient into an odds ratio.
    """
    odds_ratio = np.exp(x)
    return odds_ratio

In [15]:
# Model inputs for a single court/charge pair (Fairfax County, 'MIS: Assault'):
# one-hot encode the categorical columns, keep contested hearings only, and
# split into the feature matrix X and the target y.
logreg = LogisticRegression()
df = df_full.loc[
    (df_full['Court'] == 'Fairfax County')
    & (df_full['ChargeType'] == 'MIS: Assault')
]
df_dummied = pd.get_dummies(
    df,
    columns=['DefenseAttorney', 'Complainant', 'HearingPlea'],
    drop_first=True,
)
df_model = df_dummied.loc[df_dummied['Contested'] == 1]
X = df_model.drop(columns=drop_features)
features = X.columns
y = df_model['Total_Positive']

In [16]:
# Single all-zero row with one column per model feature.
# ``columns=features`` rather than ``[features]``: wrapping the Index in a list
# makes pandas build one-level MultiIndex columns, so plain column names would
# no longer address the columns.
X_real_test = pd.DataFrame(0, index=[0], columns=features)

In [17]:
# Replace any remaining missing entries of the sample row with 0.
X_real_test.fillna(value=0, inplace=True)

In [18]:
# Choices offered by the outcome-prediction widget.
had_lawyer_options = ['Yes', 'No']
public_defender = ['Yes', 'No']
prior_hearings = list(range(11))
plea = [
    'Guilty',
    'Not Guilty',
    'Nolo Contendere',
    'Tried In Absentia',
    'Unknown',
]

In [19]:
def detail_logreg(county, charge, hire_lawyer, public_defender, prior_hearings, plea):
    """Fit a logistic regression for one county/charge and print a report.

    The model is trained on the contested hearings (``Contested == 1``) of
    ``df_full`` that match ``county`` and ``charge``, with ``Total_Positive``
    as the target. The printed report contains the class baseline, a prediction
    for the hypothetical defendant described by the remaining arguments,
    cross-validated accuracy, confusion-matrix metrics and the largest and
    smallest coefficients.

    Parameters
    ----------
    county : str
        Value of the ``Court`` column to model.
    charge : str
        Value of the ``ChargeType`` column to model.
    hire_lawyer : str
        ``'Yes'`` sets the sample's ``HadLawyer`` feature to 1.
    public_defender : str
        ``'Yes'`` sets the sample's ``PublicDefender`` feature to 1.
    prior_hearings : int
        Value of the sample's ``prior_hearings`` feature.
    plea : str
        One of ``'Guilty'``, ``'Not Guilty'``, ``'Nolo Contendere'``,
        ``'Tried In Absentia'`` or ``'Unknown'``; sets the matching one-hot
        plea column to 1 when that column exists.

    Returns
    -------
    None. Everything is printed. Failures are reported as a printed message
    instead of being raised, because the function is driven by a widget.
    """
    # One-hot column produced by ``get_dummies`` for each plea choice.
    plea_columns = {
        'Guilty': 'HearingPlea_Guilty',
        'Not Guilty': 'HearingPlea_Not Guilty',
        'Nolo Contendere': 'HearingPlea_Nolo Contendere',
        'Tried In Absentia': 'HearingPlea_Tried in Absentia',
        'Unknown': 'HearingPlea_Unknown',
    }

    try:
        subset = df_full[(df_full['Court'] == county) & (df_full['ChargeType'] == charge)]
        dummied = pd.get_dummies(
            subset,
            columns=['DefenseAttorney', 'Complainant', 'HearingPlea'],
            drop_first=True,
        )
        df_model = dummied[dummied['Contested'] == 1]
        X = df_model.drop(columns=drop_features)
        features = X.columns
        y = df_model['Total_Positive']

        # A classifier needs both outcomes to be present in the data.
        if y.nunique() < 2:
            print("Not enough contested {} cases in {} to fit a model.".format(charge, county))
            return

        baseline = y.value_counts(normalize=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42, stratify=y)
        ss = StandardScaler()
        X_train_sc = ss.fit_transform(X_train)
        X_test_sc = ss.transform(X_test)
        logreg = LogisticRegression()
        logreg.fit(X_train_sc, y_train)
        cv_train = cross_val_score(logreg, X_train_sc, y_train)
        cv_test = cross_val_score(logreg, X_test_sc, y_test)

        # Sample prediction: a single row, all zeros except the requested
        # features. Values are written with ``.loc[0, column]``; assigning to
        # ``.loc[column]`` would append new rows and leave row 0 (the row that
        # is actually predicted) untouched.
        requested = {
            'prior_hearings': prior_hearings,
            'TimeSinceOffense': df_model['TimeSinceOffense'].mean(),
        }
        if hire_lawyer == 'Yes':
            requested['HadLawyer'] = 1
        if public_defender == 'Yes':
            requested['PublicDefender'] = 1
        if plea in plea_columns:
            requested[plea_columns[plea]] = 1

        sample = pd.DataFrame(0.0, index=[0], columns=features)
        for column, value in requested.items():
            # Skip columns this subset does not have (e.g. a plea level that
            # never occurs or that ``drop_first`` removed); adding them would
            # change the number of features the scaler and model expect.
            if column in features:
                sample.loc[0, column] = value
        sample_sc = ss.transform(sample)
        prediction = logreg.predict(sample_sc)[0]
        prediction_prob = logreg.predict_proba(sample_sc)[0]

        # Coefficients, with their exponentiated form as the odds-ratio change.
        coef_df = pd.DataFrame({'logreg_coefficient': logreg.coef_[0]}, index=features)
        coef_df['change_odds_ratio'] = log_odds(coef_df['logreg_coefficient'])
        ranked = coef_df.sort_values(by='logreg_coefficient', ascending=False)
        coef_df_top = ranked.head()
        coef_df_bottom = ranked.tail()

        # Hold-out predictions for the classification metrics.
        outcome_preds = logreg.predict(X_test_sc)
        tn, fp, fn, tp = confusion_matrix(y_test, outcome_preds).ravel()

        print("Model for {} in {}".format(charge, county))
        print("Baseline:")
        print(baseline)
        print("-----")
        print("Predicting Your Outcome:")
        if prediction == 1:
            print("You will be victorious!")
        elif prediction == 0:
            print("You will fail...")
        print("Probability of success according to this model:", prediction_prob[1])
        print("-----")
        print("How Good is This Model?")
        print("Train Accuracy Scores:", cv_train, "Train Average Accuracy:", cv_train.mean())
        print("Test Accuracy Scores:", cv_test, "Test Average Accuracy:", cv_test.mean())
        print("-----")
        print("Classification Metrics:")
        print("True Negatives: %s" % tn)
        print("False Positives: %s" % fp)
        print("False Negatives: %s" % fn)
        print("True Positives: %s" % tp)
        print("-----")
        print("Accuracy: %s" % ((tp + tn) / (tn + fp + fn + tp)))
        print("Misclassification Rate: %s" % ((fp + fn) / (tn + fp + fn + tp)))
        print("-----")
        print("Sensitivity/Recall (True Positive Rate): %s" % ((tp) / (tp + fn)))
        print("Specificity (True Negative Rate): %s" % ((tn) / (tn + fp)))
        # The false positive rate is taken over the actual negatives (fp + tn).
        print("False Positive Rate: %s" % ((fp) / (fp + tn)))
        print("Precision: %s" % ((tp) / (tp + fp)))
        print("-----")
        print("Factors that Help the Case")
        print(coef_df_top)
        print("-----")
        print("Factors that Hurt the Case")
        print(coef_df_bottom)
    except Exception as err:
        # Keep the widget alive for selections that cannot be modelled, but say
        # what went wrong instead of hiding it.
        print("Could not build a model for {} in {} ({}: {})".format(
            charge, county, type(err).__name__, err))

In [20]:
# Build the outcome-prediction widget (one control per argument of
# `detail_logreg`) and show it.
a = interactive(
    detail_logreg,
    county=counties,
    charge=charges,
    hire_lawyer=had_lawyer_options,
    public_defender=public_defender,
    prior_hearings=prior_hearings,
    plea=plea,
)
display(a)

interactive(children=(Dropdown(description='county', options=('Accomack County', 'Albemarle County', 'Alexandr…

### Does Race Make a Difference?

In [21]:
def anova_race1(county, charge, *args):
    """Run a one-way ANOVA of defense outcomes across a list of races.

    The test compares ``Total_Positive`` between the given ``Race`` groups of
    the contested hearings (``Contested == 1``) in ``df_full`` for one court
    and charge type, then prints the p-value, a verdict at the 0.01 level and
    the share of each race among those hearings.

    Parameters
    ----------
    county : str
        Value of the ``Court`` column to analyse.
    charge : str
        Value of the ``ChargeType`` column to analyse.
    *args
        The first extra positional argument must be a sequence of at least two
        ``Race`` values (each is compared as ``str``). Any number of groups is
        accepted. Further positional arguments are ignored.

    Raises
    ------
    ValueError
        If no sequence, or a sequence with fewer than two races, is given.
    """
    if not args or len(args[0]) < 2:
        raise ValueError("anova_race1 needs a sequence of at least two races to compare")
    races = args[0]

    df_test = df_full[(df_full['ChargeType'] == charge) &
                      (df_full['Court'] == county) &
                      (df_full['Contested'] == 1)]

    # One outcome sample per requested race, whatever the number of races.
    samples = [df_test[df_test['Race'] == str(race)]['Total_Positive']
               for race in races]
    test = stats.f_oneway(*samples)

    percent_race = df_test['Race'].value_counts(normalize = True)
    p_value = test.pvalue

    print("Comparing mean outcomes for these populations contesting {} charges in {}:".format(charge, county))
    for i in races:
        print("    ", i)
    print("P-value:", p_value)
    if p_value <= 0.01:
        print("The p-value is sufficiently small that we can reject the null hypothesis and accept the alternative hypothesis: that there is a statistically significant difference in defense outcomes for these groups based on race.")
    else:
        # Also reached for a NaN p-value, which previously printed no verdict.
        print("The p-value is not small enough to reject the null hypothesis. We cannot draw a conclusion about how outcomes differ by race for this charge.")
    print("--------------")
    print("Demographic makeup of defendees contesting {} charges in {}:".format(charge, county))
    print(percent_race)

In [22]:
def anova_race(county, charge, comparison):
    """Run a one-way ANOVA of defense outcomes across preset race groups.

    The test compares ``Total_Positive`` between ``Race`` groups of the
    contested hearings (``Contested == 1``) in ``df_full`` for one court and
    charge type, then prints the p-value, a verdict at the 0.01 level and the
    share of each race among those hearings.

    Parameters
    ----------
    county : str
        Value of the ``Court`` column to analyse.
    charge : str
        Value of the ``ChargeType`` column to analyse.
    comparison : str
        One of the preset labels listed in ``group_counts`` below; each label
        selects the first N entries of ``race_order``.

    Raises
    ------
    ValueError
        If ``comparison`` is not one of the preset labels.
    """
    # ``Race`` values in the order in which the presets add them.
    race_order = ['White Caucasian(Non-Hispanic)',
                  'Black(Non-Hispanic)',
                  'Hispanic',
                  'Asian Or Pacific Islander',
                  'American Indian']
    # How many leading entries of ``race_order`` each preset compares.
    group_counts = {
        'White, Black': 2,
        'White, Black, Latino': 3,
        'White, Black, Latino, Asian Or Pacific Islander': 4,
        'White, Black, Latino, Asian Or Pacific Islander, Native American': 5,
    }
    if comparison not in group_counts:
        raise ValueError("Unknown comparison: {!r}".format(comparison))

    df_test = df_full[(df_full['ChargeType'] == charge) &
                      (df_full['Court'] == county) &
                      (df_full['Contested'] == 1)]

    samples = [df_test[df_test['Race'] == race]['Total_Positive']
               for race in race_order[:group_counts[comparison]]]
    test = stats.f_oneway(*samples)

    percent_race = df_test['Race'].value_counts(normalize = True)
    p_value = test.pvalue

    print("Comparing mean outcomes for these populations contesting {} charges in {}:".format(charge, county))
    print(comparison)
    print("-----")
    print("P-value:", p_value)
    if p_value <= 0.01:
        print("YES. Race makes a difference.")
        print("The p-value is sufficiently small that we can reject the null hypothesis and accept the alternative hypothesis: "
              "that there is a statistically significant difference in defense outcomes for these groups based on race.")
    else:
        # Also reached for a NaN p-value, which previously printed no verdict.
        print("Inconclusive.")
        print("The p-value is not small enough to reject the null hypothesis. "
              "We cannot draw a conclusion about how outcomes differ by race for this charge.")
    print("--------------")
    print("Demographic makeup of defendants contesting {} charges in {}:".format(charge, county))
    print(percent_race)

In [23]:
# Preset comparison labels offered by the race widget.
comps = [
    'White, Black',
    'White, Black, Latino',
    'White, Black, Latino, Asian Or Pacific Islander',
    'White, Black, Latino, Asian Or Pacific Islander, Native American',
]

In [24]:
# Keep the widget under its own name. Rebinding ``anova_race`` to the widget
# shadows the function, so re-running this cell would pass the widget (not the
# function) to ``interactive``.
anova_race_widget = interactive(
    anova_race,
    county=counties,
    charge=charges,
    comparison=comps,
)
display(anova_race_widget)

interactive(children=(Dropdown(description='county', options=('Accomack County', 'Albemarle County', 'Alexandr…

### Does Type of Defense Matter?

In [25]:
def anova_defense(county, charge, defense_comparison):
    """Run a one-way ANOVA of defense outcomes by type of defense.

    The test compares ``Total_Positive`` between defense groups of the
    contested hearings (``Contested == 1``) in ``df_full`` for one court and
    charge type. It prints the p-value, a verdict at the 0.01 level, the shares
    of ``HadLawyer`` and ``PublicDefender`` values, and the five non-public
    ``DefenseAttorney`` values with the highest ``Outcome_Positive`` totals
    together with their mean ``Outcome_Positive``.

    Parameters
    ----------
    county : str
        Value of the ``Court`` column to analyse.
    charge : str
        Value of the ``ChargeType`` column to analyse.
    defense_comparison : str
        ``'Include Defending Self'`` compares private lawyers, public defenders
        and defendants without a lawyer; ``'Only Lawyers'`` compares private
        lawyers and public defenders.

    Raises
    ------
    ValueError
        If ``defense_comparison`` is neither of the two supported values.
    """
    if defense_comparison not in ('Include Defending Self', 'Only Lawyers'):
        raise ValueError("Unknown defense_comparison: {!r}".format(defense_comparison))

    df_test = df_full[(df_full['ChargeType'] == charge) &
                      (df_full['Court'] == county) &
                      (df_full['Contested'] == 1)]

    had_lawyer = df_test['HadLawyer'] == 1
    private_outcomes = df_test[had_lawyer & (df_test['PublicDefender'] == 0)]['Total_Positive']
    public_outcomes = df_test[had_lawyer & (df_test['PublicDefender'] == 1)]['Total_Positive']

    if defense_comparison == 'Include Defending Self':
        self_outcomes = df_test[df_test['HadLawyer'] == 0]['Total_Positive']
        test = stats.f_oneway(private_outcomes, public_outcomes, self_outcomes)
        print("Comparing mean outcomes for defendees with public defenders, private lawyers, and no lawyers contesting {} charges in {} based on defense strategy:".format(charge, county))
    else:
        test = stats.f_oneway(private_outcomes, public_outcomes)
        print("Comparing mean outcomes for defendants "
              "contesting {} charges in {} based on defense strategy:".format(charge, county))

    percent_had_lawyer = df_test['HadLawyer'].value_counts(normalize = True)
    percent_had_pd = df_test[had_lawyer]['PublicDefender'].value_counts(normalize = True)
    p_value = test.pvalue

    # Hearings with a named attorney other than the 'publicdefender' value
    # (0 is the placeholder for a missing attorney).
    no_pd = df_test[(df_test['DefenseAttorney'] != 0) & (df_test['DefenseAttorney'] != 'publicdefender')]
    top_5 = no_pd.groupby('DefenseAttorney')['Outcome_Positive'].sum().sort_values(ascending = False).head(5)

    print("P-value:", p_value)
    if p_value <= 0.01:
        print("YES, defense strategy matters.")
        print("The p-value is sufficiently small that we can reject the null hypothesis and "
        "accept the alternative hypothesis: that there is a statistically significant difference "
        "in defense outcomes for these groups based on defense strategy.")
    else:
        # Also reached for a NaN p-value, which previously printed no verdict.
        print("Inconclusive.")
        print("The p-value is not small enough to reject the null hypothesis. We cannot draw "
        "a conclusion about how outcomes differ by defense strategy for this charge.")
    print("--------------")
    print("Defendants who had lawyers to help contest {} charges in {}:".format(charge, county))
    print(percent_had_lawyer)
    print("Of those with lawyers, the percentage with public defenders:")
    print(percent_had_pd)
    print('-------')
    print('Private Lawyers Who Won the Most (by # of cases)')
    print(top_5)
    print('-------')
    print('Top Lawyers\' Win Rates for This Charge')
    for i in top_5.index:
        print(i, no_pd[no_pd['DefenseAttorney'] == i]['Outcome_Positive'].mean())

In [26]:
# Comparison modes offered by the defense widget.
defenses = [
    'Include Defending Self',
    'Only Lawyers',
]

In [27]:
# Build the defense-comparison widget (one control per argument of
# `anova_defense`) and show it.
defense = interactive(
    anova_defense,
    county=counties,
    charge=charges,
    defense_comparison=defenses,
)
display(defense)

interactive(children=(Dropdown(description='county', options=('Accomack County', 'Albemarle County', 'Alexandr…

In [28]:
# Emit an HTML snippet whose script hides the notebook's input cells
# (`div.input`) once the document is ready, followed by a link that calls
# `code_toggle()` to show or hide them again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')