## Warren County Analysis

#### Import Libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
import plotly.express as px

#### Exploratory Data Analysis

In [4]:
warren_df = pd.read_csv('../Data/Warren_county_synthetic_data.csv', usecols = ["Race", "Age", "Gender", "Education Level", "Employment Status", "Income Level", "Prior Convictions", "Risk Score", "Judge Decision", "Re-offense"], index_col=False)

# Display the first few rows of the dataframe
# This will help us understand the structure of the data and what columns are available for analysis
warren_df.head()

Unnamed: 0,Race,Age,Gender,Education Level,Employment Status,Income Level,Prior Convictions,Risk Score,Judge Decision,Re-offense
0,White,18,Male,High School,Unemployed,34165.969893,1,3.899632,1,0
1,Black,37,Male,High School,Employed,34114.425797,2,7.631,0,0
2,White,49,Male,Less than High School,Unemployed,36421.891032,0,6.278393,0,0
3,White,33,Female,Less than High School,Employed,121869.327498,1,1.019732,1,0
4,Black,19,Male,Bachelor's Degree,Employed,73289.622946,1,5.587222,0,0


In [5]:
# Display the data types of each column
# This will help us understand the types of data we are working with
# and if any conversions are needed
warren_df.dtypes

Race                  object
Age                    int64
Gender                object
Education Level       object
Employment Status     object
Income Level         float64
Prior Convictions      int64
Risk Score           float64
Judge Decision         int64
Re-offense             int64
dtype: object

In [6]:
# Display the summary statistics of the dataframe
# This will provide insights into the distribution of numerical columns and help identify any potential outliers
warren_df.describe()

Unnamed: 0,Age,Income Level,Prior Convictions,Risk Score,Judge Decision,Re-offense
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,46.747,43825.011091,1.76,5.294246,0.534,0.323
std,16.410432,32327.559434,1.392242,2.037417,0.499092,0.467857
min,18.0,14250.017452,0.0,0.908174,0.0,0.0
25%,32.0,15392.412137,1.0,3.890328,0.0,0.0
50%,47.0,34631.999183,2.0,5.261343,1.0,0.0
75%,60.0,72708.753616,3.0,6.583151,1.0,1.0
max,75.0,131060.730035,8.0,10.993274,1.0,1.0


In [7]:
# Display the number of missing values in each column
# This will help us identify if any data cleaning is needed before analysis 
# No missing values identified
warren_df.isnull().sum()

Race                 0
Age                  0
Gender               0
Education Level      0
Employment Status    0
Income Level         0
Prior Convictions    0
Risk Score           0
Judge Decision       0
Re-offense           0
dtype: int64

In [8]:
# Risk Score Distribution

fig = px.histogram(
    warren_df, 
    x='Risk Score', 
    color_discrete_sequence=['#3498db'],
    title='Distribution of Risk Scores'
)

fig.update_layout(
    title='Distribution of Risk Scores',
    xaxis_title='Risk Score',
    yaxis_title='Count',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [9]:
# Binning Risk Score
bins = [0, 2, 5, 8, 11]
labels = ['0-2', '3-5', '6-8', '9-11']
warren_df['Risk Score Bin'] = pd.cut(warren_df['Risk Score'], bins=bins, labels=labels, right=True)
warren_df.head()

Unnamed: 0,Race,Age,Gender,Education Level,Employment Status,Income Level,Prior Convictions,Risk Score,Judge Decision,Re-offense,Risk Score Bin
0,White,18,Male,High School,Unemployed,34165.969893,1,3.899632,1,0,3-5
1,Black,37,Male,High School,Employed,34114.425797,2,7.631,0,0,6-8
2,White,49,Male,Less than High School,Unemployed,36421.891032,0,6.278393,0,0,6-8
3,White,33,Female,Less than High School,Employed,121869.327498,1,1.019732,1,0,0-2
4,Black,19,Male,Bachelor's Degree,Employed,73289.622946,1,5.587222,0,0,6-8


#### Demographic Profiles

In [10]:
# Distribution of the Race Column
race_counts = warren_df['Race'].value_counts()

fig = px.bar(
    x=race_counts.index,
    y=race_counts.values,
    labels={'x': 'Race', 'y': 'Count'},
    title='Distribution of Race',
    color=race_counts.index,
    color_discrete_sequence=px.colors.sequential.thermal,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Race',
    yaxis_title='Count',
    template='seaborn',
    width=900,
    height=400
)

fig.show()

In [11]:
# Distribution of the Gender Level Column
gender_counts = warren_df['Gender'].value_counts()

fig = px.bar(
    x=gender_counts.index,
    y=gender_counts.values,
    labels={'x': 'Gender', 'y': 'Count'},
    title='Distribution of Gender',
    color=gender_counts.index,
    color_discrete_sequence=px.colors.sequential.Sunsetdark,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Gender',
    yaxis_title='Count',
    template='seaborn',
    width=900,
    height=400
)

fig.show()

In [12]:
# Distribution of the Education Level Column
education_counts = warren_df['Education Level'].value_counts()

fig = px.bar(
    x=education_counts.index,
    y=education_counts.values,
    labels={'x': 'Education Level', 'y': 'Count'},
    title='Distribution of Education Level',
    color=education_counts.index,
    color_discrete_sequence=px.colors.sequential.Viridis,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Education Level',
    yaxis_title='Count',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [13]:
# Distribution of the Age Column

fig = px.histogram(warren_df, 
                   x='Age', 
                   nbins=7, 
                   color_discrete_sequence=['#2B3A67'],
                   text_auto=True
                   )

fig.update_layout(
    title='Distribution of Age',
    xaxis_title='Age',
    yaxis_title='Frequency',
    bargap=0.05,  
    template='plotly_white', 
    width=900,  
    height=400,
)

fig.show()

In [14]:
# Distribution of the Employment Status Column
employment_counts = warren_df['Employment Status'].value_counts()

fig = px.bar(
    x=employment_counts.index,
    y=employment_counts.values,
    labels={'x': 'Employment Status', 'y': 'Count'},
    title='Distribution of Employment Status',
    color=employment_counts.index,
    color_discrete_sequence=px.colors.sequential.Mint,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Employment Status',
    yaxis_title='Count',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [15]:
# Distribution of the Income Level Column
income_counts = warren_df['Income Level'].value_counts()

fig = px.histogram(
    warren_df, 
    x='Income Level', 
    nbins=10, 
    color_discrete_sequence=px.colors.qualitative.Set3,
    text_auto=True
)

fig.update_layout(
    title='Distribution of Income Level',
    xaxis_title='Income Level',
    yaxis_title='Frequency',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

#### Risk Scores across Demographic Groups

In [16]:
# Average of risk scores by Race
avg_risk_score_race = warren_df.groupby('Race')['Risk Score'].mean()
avg_risk_score_race

Race
Black    5.795578
Other    4.656769
White    4.876668
Name: Risk Score, dtype: float64

In [17]:
# Visualizing risk scores by Race
fig = px.bar(
    x=avg_risk_score_race.index,
    y=avg_risk_score_race.values,
    labels={'x': 'Race', 'y': 'Average Risk Score'},
    title='Average Risk Score by Race',
    color=avg_risk_score_race.index,
    color_discrete_sequence=px.colors.sequential.Viridis,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Race',
    yaxis_title='Average Risk Score',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [18]:
# Average of risk scores by Gender
avg_risk_score_gender = warren_df.groupby('Gender')['Risk Score'].mean()
avg_risk_score_gender

Gender
Female    5.258912
Male      5.330150
Name: Risk Score, dtype: float64

In [19]:
# Visualizing risk scores by Gender
fig = px.bar(
    x=avg_risk_score_gender.index,
    y=avg_risk_score_gender.values,
    labels={'x': 'Gender', 'y': 'Average Risk Score'},
    title='Average Risk Score by Gender',
    color=avg_risk_score_gender.index,
    color_discrete_sequence=px.colors.sequential.Blugrn,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Gender',
    yaxis_title='Average Risk Score',
    template='plotly_white',
    width=900,
    height=400,
)

fig.show()

In [20]:
# Average of Risk Scores by Race and Gender
avg_risk_score_race_gender = warren_df.groupby(['Race', 'Gender'])['Risk Score'].mean()
avg_risk_score_race_gender

Race   Gender
Black  Female    5.740620
       Male      5.857595
Other  Female    4.824229
       Male      4.532994
White  Female    4.797872
       Male      4.951137
Name: Risk Score, dtype: float64

In [21]:
# Visualizing Risk Scores by Race and Gender
fig = px.bar(
    avg_risk_score_race_gender.reset_index(),
    x='Race',
    y='Risk Score',
    color='Gender',
    barmode='group',
    labels={'Risk Score': 'Average Risk Score'},
    title='Average Risk Score by Race and Gender',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Race',
    yaxis_title='Average Risk Score',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [22]:
# Average of Risk Scores by Education Level
avg_risk_score_education = warren_df.groupby('Education Level')['Risk Score'].mean()
avg_risk_score_education

Education Level
Bachelor's Degree        5.115802
High School              5.285864
Less than High School    5.504879
Master's Degree          5.127023
PhD                      4.624439
Some College             5.347367
Name: Risk Score, dtype: float64

In [23]:
# Visualizing Risk Scores by Education Level
fig = px.bar(
    x=avg_risk_score_education.index,
    y=avg_risk_score_education.values,
    labels={'x': 'Education Level', 'y': 'Average Risk Score'},
    title='Average Risk Score by Education Level',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Education Level',
    yaxis_title='Average Risk Score',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [24]:
# Average of Risk Scores by Employment Status
avg_risk_score_employment = warren_df.groupby('Employment Status')['Risk Score'].mean()
avg_risk_score_employment

Employment Status
Employed      5.278791
Unemployed    5.326938
Name: Risk Score, dtype: float64

In [25]:
# Visualizing Risk Scores by Employment Status
fig = px.bar(
    x=avg_risk_score_employment.index,
    y=avg_risk_score_employment.values,
    labels={'x': 'Employment Status', 'y': 'Average Risk Score'},
    title='Average Risk Score by Employment Status',
    color_discrete_sequence=px.colors.qualitative.Bold,
    text_auto=True
)

fig.update_layout(
    xaxis_title='Employment Status',
    yaxis_title='Average Risk Score',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

#### Comparative Analysis of Judges’ Bail Decisions to AI Risk Scores

In [26]:
warren_df['Judge Decision'].value_counts()

Judge Decision
1    534
0    466
Name: count, dtype: int64

In [27]:
# Cross-tabulate Risk Score Bin and Judge Decision segmented by Race
cross_tab_race = warren_df.groupby(['Race', 'Risk Score Bin', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_race['Judge Decision'] = cross_tab_race['Judge Decision'].astype(str)

# Plot stacked bar chart segmented by Race with separate Y-axis
fig = px.bar(
    cross_tab_race, 
    x="Risk Score Bin", 
    y="Count", 
    color="Judge Decision",  # Stack by judge decision
    facet_col="Race",  # Separate plots for each race group
    title="Cross-tabulate Risk Scores and Judges' Decisions by Race", 
    labels={'Risk Score Bin': 'Risk Score Range', 'Count': 'Frequency'},
    text_auto=True,
    template='plotly_white',
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

# Update layout for separate Y-axis per facet
fig.update_yaxes(matches=None)  # Ensures each facet has its own Y-scale
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))

fig.show()





In [28]:
# Cross-tabulate Risk Score Bin and Judge Decision segmented by Gender
cross_tab_gender = warren_df.groupby(['Gender', 'Risk Score Bin', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_gender['Judge Decision'] = cross_tab_gender['Judge Decision'].astype(str)

# Plot stacked bar chart segmented by Gender with separate Y-axis
fig = px.bar(
    cross_tab_gender, 
    x="Risk Score Bin", 
    y="Count", 
    color="Judge Decision",  # Stack by judge decision
    facet_col="Gender",  # Separate plots for each gender group
    title="Cross-tabulate Risk Scores and Judges' Decisions by Gender", 
    labels={'Risk Score Bin': 'Risk Score Range', 'Count': 'Frequency'},
    text_auto=True,
    template='plotly_white',
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

# Update layout for separate Y-axis per facet
fig.update_yaxes(matches=None)  # Ensures each facet has its own Y-scale
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))

fig.show()





In [29]:
# Cross-tabulate Risk Score Bin and Judge Decision segmented by Education Level
cross_tab_education = warren_df.groupby(['Education Level', 'Risk Score Bin', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_education['Judge Decision'] = cross_tab_education['Judge Decision'].astype(str)

# Plot stacked bar chart segmented by Education Level with two rows
fig = px.bar(
    cross_tab_education, 
    x="Risk Score Bin", 
    y="Count", 
    color="Judge Decision",  # Stack by judge decision
    facet_col="Education Level",  # Separate plots for each education level
    facet_col_wrap=3,  # Wrap into two rows
    title="Cross-tabulate Risk Scores and Judges' Decisions by Education Level", 
    labels={'Risk Score Bin': 'Risk Score Range', 'Count': 'Frequency'},
    text_auto=True,
    template='plotly_white',
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

fig.update_layout(width=1200, height=800)

# Update layout for separate Y-axis per facet
fig.update_yaxes(matches=None)  # Ensures each facet has its own Y-scale
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))

fig.show()





In [30]:
# Cross-tabulate Risk Score Bin and Judge Decision segmented by Employment Status
cross_tab_employment = warren_df.groupby(['Employment Status', 'Risk Score Bin', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_employment['Judge Decision'] = cross_tab_employment['Judge Decision'].astype(str)

# Plot stacked bar chart segmented by Employment Status with two rows
fig = px.bar(
    cross_tab_employment, 
    x="Risk Score Bin", 
    y="Count", 
    color="Judge Decision",  # Stack by judge decision
    facet_col="Employment Status",  # Separate plots for each employment status
    facet_col_wrap=3,  # Wrap into two rows
    title="Cross-tabulate Risk Scores and Judges' Decisions by Employment Status", 
    labels={'Risk Score Bin': 'Risk Score Range', 'Count': 'Frequency'},
    text_auto=True,
    template='plotly_white',
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

# Update layout for separate Y-axis per facet
fig.update_yaxes(matches=None)  # Ensures each facet has its own Y-scale

fig.show()





#### Re-offense Rates and Fairness Metrics

In [31]:
# Analyzing Re-offense Rates by Race and Judge Decision
cross_tab_race_decision = warren_df.groupby(['Race', 'Re-offense', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_race_decision['Re-offense'] = cross_tab_race_decision['Re-offense'].astype(str)
cross_tab_race_decision['Judge Decision'] = cross_tab_race_decision['Judge Decision'].astype(str)

fig = px.bar(
    cross_tab_race_decision, 
    x='Race', 
    y='Count', 
    color='Re-offense', 
    barmode='group', 
    facet_col='Judge Decision', 
    labels={'Count': 'Frequency', 'Re-offense': 'Re-offense Status'}, 
    title='Re-offense Rates by Race and Judge Decision',
    text_auto=True,
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

fig.update_layout(
    xaxis_title='Race',
    yaxis_title='Frequency',
    template='plotly_white',
    width=1200,
    height=400
)

fig.show()

In [32]:
# FPR, FNR for each racial group
# Calculate FPR and FNR for each racial group
fpr_fnr_by_race = []

for race in warren_df['Race'].unique():
    group = warren_df[warren_df['Race'] == race]
    
    # False Positive Rate (FPR): Predict 1 (Judge Decision) but Actual 0 (Re-offense)
    fp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 1)])
    tn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 1)])
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    # False Negative Rate (FNR): Predict 0 (Judge Decision) but Actual 1 (Re-offense)
    fn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 0)])
    tp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 0)])
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
    fpr_fnr_by_race.append({'Race': race, 'FPR': fpr, 'FNR': fnr})

fpr_fnr_by_race = pd.DataFrame(fpr_fnr_by_race)
fpr_fnr_by_race

Unnamed: 0,Race,FPR,FNR
0,White,0.658683,0.264438
1,Black,0.256757,0.626582
2,Other,0.5,0.3125


In [33]:
# Visualizing fpr_fnr_by_race
fig = px.bar(
    fpr_fnr_by_race.melt(id_vars='Race', var_name='Metric', value_name='Rate'),
    x='Race',
    y='Rate',
    color='Metric',
    barmode='group',
    title='False Positive Rate (FPR) and False Negative Rate (FNR) by Race',
    labels={'Rate': 'Rate', 'Race': 'Race', 'Metric': 'Metric'},
    text_auto=True,
    color_discrete_map={'FPR': '#6ed5ee', 'FNR': '#836eee'}
)

fig.update_layout(
    xaxis_title='Race',
    yaxis_title='Rate',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [34]:
# Analyzing Re-offense Rates by Gender and Judge Decision 
cross_tab_gender_decision = warren_df.groupby(['Gender', 'Re-offense', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_gender_decision['Re-offense'] = cross_tab_gender_decision['Re-offense'].astype(str)
cross_tab_gender_decision['Judge Decision'] = cross_tab_gender_decision['Judge Decision'].astype(str)

fig = px.bar(
    cross_tab_gender_decision, 
    x='Gender', 
    y='Count', 
    color='Re-offense', 
    barmode='group', 
    facet_col='Judge Decision', 
    labels={'Count': 'Frequency', 'Re-offense': 'Re-offense Status'}, 
    title='Re-offense Rates by Gender and Judge Decision',
    text_auto=True,
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

fig.update_layout(
    xaxis_title='Gender',
    yaxis_title='Frequency',
    template='plotly_white',
    width=1200,
    height=400
)

fig.show()

In [35]:
# FPR, FNR for each gender group
# Calculate FPR and FNR for each gender group
fpr_fnr_by_gender = []

for gender in warren_df['Gender'].unique():
    group = warren_df[warren_df['Gender'] == gender]
    
    # False Positive Rate (FPR): Predict 1 (Judge Decision) but Actual 0 (Re-offense)
    fp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 1)])
    tn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 1)])
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    # False Negative Rate (FNR): Predict 0 (Judge Decision) but Actual 1 (Re-offense)
    fn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 0)])
    tp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 0)])
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
    fpr_fnr_by_gender.append({'Gender': gender, 'FPR': fpr, 'FNR': fnr})

fpr_fnr_by_gender = pd.DataFrame(fpr_fnr_by_gender)
fpr_fnr_by_gender

Unnamed: 0,Gender,FPR,FNR
0,Male,0.464052,0.437318
1,Female,0.476471,0.434132


In [36]:
# Visualizing fpr_fnr_by_gender
fig = px.bar(
    fpr_fnr_by_gender.melt(id_vars='Gender', var_name='Metric', value_name='Rate'),
    x='Gender',
    y='Rate',
    color='Metric',
    barmode='group',
    title='False Positive Rate (FPR) and False Negative Rate (FNR) by Gender',
    labels={'Rate': 'Rate', 'Race': 'Race', 'Metric': 'Metric'},
    text_auto=True,
    color_discrete_map={'FPR': '#6ed5ee', 'FNR': '#836eee'}
)

fig.update_layout(
    xaxis_title='Gender',
    yaxis_title='Rate',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [37]:
# Analyzing Re-offense Rates by Education Level and Judge Decision
cross_tab_education_decision = warren_df.groupby(['Education Level', 'Re-offense', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_education_decision['Re-offense'] = cross_tab_education_decision['Re-offense'].astype(str)
cross_tab_education_decision['Judge Decision'] = cross_tab_education_decision['Judge Decision'].astype(str)

fig = px.bar(
    cross_tab_education_decision, 
    x='Education Level', 
    y='Count', 
    color='Re-offense', 
    barmode='group', 
    facet_col='Judge Decision', 
    labels={'Count': 'Frequency', 'Re-offense': 'Re-offense Status'}, 
    title='Re-offense Rates by Education Level and Judge Decision',
    text_auto=True,
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

fig.update_layout(
    xaxis_title='Education Level',
    yaxis_title='Frequency',
    template='plotly_white',
    width=1200,
    height=400
)

fig.show()

In [38]:
# FPR, FNR for each education level group

fpr_fnr_by_education = []

for education in warren_df['Education Level'].unique():
    group = warren_df[warren_df['Education Level'] == education]
    
   # False Positive Rate (FPR): Predict 1 (Judge Decision) but Actual 0 (Re-offense)
    fp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 1)])
    tn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 1)])
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    # False Negative Rate (FNR): Predict 0 (Judge Decision) but Actual 1 (Re-offense)
    fn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 0)])
    tp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 0)])
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
    fpr_fnr_by_education.append({'Education Level': education, 'FPR': fpr, 'FNR': fnr})

fpr_fnr_by_education = pd.DataFrame(fpr_fnr_by_education)
fpr_fnr_by_education

Unnamed: 0,Education Level,FPR,FNR
0,High School,0.433962,0.401042
1,Less than High School,0.439394,0.440559
2,Bachelor's Degree,0.526316,0.438095
3,Some College,0.534091,0.487654
4,Master's Degree,0.411765,0.396552
5,PhD,0.375,0.411765


In [39]:
# Visualizing fpr_fnr_by_education

fig = px.bar(
    fpr_fnr_by_education.melt(id_vars='Education Level', var_name='Metric', value_name='Rate'),
    x='Education Level',
    y='Rate',
    color='Metric',
    barmode='group',
    title='False Positive Rate (FPR) and False Negative Rate (FNR) by Education Level',
    labels={'Rate': 'Rate', 'Education Level': 'Education Level', 'Metric': 'Metric'},
    text_auto=True,
    color_discrete_map={'FPR': '#6ed5ee', 'FNR': '#836eee'}
)

fig.update_layout(
    xaxis_title='Education Level',
    yaxis_title='Rate',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()

In [40]:
# Analyzing Re-offense Rates by Employment Status and Judge Decision
cross_tab_employment_decision = warren_df.groupby(['Employment Status', 'Re-offense', 'Judge Decision']).size().reset_index(name='Count')

cross_tab_employment_decision['Re-offense'] = cross_tab_employment_decision['Re-offense'].astype(str)
cross_tab_employment_decision['Judge Decision'] = cross_tab_employment_decision['Judge Decision'].astype(str)

fig = px.bar(
    cross_tab_employment_decision, 
    x='Employment Status', 
    y='Count', 
    color='Re-offense', 
    barmode='group', 
    facet_col='Judge Decision', 
    labels={'Count': 'Frequency', 'Re-offense': 'Re-offense Status'}, 
    title='Re-offense Rates by Employment Status and Judge Decision',
    text_auto=True,
    color_discrete_map={'0': '#ee836e', '1': '#9fe598'}
)

fig.update_layout(
    xaxis_title='Employment Status',
    yaxis_title='Frequency',
    template='plotly_white',
    width=1200,
    height=400
)

fig.show()

In [41]:
# FPR, FNR for each employment status group
fpr_fnr_by_employment = []

for employment in warren_df['Employment Status'].unique():
    group = warren_df[warren_df['Employment Status'] == employment]
    
    # False Positive Rate (FPR): Predict 1 (Judge Decision) but Actual 0 (Re-offense)
    fp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 1)])
    tn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 1)])
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    # False Negative Rate (FNR): Predict 0 (Judge Decision) but Actual 1 (Re-offense)
    fn = len(group[(group['Judge Decision'] == 0) & (group['Re-offense'] == 0)])
    tp = len(group[(group['Judge Decision'] == 1) & (group['Re-offense'] == 0)])
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
    fpr_fnr_by_employment.append({'Employment Status': employment, 'FPR': fpr, 'FNR': fnr})

fpr_fnr_by_employment = pd.DataFrame(fpr_fnr_by_employment)
fpr_fnr_by_employment


Unnamed: 0,Employment Status,FPR,FNR
0,Unemployed,0.46087,0.441748
1,Employed,0.475962,0.433121


In [42]:
# Visualizing fpr_fnr_by_employment

fig = px.bar(
    fpr_fnr_by_employment.melt(id_vars='Employment Status', var_name='Metric', value_name='Rate'),
    x='Employment Status',
    y='Rate',
    color='Metric',
    barmode='group',
    title='False Positive Rate (FPR) and False Negative Rate (FNR) by Employment Status',
    labels={'Rate': 'Rate', 'Employment Status': 'Employment Status', 'Metric': 'Metric'},
    text_auto=True,
    color_discrete_map={'FPR': '#6ed5ee', 'FNR': '#836eee'}
)

fig.update_layout(
    xaxis_title='Employment Status',
    yaxis_title='Rate',
    template='plotly_white',
    width=900,
    height=400
)

fig.show()