# Links to datasets:
[1] COMPAS data (compas_file_path) - https://github.com/propublica/compas-analysis/blob/master/compas-scores-two-years.csv
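[2] Demographic data (census_file_path) - FL_census.csv (source link missing)
[3] Poverty data (poverty_file_path) - PovertyUSA.csv (source link missing)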



In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Load the COMPAS dataset
compas_file_path = 'compas-scores-two-years.csv'

# Read the COMPAS CSV file into a DataFrame. A missing file is reported and
# recorded as None so the rest of the cell can skip the analysis.
try:
    compas_data = pd.read_csv(compas_file_path)
    print("COMPAS CSV file loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file '{compas_file_path}' was not found.")
    compas_data = None

# Start from None so the name always exists. It used to be assigned only when
# the COMPAS file had loaded, which made the "Process the data" guard below
# raise NameError whenever the file was missing.
extracted_data_race = None
if compas_data is not None:
    # Extract the specified columns; on a missing column the None default
    # is kept and the error is reported.
    try:
        extracted_data_race = compas_data[['race', 'two_year_recid', 'v_decile_score']]
        print("Columns extracted successfully.")
    except KeyError as e:
        print(f"Error: One or more columns are missing in the dataset - {e}")

# Load the Florida Census dataset
census_file_path = 'FL_census.csv'

# Read the census data
try:
    census_data = pd.read_csv(census_file_path)
    print("Census CSV file loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file '{census_file_path}' was not found.")
    census_data = None


def _plot_counts_by_race(categories, counts, label, title, ylabel, **line_kwargs):
    """Show one connected scatterplot of per-race counts with value labels.

    Args:
        categories: Race labels used as the x values, in plotting order.
        counts: One value per entry of ``categories``, in the same order.
        label: Legend entry for the line.
        title: Figure title.
        ylabel: Y-axis label.
        **line_kwargs: Extra keyword arguments forwarded to ``plt.plot``
            (e.g. ``color``, ``linestyle``).
    """
    plt.figure(figsize=(10, 6))
    plt.plot(categories, counts, marker='o', label=label, **line_kwargs)
    for i, count in enumerate(counts):
        # Anchor the bottom of the text 20 data units above the point.
        plt.text(i, count + 20, str(count), ha='center', va='bottom', fontsize=9)
    plt.title(title)
    plt.xlabel('Race')
    plt.ylabel(ylabel)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend()
    plt.show()


# Process the data only when both inputs are available.
if extracted_data_race is not None and census_data is not None:
    # Filter for individuals with nonzero 'two_year_recid'
    recidivated_data = extracted_data_race[extracted_data_race['two_year_recid'] != 0]

    # Group COMPAS data by race and count recidivated entries
    recidivated_counts = recidivated_data.groupby('race').size()

    # Group Census data by race and get population
    census_population = census_data.groupby('Race')['Population'].sum()

    # Calculate total number of entries for each race
    total_entries_per_race = extracted_data_race.groupby('race').size()

    # Filter for v_decile_score between 8 and 10 (inclusive) and group by race
    high_decile_scores = extracted_data_race[
        (extracted_data_race['v_decile_score'] >= 8) & (extracted_data_race['v_decile_score'] <= 10)
    ].groupby('race').size()

    # Ensure consistent race labels: every series is put on the same ordered
    # index, and a label absent from a series is filled with 0.
    races = ['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other']
    recidivated_counts = recidivated_counts.reindex(races, fill_value=0)
    census_population = census_population.reindex(races, fill_value=0)
    total_entries_per_race = total_entries_per_race.reindex(races, fill_value=0)
    high_decile_scores = high_decile_scores.reindex(races, fill_value=0)

    # Save total counts
    total_recidivated_counts = recidivated_counts.sum()
    total_population_counts = census_population.sum()
    total_entries_count = total_entries_per_race.sum()
    total_high_decile_scores = high_decile_scores.sum()

    # One figure per series, in the same order as before.
    _plot_counts_by_race(
        races, high_decile_scores,
        label='High v_decile_scores (8-10)',
        title='High v_decile_scores (8-10) by Race',
        ylabel='Number of Entries',
        color='purple',
    )
    _plot_counts_by_race(
        races, total_entries_per_race,
        label='Total Entries',
        title='Total Entries by Race',
        ylabel='Number of Entries',
        color='green',
    )
    _plot_counts_by_race(
        races, recidivated_counts,
        label='Recidivated Population',
        title='Recidivated Population by Race',
        ylabel='Number of Recidivated Individuals',
        color='red',
    )
    _plot_counts_by_race(
        races, census_population,
        label='General Population',
        title='General Population by Race',
        ylabel='Number of Individuals',
        linestyle='--',
        color='blue',
    )

    # Print the total counts
    print("\nTotal number of entries (recidivated or not) by race:")
    for race, count in total_entries_per_race.items():
        print(f"{race}: {count}")

    print("\nTotal number of high v_decile_scores (8-10) by race:")
    for race, count in high_decile_scores.items():
        print(f"{race}: {count}")

    print(f"\nTotal recidivated individuals: {total_recidivated_counts}")
    print(f"Total general population: {total_population_counts}")
    print(f"Total entries in the dataset: {total_entries_count}")
    print(f"Total high v_decile_scores (8-10): {total_high_decile_scores}")


Error: The file 'compas-scores-two-years.csv' was not found.
Error: The file 'FL_census.csv' was not found.


NameError: name 'extracted_data_race' is not defined

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# COMPAS scores: a missing file is reported and leaves compas_data as None.
compas_file_path = 'compas-scores-two-years.csv'
try:
    compas_data = pd.read_csv(compas_file_path)
except FileNotFoundError:
    compas_data = None
    print(f"Error: The file '{compas_file_path}' was not found.")
else:
    print("COMPAS CSV file loaded successfully.")

# PovertyUSA figures, handled the same way.
poverty_file_path = 'PovertyUSA.csv'
try:
    poverty_data = pd.read_csv(poverty_file_path)
except FileNotFoundError:
    poverty_data = None
    print(f"Error: The file '{poverty_file_path}' was not found.")
else:
    print("PovertyUSA CSV file loaded successfully.")

# Florida census figures, handled the same way.
census_file_path = 'FL_census.csv'
try:
    census_data = pd.read_csv(census_file_path)
except FileNotFoundError:
    census_data = None
    print(f"Error: The file '{census_file_path}' was not found.")
else:
    print("Census CSV file loaded successfully.")

# The analysis needs all three datasets.
all_loaded = not (compas_data is None or poverty_data is None or census_data is None)

if all_loaded:
    # Keep only the COMPAS columns the analysis reads.
    try:
        extracted_data_race = compas_data[['race', 'two_year_recid', 'v_decile_score']]
    except KeyError as e:
        extracted_data_race = None
        print(f"Error: Missing column(s) in COMPAS dataset - {e}")

if all_loaded and extracted_data_race is not None:
    # Rows with a nonzero 'two_year_recid', counted per race.
    recidivated_data = extracted_data_race.loc[extracted_data_race['two_year_recid'].ne(0)]
    recidivated_counts = recidivated_data.groupby('race').size()

    # Census population summed per race.
    census_population = census_data.groupby('Race')['Population'].sum()

    # Every COMPAS row, counted per race.
    total_entries_per_race = extracted_data_race.groupby('race').size()

    # Rows whose v_decile_score lies in [8, 10], counted per race.
    in_top_deciles = extracted_data_race['v_decile_score'].between(8, 10)
    high_decile_scores = extracted_data_race.loc[in_top_deciles].groupby('race').size()

    # Poverty population summed per race.
    poverty_population = poverty_data.groupby('race')['population'].sum()

    # Put every series on the same ordered race index; absent labels become 0.
    races = ['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other']
    (recidivated_counts, census_population, total_entries_per_race,
     high_decile_scores, poverty_population) = (
        per_race.reindex(races, fill_value=0)
        for per_race in (recidivated_counts, census_population, total_entries_per_race,
                         high_decile_scores, poverty_population)
    )

    # Grand totals across all races.
    total_recidivated_counts = recidivated_counts.sum()
    total_population_counts = census_population.sum()
    total_entries_count = total_entries_per_race.sum()
    total_high_decile_scores = high_decile_scores.sum()
    total_poverty_population = poverty_population.sum()

    # One connected scatterplot per series:
    # (values, legend label, title, y-axis label, extra plt.plot keywords).
    plot_specs = [
        (poverty_population, 'Poverty Population',
         'Poverty Population by Race', 'Number of Individuals',
         {'color': 'orange'}),
        (high_decile_scores, 'High v_decile_scores (8-10)',
         'High v_decile_scores (8-10) by Race', 'Number of Entries',
         {'color': 'purple'}),
        (total_entries_per_race, 'Total Entries',
         'Total Entries by Race', 'Number of Entries',
         {'color': 'green'}),
        (recidivated_counts, 'Recidivated Population',
         'Recidivated Population by Race', 'Number of Recidivated Individuals',
         {'color': 'red'}),
        (census_population, 'General Population',
         'General Population by Race', 'Number of Individuals',
         {'linestyle': '--', 'color': 'blue'}),
    ]
    for values, legend_label, figure_title, y_label, line_style in plot_specs:
        plt.figure(figsize=(10, 6))
        plt.plot(races, values, marker='o', label=legend_label, **line_style)
        for x_pos, value in enumerate(values):
            # Anchor the bottom of the text 20 data units above the point.
            plt.text(x_pos, value + 20, str(value), ha='center', va='bottom', fontsize=9)
        plt.title(figure_title)
        plt.xlabel('Race')
        plt.ylabel(y_label)
        plt.legend()
        plt.show()

    # Per-race listings, each under its own header line.
    per_race_reports = [
        ("\nTotal number of entries (recidivated or not) by race:", total_entries_per_race),
        ("\nTotal number of high v_decile_scores (8-10) by race:", high_decile_scores),
        ("\nTotal number of poverty population by race:", poverty_population),
    ]
    for header, values in per_race_reports:
        print(header)
        for race_label, value in zip(races, values):
            print(f"{race_label}: {value}")

    # Grand totals.
    print(f"\nTotal recidivated individuals: {total_recidivated_counts}")
    print(f"Total general population: {total_population_counts}")
    print(f"Total entries in the dataset: {total_entries_count}")
    print(f"Total high v_decile_scores (8-10): {total_high_decile_scores}")
    print(f"Total poverty population: {total_poverty_population}")


In [None]:
# To display general and poverty pop. data:

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# COMPAS scores: a missing file is reported and leaves compas_data as None.
compas_file_path = 'compas-scores-two-years.csv'
try:
    compas_data = pd.read_csv(compas_file_path)
except FileNotFoundError:
    compas_data = None
    print(f"Error: The file '{compas_file_path}' was not found.")
else:
    print("COMPAS CSV file loaded successfully.")

# PovertyUSA figures, handled the same way.
poverty_file_path = 'PovertyUSA.csv'
try:
    poverty_data = pd.read_csv(poverty_file_path)
except FileNotFoundError:
    poverty_data = None
    print(f"Error: The file '{poverty_file_path}' was not found.")
else:
    print("PovertyUSA CSV file loaded successfully.")

# Florida census figures, handled the same way.
census_file_path = 'FL_census.csv'
try:
    census_data = pd.read_csv(census_file_path)
except FileNotFoundError:
    census_data = None
    print(f"Error: The file '{census_file_path}' was not found.")
else:
    print("Census CSV file loaded successfully.")

# The comparison only runs when all three datasets loaded.
all_loaded = not (compas_data is None or poverty_data is None or census_data is None)

if all_loaded:
    # The COMPAS column check still gates the comparison below.
    try:
        extracted_data_race = compas_data[['race', 'two_year_recid', 'v_decile_score']]
    except KeyError as e:
        extracted_data_race = None
        print(f"Error: Missing column(s) in COMPAS dataset - {e}")

if all_loaded and extracted_data_race is not None:
    # Population summed per race for each source.
    census_population = census_data.groupby('Race')['Population'].sum()
    poverty_population = poverty_data.groupby('race')['population'].sum()

    # Put both series on the same ordered race index; absent labels become 0.
    races = ['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other']
    census_population, poverty_population = (
        per_race.reindex(races, fill_value=0)
        for per_race in (census_population, poverty_population)
    )

    # Grand totals across all races.
    total_population_counts = census_population.sum()
    total_poverty_population = poverty_population.sum()

    # Both series share one figure: poverty solid orange, census dashed blue.
    plt.figure(figsize=(10, 6))
    plt.plot(races, poverty_population, marker='o', label='Poverty Population', color='orange')
    plt.plot(races, census_population, marker='o', label='General Population', linestyle='--', color='blue')
    for x_pos, value_pair in enumerate(zip(poverty_population, census_population)):
        # Label the poverty point first, then the census point, each in its line colour.
        for value, text_color in zip(value_pair, ('orange', 'blue')):
            plt.text(x_pos, value + 20, str(value), ha='center', va='bottom', fontsize=9, color=text_color)
    plt.title('Poverty vs General Population by Race')
    plt.xlabel('Race')
    plt.ylabel('Number of Individuals')
    plt.legend()
    plt.show()

    # Grand totals first, then the per-race comparison.
    print(f"\nTotal general population: {total_population_counts}")
    print(f"Total poverty population: {total_poverty_population}")

    print("\nPoverty vs General Population by Race:")
    for race_label, in_poverty, residents in zip(races, poverty_population, census_population):
        print(f"{race_label}: Poverty Population = {in_poverty}, General Population = {residents}")