# TCJS data

In [1]:
from pathlib import Path, PosixPath

import numpy as np
import pandas as pd

# 
from county_mapping import MAPPING

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 30)

In [2]:
# manually create header since it was excluded from the Tabula parsing (because it didn't work)
HEADER = ['county', 'pretrial_felons', 'convicted_felons', 'convicted_felons_sentenced_to_county_jail_time',
          'parole_violators', 'parole_violators_with_new_charge', 'pretrial_misdemeanor', 'convicted_misdemeanor',
          'bench_warrants', 'federal', 'pretrial_state_jail_felony', 'convicted_sjf_sentenced_to_county_jail_time',
          'convicted_sjf_sentenced_to_state_jail_time', 'total_others', 'total_local', 'total_contract', 'total_pop',
          'total_capacity', 'pct_of_capacity', 'avail_beds']

DATA = Path.cwd() / 'csvs'
URL = 'https://www.tcjs.state.tx.us/wp-content/uploads/{year}/{month}/AbbreRptCurrent.pdf'

# `inmate_population_snapshots` table cols
TABLE_COLS = ['id', 'snapshot_date', 'total', 'total_off_site', 'male', 'female', 'other_gender', 'white', 'black', 'hispanic',
              'asian', 'american_indian', 'mexican_american', 'multi_racial', 'other_race', 'on_probation', 'on_parole',
              'felony', 'misdemeanor', 'other_offense', 'convicted_or_sentenced', 'detained_or_awaiting_trial',
              'first_time_incarcerated', 'employed', 'unemployed', 'citizen', 'noncitizen', 'juvenile', 'juvenile_male',
              'juvenile_female', 'death_row_condemned', 'solitary_confinement', 'technical_parole_violators', 'source_url',
              'source_url_2', 'civil_offense', 'federal_offense']

In [3]:
dfs = []
for f in DATA.glob('*.csv'):
    year_month = f.name.replace('tabula-', '').replace('.csv', '')
    year, month = year_month.split('-')
    source = URL.format(year=year, month=month)

    t = pd.read_csv(f, names=HEADER, on_bad_lines='warn')
    t['snapshot_date'] = f'{year_month}-01'
    t['source_url_2'] = source
    dfs.append(t)

df = pd.concat(dfs).sort_values(['snapshot_date', 'county'])
print(f'total rows: {len(df):,}')

df['id'] = df['county'].apply(lambda s: MAPPING.get(s, np.nan))
df['source_url'] = 'https://www.tcjs.state.tx.us/population-reports/'

# only include rows with existing jail ids
print(f'removed {df["id"].isnull().sum():,} rows with no id')
df = df[df["id"].notnull()]

# filter out jails with no data across all rows
total_grouped = df.groupby('id')['total_pop'].sum()
empty_ids = total_grouped[total_grouped == 0].index.tolist()
print(f'jails with no data: {len(empty_ids):,}')

df = df[~(df['id'].isin(empty_ids))]
print(f'remaining rows: {len(df):,}')

total rows: 13,658
removed 4,988 rows with no id
jails with no data: 1
remaining rows: 8,619


### check totals

There are two counties where local + contract does not equal the total population. They could be typos or incorrectly reported data. Otherwise these equal up for all other rows.

In [4]:
# check totals
totals_not_equal = (df[['total_local', 'total_contract']].sum(axis=1) - df['total_pop']) != 0
df[totals_not_equal][['id', 'county', 'total_others', 'total_local', 'total_contract', 'total_pop', 'source_url_2']]

Unnamed: 0,id,county,total_others,total_local,total_contract,total_pop,source_url_2
198,9232,Polk,8,186,26,211,https://www.tcjs.state.tx.us/wp-content/uploads/2019/08/AbbreRptCurrent.pdf
198,9232,Polk,1,186,23,211,https://www.tcjs.state.tx.us/wp-content/uploads/2019/09/AbbreRptCurrent.pdf
198,9232,Polk,2,153,44,211,https://www.tcjs.state.tx.us/wp-content/uploads/2019/10/AbbreRptCurrent.pdf
198,9232,Polk,2,169,37,211,https://www.tcjs.state.tx.us/wp-content/uploads/2019/11/AbbreRptCurrent.pdf
198,9232,Polk,1,150,36,211,https://www.tcjs.state.tx.us/wp-content/uploads/2019/12/AbbreRptCurrent.pdf
12,9064,Bee,0,52,26,68,https://www.tcjs.state.tx.us/wp-content/uploads/2020/04/AbbreRptCurrent.pdf


## rename cols

In [5]:
convert_cols = {
  'total_pop': 'total',
  'total_contract': 'total_off_site',
  'convicted_felons': 'felony',
  'convicted_misdemeanor': 'misdemeanor',
  'federal': 'federal_offense',
}

df = df.rename(columns=convert_cols)

df['technical_parole_violators'] = df['parole_violators'] + df['parole_violators_with_new_charge']

convicted_cols = [c for c in df.columns if c.startswith('convicted')]
df['convicted_or_sentenced'] = df[convicted_cols].sum(axis=1)

pretrial_cols = [c for c in df.columns if c.startswith('pretrial')]
df['detained_or_awaiting_trial'] = df[pretrial_cols].sum(axis=1)

output_cols = set(TABLE_COLS) & set(df.columns)
output_cols_sorted = [c for c in TABLE_COLS if c in output_cols]

df = df[output_cols_sorted].sort_values(['snapshot_date', 'id']).reset_index().drop('index', axis=1)

df.head(2)

Unnamed: 0,id,snapshot_date,total,total_off_site,felony,misdemeanor,convicted_or_sentenced,detained_or_awaiting_trial,technical_parole_violators,source_url,source_url_2,federal_offense
0,10302,2018-01-01,33,9,0,0,0,23,0,https://www.tcjs.state.tx.us/population-reports/,https://www.tcjs.state.tx.us/wp-content/uploads/2018/01/AbbreRptCurrent.pdf,0
1,9054,2018-01-01,30,0,9,0,2,28,5,https://www.tcjs.state.tx.us/population-reports/,https://www.tcjs.state.tx.us/wp-content/uploads/2018/01/AbbreRptCurrent.pdf,0


In [6]:
print(f"Snapshot data from {df['snapshot_date'].min()} to {df['snapshot_date'].max()}")

Snapshot data from 2018-01-01 to 2022-04-01


### check totals

Not sure if these are ever going to add up. Some of the small totals line up exactly but it is hit or miss. Maybe from double counting across categories. As someone not familiar with all the categories I'm not sure how to interpret the [form the jails use](https://www.tcjs.state.tx.us/wp-content/uploads/2019/10/Jail-Population-report-Printable.pdf) to categorize inmates.

In [7]:
sum_cols = ['felony', 'misdemeanor', 'convicted_or_sentenced', 'detained_or_awaiting_trial', 'technical_parole_violators', 'federal_offense']

equal = (df[sum_cols].sum(axis=1) == df[['total', 'total_off_site']].sum(axis=1))
less_than = (df[sum_cols].sum(axis=1) < df[['total', 'total_off_site']].sum(axis=1))
greater_than = (df[sum_cols].sum(axis=1) > df[['total', 'total_off_site']].sum(axis=1))

print(f'total rows: {len(df):,}')
print(f'total = sum cols: {equal.sum():,}')
print(f'total < sum cols: {less_than.sum():,}')
print(f'total > sum cols: {greater_than.sum():,}')

total rows: 8,619
total = sum cols: 1,059
total < sum cols: 5,887
total > sum cols: 1,673


## export

In [8]:
df.to_csv('inmate_population_snapshots.csv', index=False)