In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import re
# Let pandas print up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

In [2]:
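# Optional: restrict the analysis to high-security prisons only.
# NOTE: `df` is not created until the concat cell further down, so these lines
# only work if they are uncommented and run after that cell.
# high_security = ["CAC", "CCI", "COR", "LAC", "SAC", "HDSP", "KVSP", "PBSP", "SVSP", "SATF"]
# df_highsecurity = df.drop(df[~df['Institution'].isin(high_security)].index, axis=0)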


In [3]:
# Ordered categorical of month abbreviations, so that sorting on a Month
# column follows the calendar (Jan < Feb < ... < Dec) rather than the alphabet.
months_dtype = CategoricalDtype(
    categories=[
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    ],
    ordered=True,
)

In [4]:
# Load the 2009-2019 data. This file is in wide format (one column per month,
# Jan..Dec) and has no 'Month' column yet; that column is only created by the
# melt step below, so no Month dtype is requested at read time.
df_2009_2019 = pd.read_csv("finished-csvs/finished-2009-2019-everything.csv", encoding="utf-8")
df_2009_2019.head()

Unnamed: 0,Institution,Year,Top Category,Subcategory,Metric Title,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,CCWF,2009,Custody Operations,Total Bed Capacity,Contract Beds,0,0,0,0,0,0,0,0,0,0,0,0
1,CCWF,2009,Custody Operations,Total Bed Capacity,Design Beds,1916,1953,1945,1961,1969,1999,2007,1986,1966,2006,1969,1993
2,CCWF,2009,Custody Operations,Total Bed Capacity,Non-Traditional Beds,265,62,72,72,36,0,0,72,67,0,0,0
3,CCWF,2009,Custody Operations,Total Bed Capacity,Overcrowding Beds,1801,1886,1851,1884,1894,1897,1927,1919,1884,1926,1902,1920
4,CCWF,2009,Custody Operations,Total Bed Capacity,Temporary Beds (FamilyVisiting/Tank Beds),0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# 'Top Category' has no counterpart in the later datasets, so remove it
# here to keep the column sets aligned.
df_2009_2019_no_top = df_2009_2019.drop(columns='Top Category')
df_2009_2019_no_top

Unnamed: 0,Institution,Year,Subcategory,Metric Title,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,CCWF,2009,Total Bed Capacity,Contract Beds,0,0,0,0,0,0,0,0,0,0,0,0
1,CCWF,2009,Total Bed Capacity,Design Beds,1916,1953,1945,1961,1969,1999,2007,1986,1966,2006,1969,1993
2,CCWF,2009,Total Bed Capacity,Non-Traditional Beds,265,62,72,72,36,0,0,72,67,0,0,0
3,CCWF,2009,Total Bed Capacity,Overcrowding Beds,1801,1886,1851,1884,1894,1897,1927,1919,1884,1926,1902,1920
4,CCWF,2009,Total Bed Capacity,Temporary Beds (FamilyVisiting/Tank Beds),0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201282,SVSP,2019,Adverse Actions,Per 100 Staff,0.44,0.09,0.51,0.00,0.00,0.26,0.69,0.09,0.35,0.34,0.60,0.51
201283,SVSP,2019,Adverse Actions,Dismissals (Non-Medical),0,1,1,0,0,0,0,0,0,0,0,0
201284,SVSP,2019,Adverse Actions,Adverse Actions Total (Medical),2,1,1,1,0,0,0,0,0,1,0,0
201285,SVSP,2019,Adverse Actions,Per 100 Staff,0.17,0.09,0.09,0.08,0.00,0.00,0.00,0.00,0.00,0.09,0.00,0.00


In [6]:
# Reshape 2009-2019 from wide to long: every column that is not listed as an
# identifier becomes one (Month, Value) row per original record.
df_2009_2019_melted = df_2009_2019_no_top.melt(
    id_vars=['Institution', 'Subcategory', 'Metric Title', 'Year'],
    var_name='Month',
    value_name='Value',
    ignore_index=True,
)
df_2009_2019_melted

Unnamed: 0,Institution,Subcategory,Metric Title,Year,Month,Value
0,CCWF,Total Bed Capacity,Contract Beds,2009,Jan,0
1,CCWF,Total Bed Capacity,Design Beds,2009,Jan,1916
2,CCWF,Total Bed Capacity,Non-Traditional Beds,2009,Jan,265
3,CCWF,Total Bed Capacity,Overcrowding Beds,2009,Jan,1801
4,CCWF,Total Bed Capacity,Temporary Beds (FamilyVisiting/Tank Beds),2009,Jan,0
...,...,...,...,...,...,...
2415439,SVSP,Adverse Actions,Per 100 Staff,2019,Dec,0.51
2415440,SVSP,Adverse Actions,Dismissals (Non-Medical),2019,Dec,0
2415441,SVSP,Adverse Actions,Adverse Actions Total (Medical),2019,Dec,0
2415442,SVSP,Adverse Actions,Per 100 Staff,2019,Dec,0.00


In [7]:
# Convert Month to the ordered categorical type.
# astype() silently replaces any label that is not one of the declared
# categories with NaN, and the melt above turns every non-identifier column
# into a Month label, so verify the labels first and fail loudly instead of
# quietly losing the month on those rows.
unexpected_months = set(df_2009_2019_melted['Month'].dropna().unique()) - set(months_dtype.categories)
if unexpected_months:
    raise ValueError(f"Unexpected Month labels after melt: {sorted(map(str, unexpected_months))}")
df_2009_2019_melted = df_2009_2019_melted.astype({'Month': months_dtype})
df_2009_2019_melted.dtypes

Institution       object
Subcategory       object
Metric Title      object
Year               int64
Month           category
Value             object
dtype: object

In [8]:
# Load the 2020-2023 population data, reading Month directly as the ordered
# month categorical.
df_2020_2023_POP = pd.read_csv(
    "finished-csvs/finished-2020-2023-population.csv",
    encoding="utf-8",
    dtype={'Month': months_dtype},
)
df_2020_2023_POP

Unnamed: 0,Institution,Category,Subcategory,Metric Title,Month,Year,Value
0,CCWF,Custody Operations,Total Bed Capacity,Blueprint Crowding Capacity,Jan,2020,2964
1,CCWF,Custody Operations,Total Bed Capacity,Blueprint Crowding Capacity,Feb,2020,2964
2,CCWF,Custody Operations,Total Bed Capacity,Blueprint Crowding Capacity,Mar,2020,2964
3,CCWF,Custody Operations,Total Bed Capacity,Blueprint Crowding Capacity,Apr,2020,2964
4,CCWF,Custody Operations,Total Bed Capacity,Blueprint Crowding Capacity,May,2020,2964
...,...,...,...,...,...,...,...
874555,WSP,Administration,Adverse Actions,Dismissals (Medical) 2,Feb,2023,0
874556,WSP,Administration,Adverse Actions,Dismissals (Medical) 2,Mar,2023,0
874557,WSP,Administration,Adverse Actions,Dismissals (Medical) 2,Apr,2023,0
874558,WSP,Administration,Adverse Actions,Dismissals (Medical) 2,May,2023,0


In [9]:
# 'Category' is absent from the other datasets, so remove it before the
# frames are concatenated.
df_2020_2023_POP_no_cat = df_2020_2023_POP.drop(columns='Category')
df_2020_2023_POP_no_cat

Unnamed: 0,Institution,Subcategory,Metric Title,Month,Year,Value
0,CCWF,Total Bed Capacity,Blueprint Crowding Capacity,Jan,2020,2964
1,CCWF,Total Bed Capacity,Blueprint Crowding Capacity,Feb,2020,2964
2,CCWF,Total Bed Capacity,Blueprint Crowding Capacity,Mar,2020,2964
3,CCWF,Total Bed Capacity,Blueprint Crowding Capacity,Apr,2020,2964
4,CCWF,Total Bed Capacity,Blueprint Crowding Capacity,May,2020,2964
...,...,...,...,...,...,...
874555,WSP,Adverse Actions,Dismissals (Medical) 2,Feb,2023,0
874556,WSP,Adverse Actions,Dismissals (Medical) 2,Mar,2023,0
874557,WSP,Adverse Actions,Dismissals (Medical) 2,Apr,2023,0
874558,WSP,Adverse Actions,Dismissals (Medical) 2,May,2023,0


In [10]:
# Load the incident data covering the same years as the population data;
# it is combined with the other frames further down.
df_2020_2023_Incidents = pd.read_csv(
    "finished-csvs/finished-2020-2023-incidents.csv",
    encoding="utf-8",
    dtype={'Month': months_dtype},
)
df_2020_2023_Incidents

Unnamed: 0,Subcategory,Institution,Metric Title,Month,Year,Value
0,Number of Incidents,ASP,Number of Incidents,Jan,2020,22
1,Number of Incidents,ASP,Number of Incidents,Feb,2020,22
2,Number of Incidents,ASP,Number of Incidents,Mar,2020,17
3,Number of Incidents,ASP,Number of Incidents,Apr,2020,19
4,Number of Incidents,ASP,Number of Incidents,May,2020,17
...,...,...,...,...,...,...
89041,Mental Health,WSP,UOF Incidents Involving Mental Health Inmates,Jun,2023,24
89042,Mental Health,WSP,UOF Incidents Involving Mental Health Inmates,Jul,2023,22
89043,Mental Health,WSP,UOF Incidents Involving Mental Health Inmates,Aug,2023,17
89044,Mental Health,WSP,UOF Incidents Involving Mental Health Inmates,Sep,2023,23


In [11]:
# Remove exact duplicate rows from each of the three frames and renumber
# their indexes from 0. The names are rebound to the de-duplicated result
# instead of using inplace=True, so the frames are not mutated behind the
# back of the earlier cells that displayed them.
df_2009_2019_melted = df_2009_2019_melted.drop_duplicates(ignore_index=True)
df_2020_2023_POP_no_cat = df_2020_2023_POP_no_cat.drop_duplicates(ignore_index=True)
df_2020_2023_Incidents = df_2020_2023_Incidents.drop_duplicates(ignore_index=True)

In [12]:
# Stack the three long-format frames into a single table. Columns are matched
# by name and the result gets a fresh 0..n-1 index.
df = pd.concat(
    [
        df_2009_2019_melted,
        df_2020_2023_POP_no_cat,
        df_2020_2023_Incidents,
    ],
    ignore_index=True,
    verify_integrity=True,
)
df

Unnamed: 0,Institution,Subcategory,Metric Title,Year,Month,Value
0,CCWF,Total Bed Capacity,Contract Beds,2009,Jan,0
1,CCWF,Total Bed Capacity,Design Beds,2009,Jan,1916
2,CCWF,Total Bed Capacity,Non-Traditional Beds,2009,Jan,265
3,CCWF,Total Bed Capacity,Overcrowding Beds,2009,Jan,1801
4,CCWF,Total Bed Capacity,Temporary Beds (FamilyVisiting/Tank Beds),2009,Jan,0
...,...,...,...,...,...,...
3350025,WSP,Mental Health,UOF Incidents Involving Mental Health Inmates,2023,Jun,24
3350026,WSP,Mental Health,UOF Incidents Involving Mental Health Inmates,2023,Jul,22
3350027,WSP,Mental Health,UOF Incidents Involving Mental Health Inmates,2023,Aug,17
3350028,WSP,Mental Health,UOF Incidents Involving Mental Health Inmates,2023,Sep,23


In [13]:
# Order rows by Year, then Month, then institution and metric, and renumber
# the index. Month sorts in calendar order only while it still carries the
# ordered month categorical dtype.
df_sorted = df.sort_values(
    by=['Year', 'Month', 'Institution', 'Subcategory', 'Metric Title'],
    ignore_index=True,
)
df_sorted

Unnamed: 0,Institution,Subcategory,Metric Title,Year,Month,Value
0,ASP,ASU EOP Hub,Actual Population,2009,Jan,0
1,ASP,ASU Overflow,Actual Population,2009,Jan,0
2,ASP,Academic Programs -\nAttendance Tracking,Average # Days in School (DIS),2009,Jan,98
3,ASP,Academic Programs -\nAttendance Tracking,Average Daily Attendance,2009,Jan,663
4,ASP,Academic Programs -\nAttendance Tracking,Avg. Length of Time in Assignment(LTA),2009,Jan,154
...,...,...,...,...,...,...
3350025,WSP,Type of Force,37 mm/40 mm,2023,Oct,2
3350026,WSP,Type of Force,Baton,2023,Oct,1
3350027,WSP,Type of Force,Non-Conventional Force,2023,Oct,0
3350028,WSP,Type of Force,OC,2023,Oct,8


In [23]:
# Write the combined, sorted long-format table to disk, leaving out the index.
df_sorted.to_csv(
    "finished-csvs/all-data-combined-long-format.csv",
    encoding="utf-8",
    index=False,
)