In [90]:
import pandas as pd
import numpy as np
import glob
import re

We first merge individual CSVs to create a tidy dataset. Our units of interest (comprising one row) consist of unique combinations of state, gender, and year. Since the goal of this project is to look at recidivism, we are interested only in convicted individuals in prison. We have caste, education, and other demographic information for only a subset of the 

In [91]:
fileList = glob.glob("./Data/*.csv")
dfNames = [re.search("[(\\\)](.*)\.",file).group(1) for file in fileList]

for idx in range(len(dfNames)):
    df = pd.read_csv(fileList[idx])
    name = dfNames[idx]
    vars()[name] = df
    print(vars()['name'])


Age_group
Caste
Death_sentence
Domicile
Education
Education_facilities
Inmates_death
Inmates_escapee
IPC_crime_inmates_convicted
IPC_crime_inmates_under_trial
Jail_wise_population_of_prison_inmates
Prison_details_2015
Recidivism
Rehabilitation
Religion
Sentence_period
Tranquillity
Vocational_training
Wages


## IPC Crimes (Convicted Prisoners)
- aggregated into Theft, Business, Sexual, and Violent crimes
- column names prefixed by IPC

In [92]:
IPC_crime_inmates_convicted.groupby(["crime_head"]).sum(). \
sort_values(by=['Grand Total'])

Unnamed: 0_level_0,year,Male 16-18 years,Female 16-18 years,Total 16-18 years,Male 18-30 years,Female 18-30 years,Total 18-30 years,Male 30-50 years,Female 30-50 years,Total 30-50 years,Male Above 50 years,Female Above 50 years,Total Above 50 years,Total Male,Total Female,Grand Total
crime_head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Eve-Teasing,897149,0.0,0,0,412,21,433,364,6,370,100,8,108,876,35,911
Arson,897149,1.0,0,1,1240,31,1271,1185,22,1207,361,8,369,2787,61,2848
Extortion,897149,3.0,0,3,1189,13,1202,1512,42,1554,377,5,382,3081,60,3141
Molestation,897149,1.0,0,1,1286,51,1337,1418,67,1485,448,25,473,3153,143,3296
Criminal Breach Of Trust,897149,0.0,0,0,1282,13,1295,1607,34,1641,583,11,594,3472,58,3530
Riots,897149,11.0,0,11,1679,7,1686,1570,21,1591,430,12,442,3690,40,3730
Counter Feiting,897149,0.0,0,0,1705,22,1727,2053,73,2126,615,50,665,4373,145,4518
Prep. And Assembly For Dacoity,897149,2.0,0,2,3753,16,3769,3708,32,3740,958,6,964,8421,54,8475
Cheating,897149,9.0,0,9,4277,99,4376,4631,200,4831,1215,29,1244,10132,328,10460
Burglary,897149,34.0,0,34,6075,30,6105,5760,81,5841,1543,65,1608,13412,176,13588


In [93]:
ViolentCrimes=["Murder","Attempt To Commit Murder","C.H. Not Amounting To Murder","Kidnapping And Abduction","Dacoity","Arson"]
SexualCrimes=["Rape","Dowry Deaths","Cruelty By Husband Or Relative Of Husband","Molestation","Eve-Teasing"]
BusinessCrimes=["Cheating","Counter Feiting","Criminal Breach Of Trust"]
TheftCrimes=["Thefts","Robbery","Burglary", "Prep. And Assembly For Dacoity", "Extortion"]

IPC_crime_inmates_convicted_fin = IPC_crime_inmates_convicted.assign(CrimeType= #aggregating crimes into crime types
                                   np.select([IPC_crime_inmates_convicted["crime_head"].isin(ViolentCrimes),
                                             IPC_crime_inmates_convicted["crime_head"].isin(SexualCrimes),
                                             IPC_crime_inmates_convicted["crime_head"].isin(BusinessCrimes),
                                             IPC_crime_inmates_convicted["crime_head"].isin(TheftCrimes)],
                                            ["Violent","Sexual","Business","Theft"],
                                            default="Unknown")). \
filter(["state_name","year","Total 16-18 years", "Total 18-30 years","Total 30-50 years","Total Above 50 years","CrimeType"]). \
groupby(["state_name","year","CrimeType"]).sum(). \
assign(Total=lambda df: df.sum(axis=1)). \
pivot_table(index=["state_name","year"],
           columns=["CrimeType"]). \
assign(All=lambda df: df.sum(axis=1))

IPC_crime_inmates_convicted_fin=IPC_crime_inmates_convicted_fin.div(IPC_crime_inmates_convicted_fin['All'],axis=0). \
drop(['All'],axis=1)

In [94]:
#IPC_crime_inmates_convicted_fin.columns = [' '.join(col).strip() for col in IPC_crime_inmates_convicted_fin.columns.values]

valColNames=["IPC-prop-{}-{}".format(b_, a_) for a_, b_ in zip(IPC_crime_inmates_convicted_fin.columns.get_level_values(0),
         IPC_crime_inmates_convicted_fin.columns.get_level_values(1))]

IPC_crime_inmates_convicted_fin.columns=valColNames
IPC_crime_inmates_convicted_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,IPC-prop-Business-Total,IPC-prop-Sexual-Total,IPC-prop-Theft-Total,IPC-prop-Unknown-Total,IPC-prop-Violent-Total,IPC-prop-Business-Total 16-18 years,IPC-prop-Sexual-Total 16-18 years,IPC-prop-Theft-Total 16-18 years,IPC-prop-Unknown-Total 16-18 years,IPC-prop-Violent-Total 16-18 years,...,IPC-prop-Business-Total 30-50 years,IPC-prop-Sexual-Total 30-50 years,IPC-prop-Theft-Total 30-50 years,IPC-prop-Unknown-Total 30-50 years,IPC-prop-Violent-Total 30-50 years,IPC-prop-Business-Total Above 50 years,IPC-prop-Sexual-Total Above 50 years,IPC-prop-Theft-Total Above 50 years,IPC-prop-Unknown-Total Above 50 years,IPC-prop-Violent-Total Above 50 years
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A & N Islands,2001,0.000000,0.000000,0.000000,0.140625,0.359375,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.101562,0.218750,0.000000,0.000000,0.000000,0.015625,0.023438
A & N Islands,2002,0.000000,0.021739,0.000000,0.181159,0.297101,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.014493,0.000000,0.094203,0.173913,0.000000,0.000000,0.000000,0.021739,0.021739
A & N Islands,2003,0.000000,0.021429,0.042857,0.135714,0.300000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.021429,0.000000,0.042857,0.142857,0.000000,0.000000,0.000000,0.007143,0.007143
A & N Islands,2004,0.008929,0.035714,0.044643,0.080357,0.330357,0.0,0.0,0.0,0.0,0.0,...,0.008929,0.017857,0.000000,0.035714,0.169643,0.000000,0.000000,0.000000,0.000000,0.000000
A & N Islands,2005,0.000000,0.036585,0.073171,0.000000,0.390244,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.024390,0.048780,0.000000,0.268293,0.000000,0.012195,0.000000,0.000000,0.073171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.003019,0.093939,0.023456,0.006967,0.372620,0.0,0.0,0.0,0.0,0.0,...,0.002090,0.040409,0.009289,0.002903,0.216094,0.000581,0.018579,0.002787,0.001161,0.073502
West Bengal,2010,0.003511,0.080068,0.025142,0.003284,0.387995,0.0,0.0,0.0,0.0,0.0,...,0.002492,0.030804,0.012118,0.001586,0.234541,0.000566,0.015968,0.001812,0.000000,0.080634
West Bengal,2011,0.008107,0.071934,0.025348,0.014387,0.380224,0.0,0.0,0.0,0.0,0.0,...,0.003768,0.034483,0.010162,0.010048,0.182576,0.002169,0.014273,0.005481,0.000571,0.087577
West Bengal,2012,0.015197,0.063052,0.021233,0.017461,0.383057,0.0,0.0,0.0,0.0,0.0,...,0.006790,0.023604,0.008946,0.007329,0.199612,0.000862,0.011856,0.002156,0.000539,0.071567


# All-Surveyed-Prisoner Demographics

## Jail-Wise Population
- breakdown of surveyed prisoners among different jail types
- column names prefixed by JWP

In [95]:
Jail_wise_population_of_prison_inmates_fin = Jail_wise_population_of_prison_inmates[["state_name","year","Jail_Type","Total_Convicts"]]. \
pivot_table(index=["state_name","year"], 
           columns="Jail_Type", values="Total_Convicts")

Jail_wise_population_of_prison_inmates_fin=Jail_wise_population_of_prison_inmates_fin. \
div(Jail_wise_population_of_prison_inmates_fin['Total'], axis=0). \
drop(['Total'],axis=1)

Jail_wise_population_of_prison_inmates_fin.columns= ["JWP-prop-"+s for s in Jail_wise_population_of_prison_inmates_fin.columns]
Jail_wise_population_of_prison_inmates_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,JWP-prop-Borstal School,JWP-prop-Central Jail,JWP-prop-District Jail,JWP-prop-Open Jail,JWP-prop-Others,JWP-prop-Special Jail,JWP-prop-Sub_Jail,JWP-prop-Women Jail
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A & N Islands,2001,0.0,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
A & N Islands,2002,0.0,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
A & N Islands,2003,0.0,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
A & N Islands,2004,0.0,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
A & N Islands,2005,0.0,0.000000,0.984848,0.000000,0.0,0.000000,0.015152,0.000000
...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.0,0.772597,0.146815,0.016557,0.0,0.033621,0.011488,0.018922
West Bengal,2010,0.0,0.738491,0.196435,0.016615,0.0,0.021807,0.006750,0.019903
West Bengal,2011,0.0,0.756890,0.168375,0.017491,0.0,0.021908,0.008481,0.026855
West Bengal,2012,0.0,0.776697,0.158159,0.012316,0.0,0.012640,0.016367,0.023821


## Religion
- column names prefixed by REL

In [96]:
Religion_fin = Religion.query("is_state==1"). \
drop(["is_state","under_trial","detenues","others"],axis=1). \
pivot_table(index=["state_name","year"],
           columns=["gender","religion"],
           values="convicts"). \
assign(All=lambda df: df.sum(axis=1))

Religion_fin=Religion_fin.div(Religion_fin['All'], axis=0). \
drop(['All'],axis=1)

valColNames=["REL-prop-{}-{}".format(b_, a_) for a_, b_ in zip(Religion_fin.columns.get_level_values(0),
         Religion_fin.columns.get_level_values(1))]

Religion_fin.columns=valColNames

Religion_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,REL-prop-Christian-Female,REL-prop-Hindu-Female,REL-prop-Muslim-Female,REL-prop-Others-Female,REL-prop-Sikh-Female,REL-prop-Christian-Male,REL-prop-Hindu-Male,REL-prop-Muslim-Male,REL-prop-Others-Male,REL-prop-Sikh-Male
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Andhra Pradesh,2001,0.000000,0.021085,0.002843,0.000237,0.000000,0.068941,0.786070,0.106136,0.011846,0.002843
Andhra Pradesh,2002,0.000616,0.027915,0.002463,0.000000,0.000000,0.096880,0.773194,0.093596,0.002874,0.002463
Andhra Pradesh,2003,0.002331,0.026035,0.005051,0.000000,0.000000,0.091510,0.781037,0.090538,0.001943,0.001554
Andhra Pradesh,2004,0.002308,0.035257,0.003778,0.000000,0.000000,0.088353,0.782162,0.085414,0.000000,0.002728
Andhra Pradesh,2005,0.002594,0.033127,0.003792,0.000000,0.000000,0.096188,0.759928,0.101377,0.000000,0.002993
...,...,...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.000845,0.034127,0.032607,0.000845,0.000000,0.009292,0.431999,0.479980,0.007096,0.003210
West Bengal,2010,0.000692,0.032364,0.032883,0.000000,0.000000,0.022672,0.447906,0.445656,0.005365,0.012461
West Bengal,2011,0.000707,0.033392,0.029682,0.000000,0.000177,0.021555,0.482332,0.428799,0.002473,0.000883
West Bengal,2012,0.000648,0.039216,0.025280,0.000000,0.000324,0.020094,0.492141,0.418895,0.001945,0.001458


In [97]:
Religion.query("is_state==1"). \
drop(["is_state","under_trial","detenues","others"],axis=1). \
groupby(["state_name","year","gender"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,convicts
state_name,year,gender,Unnamed: 3_level_1
Andhra Pradesh,2001,Female,102
Andhra Pradesh,2001,Male,4119
Andhra Pradesh,2002,Female,151
Andhra Pradesh,2002,Male,4721
Andhra Pradesh,2003,Female,172
...,...,...,...
West Bengal,2011,Male,5298
West Bengal,2012,Female,404
West Bengal,2012,Male,5767
West Bengal,2013,Female,367


## Education
- column names prefixed by EDUC

In [98]:
Education_fin = Education.query("is_state==1"). \
drop(["is_state","under_trial","detenues","others"],axis=1). \
pivot_table(index=["state_name","year"],
           columns=["gender","education"],
           values="convicts"). \
assign(All=lambda df: df.sum(axis=1))

Education_fin=Education_fin.div(Education_fin['All'], axis=0). \
drop(['All'],axis=1)

valColNames=["EDUC-prop-{}-{}".format(b_, a_) for a_, b_ in zip(Education_fin.columns.get_level_values(0),
         Education_fin.columns.get_level_values(1))]

Education_fin.columns=valColNames

Education_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,EDUC-prop-Below Class X-Female,EDUC-prop-Class X and above but below graduate-Female,EDUC-prop-Graduate-Female,EDUC-prop-Holding technical degree/diploma etc-Female,EDUC-prop-Illiterate-Female,EDUC-prop-Post-Graduate-Female,EDUC-prop-Below Class X-Male,EDUC-prop-Class X and above but below graduate-Male,EDUC-prop-Graduate-Male,EDUC-prop-Holding technical degree/diploma etc-Male,EDUC-prop-Illiterate-Male,EDUC-prop-Post-Graduate-Male
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Andhra Pradesh,2001,0.004738,0.000000,0.000000,0.000000,0.019427,0.000000,0.409619,0.126036,0.070836,0.010898,0.341388,0.017058
Andhra Pradesh,2002,0.011084,0.000821,0.000000,0.000000,0.019089,0.000000,0.385673,0.132800,0.038793,0.011084,0.396346,0.004310
Andhra Pradesh,2003,0.013406,0.001943,0.000194,0.000000,0.017680,0.000194,0.450165,0.142996,0.040800,0.005246,0.321158,0.006217
Andhra Pradesh,2004,0.010493,0.000210,0.000630,0.000000,0.030010,0.000000,0.437356,0.135362,0.033998,0.006296,0.339140,0.006506
Andhra Pradesh,2005,0.013770,0.000200,0.000399,0.000000,0.025145,0.000000,0.453203,0.139892,0.036121,0.006586,0.317701,0.006985
...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.026187,0.004055,0.001014,0.000000,0.036831,0.000338,0.509883,0.098834,0.027707,0.001689,0.290759,0.002703
West Bengal,2010,0.021980,0.003461,0.001558,0.000000,0.038941,0.000000,0.365524,0.114746,0.025961,0.003115,0.420907,0.003808
West Bengal,2011,0.023322,0.006184,0.001060,0.000000,0.033039,0.000353,0.475442,0.145406,0.029152,0.001767,0.282332,0.001943
West Bengal,2012,0.024307,0.005834,0.003079,0.000486,0.031275,0.000486,0.470750,0.116999,0.026576,0.002593,0.311943,0.005672


In [99]:
Education.query("is_state==1"). \
drop(["is_state","under_trial","detenues","others"],axis=1). \
groupby(["state_name","year","gender"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,convicts
state_name,year,gender,Unnamed: 3_level_1
Andhra Pradesh,2001,Female,102
Andhra Pradesh,2001,Male,4119
Andhra Pradesh,2002,Female,151
Andhra Pradesh,2002,Male,4721
Andhra Pradesh,2003,Female,172
...,...,...,...
West Bengal,2011,Male,5298
West Bengal,2012,Female,404
West Bengal,2012,Male,5767
West Bengal,2013,Female,367


## Sentence Period

- Missing values in the wide-pivoted data are equivalent to zero counts. 
- Total counts of female and male prisoners by age group are usually larger than in the Age_group dataframe
- Column names prefixed by SP


In [100]:
Sentence_period_fin = Sentence_period.query('is_state==1'). \
drop(["is_state"],axis=1). \
pivot_table(index=["state_name","year"], 
                   columns = ["gender","sentence_period"]). \
fillna(0). \
assign(All=lambda df: df.sum(axis=1))

Sentence_period_fin=Sentence_period_fin.div(Sentence_period_fin['All'], axis=0). \
drop(['All'],axis=1)

valColNames=["SP-prop-{}-{}".format(b_, a_) for a_, b_ in zip(Sentence_period_fin.columns.get_level_values(0),
         Sentence_period_fin.columns.get_level_values(1))]

Sentence_period_fin.columns=valColNames

Sentence_period_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,SP-prop-Female-age_16_18_years,...,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above,SP-prop-Male-age_50_above
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Andhra Pradesh,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002369,0.004501,0.009950,0.000000,0.006397,0.009713,0.001895,0.000000,0.004027,0.050225
Andhra Pradesh,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005747,0.006773,0.001232,0.000000,0.005952,0.006979,0.008415,0.000000,0.002668,0.045361
Andhra Pradesh,2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002914,0.009132,0.005051,0.000000,0.006800,0.005246,0.006412,0.000000,0.012046,0.064504
Andhra Pradesh,2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003148,0.010493,0.006716,0.000000,0.007555,0.011752,0.005037,0.000000,0.003358,0.037985
Andhra Pradesh,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003592,0.004989,0.002195,0.000000,0.003792,0.004191,0.002195,0.000000,0.000599,0.054081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009123,0.020105,0.001689,0.000000,0.014023,0.005575,0.013009,0.000000,0.003210,0.086332
West Bengal,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009346,0.012807,0.001038,0.000000,0.009692,0.002596,0.009692,0.000000,0.004154,0.092766
West Bengal,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014841,0.008834,0.003534,0.000000,0.008127,0.005124,0.020671,0.000000,0.004240,0.111131
West Bengal,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.023659,0.011019,0.002431,0.000000,0.007778,0.009399,0.012640,0.000162,0.002755,0.066278


In [101]:
Sentence_period.query('is_state==1').drop(['is_state'],axis=1).groupby(["state_name","year","gender"]).sum(). \
assign(total= lambda df: df.iloc[:,:].sum(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age_16_18_years,age_18_30_years,age_30_50_years,age_50_above,total
state_name,year,gender,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Andhra Pradesh,2001,Female,0,28,50,24,102
Andhra Pradesh,2001,Male,5,1639,2087,388,4119
Andhra Pradesh,2002,Female,0,44,71,36,151
Andhra Pradesh,2002,Male,3,1986,2292,440,4721
Andhra Pradesh,2003,Female,0,53,73,46,172
...,...,...,...,...,...,...,...
West Bengal,2011,Male,0,1801,2461,1036,5298
West Bengal,2012,Female,0,133,143,128,404
West Bengal,2012,Male,0,2235,2660,872,5767
West Bengal,2013,Female,0,123,147,97,367


## Age_group
- Same information (but somewhat mismatched and lower prisoner counts) as sentence period
- Do not use this dataframe

In [102]:
Age_group.query('type=="Convicts" and is_state==1 and category!="Foreigners"'). \
drop(['is_state','category','type'],axis=1). \
assign(total= lambda df: df.iloc[:,3:].sum(axis=1))

Unnamed: 0,state_name,year,gender,age_16_18,age_18_30,age_30_50,age_50_above,total
216,Andhra Pradesh,2001,Female,0.0,28,50,24,102.0
217,Andhra Pradesh,2001,Male,5.0,1638,2085,388,4116.0
232,Andhra Pradesh,2002,Female,0.0,44,71,36,151.0
233,Andhra Pradesh,2002,Male,3.0,1982,2292,440,4717.0
248,Andhra Pradesh,2003,Female,0.0,53,73,46,172.0
...,...,...,...,...,...,...,...,...
7113,West Bengal,2011,Male,0.0,1369,2219,956,4544.0
7128,West Bengal,2012,Female,0.0,94,116,115,325.0
7129,West Bengal,2012,Male,0.0,1650,2363,793,4806.0
7144,West Bengal,2013,Female,0.0,78,112,84,274.0


## Caste

- SC/ST refers to class traditionally known as untouchables/Dalits
- OBC is an umbrella class of other economically/socially disadvantaged citizens
- Others refers to relatively well-off citizens
- column names prefixed by CTE

In [103]:
Caste_fin = Caste.iloc[:,:6].query('is_state==1'). \
pivot_table(index=["state_name","year"], 
                   columns = ["gender","caste"],
                   values="convicts"). \
assign(All=lambda df: df.sum(axis=1))

Caste_fin=Caste_fin.div(Caste_fin['All'], axis=0). \
drop(['All'],axis=1)

valColNames=["CST-prop-{}-{}".format(b_, a_) for a_, b_ in zip(Caste_fin.columns.get_level_values(0),
         Caste_fin.columns.get_level_values(1))]

Caste_fin.columns=valColNames

Caste_fin

Unnamed: 0_level_0,Unnamed: 1_level_0,CST-prop-OBC-Female,CST-prop-Others-Female,CST-prop-SC-Female,CST-prop-ST-Female,CST-prop-OBC-Male,CST-prop-Others-Male,CST-prop-SC-Male,CST-prop-ST-Male
state_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Andhra Pradesh,2001,0.011846,0.002843,0.007818,0.001658,0.361526,0.222459,0.292822,0.099029
Andhra Pradesh,2002,0.013342,0.005952,0.006979,0.004721,0.376232,0.272373,0.241585,0.078818
Andhra Pradesh,2003,0.020400,0.006023,0.004857,0.002137,0.442782,0.225763,0.224014,0.074024
Andhra Pradesh,2004,0.024344,0.006506,0.007345,0.003148,0.450367,0.252676,0.192655,0.062959
Andhra Pradesh,2005,0.023947,0.003792,0.008182,0.003592,0.439234,0.261824,0.199960,0.059469
...,...,...,...,...,...,...,...,...,...
West Bengal,2009,0.004899,0.044095,0.011826,0.007603,0.067748,0.639635,0.141578,0.082615
West Bengal,2010,0.008827,0.041537,0.011596,0.003981,0.105054,0.592246,0.141571,0.095189
West Bengal,2011,0.006007,0.042403,0.010424,0.005124,0.056360,0.601943,0.132686,0.145053
West Bengal,2012,0.002431,0.051531,0.007940,0.003565,0.074380,0.604764,0.159131,0.096257


In [104]:
Caste.query('is_state==1').drop(['is_state','under_trial','detenues','others'],axis=1). \
groupby(["state_name","year","gender"]).sum() 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,convicts
state_name,year,gender,Unnamed: 3_level_1
Andhra Pradesh,2001,Female,102
Andhra Pradesh,2001,Male,4119
Andhra Pradesh,2002,Female,151
Andhra Pradesh,2002,Male,4721
Andhra Pradesh,2003,Female,172
...,...,...,...
West Bengal,2011,Male,5298
West Bengal,2012,Female,404
West Bengal,2012,Male,5767
West Bengal,2013,Female,367


# Ambiguous Count (from surveyed or population?)

In [105]:
Education_facilities

Unnamed: 0,state_name,year,elementary_education,adult_education,higher_education,computer_course
0,Andhra Pradesh,2001,2699,2161,301,52
1,Andhra Pradesh,2002,2420,4745,396,34
2,Andhra Pradesh,2003,1606,7594,369,39
3,Andhra Pradesh,2004,5950,8213,305,30
4,Andhra Pradesh,2005,3720,10962,389,1
...,...,...,...,...,...,...
440,D & N Haveli,2013,0,0,0,0
441,Daman & Diu,2013,0,0,0,0
442,Delhi,2013,1458,1187,908,174
443,Lakshadweep,2013,0,0,0,0


## Tranquility

- aggregated over prisoner and personnel injuries/deaths as well as types of unrest
- about 3/4 of regions have 0 recorded incidents of violence 

In [132]:
Tranquility_fin = Tranquillity.assign(injured=lambda df: df["inmate_injured"]+df["jail_personnel_injured"],
                   killed=lambda df: df["inmate_killed"]+df["jail_personnel_killed"]). \
drop(["inmate_injured","jail_personnel_injured","inmate_killed","jail_personnel_killed"],axis=1). \
groupby(["state_name","year"]).sum()

## Vocational Training
- Hundreds of unique training jobs; need to compress information to keep model identifiable
- Some prisoners are in multiple training problems: in Punjab, for example, 
- For now, I use total number of inmates getting vocational training as a variable, not accounting for double counting

In [107]:
Vocational_training_fin = Vocational_training.groupby(["state_name","year"]).sum()
#.pivot_table(index=["state_name","year"],
#                                columns="vocational_trainings_program",
#                                values="inmates_trained")

In [122]:
len(Vocational_training.vocational_trainings_program.unique())

1271

## Rehabilitation
- This dataframe is already in a nice wide form.

In [109]:
Rehabilitation

Unnamed: 0,state_name,year,financial_assistance_provided,rehabilitated,legal_aid_provided
0,Andhra Pradesh,2001,14,75,2594
1,Andhra Pradesh,2002,37,108,1876
2,Andhra Pradesh,2003,0,0,2126
3,Andhra Pradesh,2004,0,0,909
4,Andhra Pradesh,2005,0,204,2342
...,...,...,...,...,...
440,D & N Haveli,2013,0,0,0
441,Daman & Diu,2013,0,0,0
442,Delhi,2013,0,34,34384
443,Lakshadweep,2013,0,0,0


## Recidivism

I mutate the recidivism ratio (number of habitual offenders over number of admitted convicts) for each state and year.

In [112]:
Recidivism_fin = Recidivism.assign(recidiv_ratio= \
                  lambda df: df["habitual_offenders"]/df["convicts_admitted"])
Recidivism_fin

Unnamed: 0,state_name,year,convicts_admitted,habitual_offenders,recidiv_ratio
0,Andhra Pradesh,2001,17345,1063,0.061286
1,Andhra Pradesh,2002,13322,318,0.023870
2,Andhra Pradesh,2003,15682,536,0.034179
3,Andhra Pradesh,2004,14397,399,0.027714
4,Andhra Pradesh,2005,12389,977,0.078860
...,...,...,...,...,...
442,Dadra And Nagar Haveli,2013,0,0,
443,Daman And Diu,2013,9,0,0.000000
444,Delhi,2013,7015,1103,0.157234
445,Lakshadweep,2013,0,0,


In [134]:
tidy_Indian_Prison_df = pd.merge(Recidivism_fin, Caste_fin, on=["state_name","year"]). \
merge(Sentence_period_fin, on=["state_name","year"]). \
merge(Education_fin, on=["state_name","year"]). \
merge(Religion_fin, on=["state_name","year"]). \
merge(IPC_crime_inmates_convicted_fin,on=["state_name","year"]). \
merge(Jail_wise_population_of_prison_inmates_fin, on=["state_name","year"]). \
merge(Education_facilities, on=["state_name","year"]). \
merge(Vocational_training_fin, on=["state_name","year"]). \
merge(Rehabilitation, on=["state_name","year"]). \
merge(Tranquility_fin, on=["state_name","year"])

tidy_Indian_Prison_df

Unnamed: 0,state_name,year,convicts_admitted,habitual_offenders,recidiv_ratio,CST-prop-OBC-Female,CST-prop-Others-Female,CST-prop-SC-Female,CST-prop-ST-Female,CST-prop-OBC-Male,...,adult_education,higher_education,computer_course,inmates_trained,financial_assistance_provided,rehabilitated,legal_aid_provided,incidence,injured,killed
0,Andhra Pradesh,2001,17345,1063,0.061286,0.011846,0.002843,0.007818,0.001658,0.361526,...,2161,301,52,1498.0,14,75,2594,0,0,0
1,Andhra Pradesh,2002,13322,318,0.023870,0.013342,0.005952,0.006979,0.004721,0.376232,...,4745,396,34,1235.0,37,108,1876,0,0,0
2,Andhra Pradesh,2003,15682,536,0.034179,0.020400,0.006023,0.004857,0.002137,0.442782,...,7594,369,39,112.0,0,0,2126,0,0,0
3,Andhra Pradesh,2004,14397,399,0.027714,0.024344,0.006506,0.007345,0.003148,0.450367,...,8213,305,30,1131.0,0,0,909,0,0,0
4,Andhra Pradesh,2005,12389,977,0.078860,0.023947,0.003792,0.008182,0.003592,0.439234,...,10962,389,1,1915.0,0,204,2342,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,Rajasthan,2013,5995,297,0.049541,0.008604,0.010920,0.008769,0.007942,0.348610,...,237,64,38,6475.0,0,25,244,27,12,0
314,Sikkim,2013,125,5,0.040000,0.009615,0.000000,0.009615,0.000000,0.384615,...,0,0,0,47.0,0,0,428,0,0,0
315,Tripura,2013,1485,28,0.018855,0.009960,0.005976,0.011952,0.000000,0.239044,...,13,4,2,329.0,0,0,12,0,0,0
316,Uttar Pradesh,2013,30159,1081,0.035843,0.015014,0.012051,0.009957,0.001027,0.365863,...,8093,1353,212,3346.0,347,0,2461,3,5,0
