# Code Base

### Global Data

Run the following initialization before running any other code given below

In [8]:
import datetime
import pprint
import pandas
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', None)
pandas.set_option('display.max_colwidth', -1)

paths = {
    "cases_2010" : "./data/cases/cases_2010.csv",
    "cases_2011" : "./data/cases/cases_2011.csv",
    "cases_2012" : "./data/cases/cases_2012.csv",
    "cases_2013" : "./data/cases/cases_2013.csv",
    "cases_2014" : "./data/cases/cases_2014.csv",
    "cases_2015" : "./data/cases/cases_2015.csv",
    "cases_2016" : "./data/cases/cases_2016.csv",
    "cases_2017" : "./data/cases/cases_2017.csv",
    "cases_2018" : "./data/cases/cases_2018.csv",
    "judges" : "./data/judges_clean/judges_clean.csv",
    "judges_key" : "./data/keys/judge_case_merge_key.csv",
    "cases_state_key" : "./data/keys/cases_state_key.csv",
    "type_name_key" : "./data/keys/type_name_key.csv",
    "act_key"         :     "./data/keys/act_key.csv",
    "acts_sections" : "./data/acts_sections/acts_sections.csv",
    "section_key"     :     "./data/section_key.csv"
}

years = [2010, 2011, 2012, 2013 ,2014,2015 ,2016, 2017,2018]
cases = ["cases_2010","cases_2011","cases_2012", "cases_2013","cases_2014","cases_2015","cases_2016","cases_2017","cases_2018"]
# cases = ["cases_2010"]

  pandas.set_option('display.max_colwidth', -1)


### Working Days Dilemma

So to find the no of working days, I just went through every date that appears in 'date of first hearing' column and then later found out the no of distinct days that appeared. The column's Datatype has been converted to DateTime to ensure consistancy. The result did not changed when I inclued other columns having date as this column alone had all days of the year

In [10]:
import polars as pd
print("year,no_of_working_days")
for x in cases:
    days = set()
    data = pd.read_csv(paths[x],parse_dates=True)
    data = data.filter(pd.col("date_first_list").is_null() == False)
    year = data['year'][1]

    data1 = data.filter(pd.col('date_first_list') <  (datetime.date(year+1,1,1)))
    data1 = data1.filter(pd.col('date_first_list') >= (datetime.date(year,1,1)))

    for i in data1['date_first_list']:
                days.add(i)

    a = len(days)
    print(year,",",a,sep="")

year,no_of_working_days
2010,365
2011,365
2012,366
2013,365
2014,365
2015,365
2016,366


: 

: 

### Open Cases Explosion

I consider a case to be open at the end of the year X if its date_of_decision falls on an year > X and its date_of_filing falls on <= X. 

In [5]:
import polars as pd

open_case_dict = dict(zip(years,[0 for x in range(9)]))
total_new_case_dict = dict(zip(years,[0 for x in range(9)]))

for x in cases:
    data = pd.read_csv(paths[x], parse_dates=True)
    year = int(data['year'][1])
    total_new_case_dict[year] += len(data)
    data1 = data.filter(pd.col("date_of_decision").is_null())
    for i in range(year,2019):
        open_case_dict[i] += data1.shape[0]
    data = data.filter(pd.col("date_of_decision").is_null() == False)
    for y in range(year,2019):
        data1 = data.filter((pd.col("date_of_decision").is_between(datetime.date(y+1,1,1),datetime.date(2030,1,1) )))
        open_case_dict[y] += len(data1)
    

import pandas
df = pandas.DataFrame([open_case_dict,total_new_case_dict])
df.index = ['open_cases','total_new_cases']
print(df)



                    2010     2011      2012      2013      2014      2015  \
open_cases       3618507  7310832  10684277  13603847  15968834  18349382   
total_new_cases  4281327  5208653  6400783   7555617   8874616   10475876   

                     2016      2017      2018  
open_cases       20627713  22377894  23749571  
total_new_cases  11349260  13065513  13724299  


### Superficial Data might be misleading

I consider the duration of a case to be the time between when the case was filed and when the case reached a decision. I calculate this for all all cases in a year and then take the average.

In [6]:
import polars as pd
avg_case_duration = dict(zip(years,[0 for x in range(9)]))
for x in cases:
    data = pd.read_csv(paths[x], parse_dates=True)
    year = data['year'][1]
    data = data.with_column((pd.col('date_of_decision') - pd.col('date_of_filing')).alias("diff"))
    data = data.filter(pd.col('diff').is_null() == False)
    data = data.groupby("year").mean()
    avg_case_duration[year] = data['diff'][0].days

# print(avg_case_duration)
import pandas
df = pandas.DataFrame([avg_case_duration])
df.index = ['avg_case_duration']
# df.to_csv('1.csv')
print(df)


                   2010  2011  2012  2013  2014  2015  2016  2017  2018
avg_case_duration  1070  921   779   600   467   367   258   152   76  


### Flaws of the Indian Penal Code ? : Theft

Amongst all cases, I filter out non-bailable cases and then group them by their section year wise and then count the number of cases in each group.

In [11]:
import polars as pd
import pandas
acts = pd.read_csv(paths["acts_sections"], dtypes={ 'section' : int})
acts = acts.filter(pd.col("bailable_ipc") == 'non-bailable')
act_key = pd.read_csv(paths["act_key"], dtypes={'count' : float, 'section' : int})
acts = acts.join(act_key, on="act", how="inner")

section_key = pd.read_csv(paths["section_key"])
acts = acts.join(section_key, on="section", how="inner")
ans = pandas.DataFrame({
                    'act_s': pandas.Series(dtype='str'),
                    'section_s': pandas.Series(dtype='str'),
                    'year': pandas.Series(dtype='int'),
                    'count': pandas.Series(dtype='uint32')
                    })
ans = pd.from_pandas(ans)
# print(ans)
for x in cases:
    case = pd.read_csv(paths[x])
    year = case['year'][1]
    data = case.select(["ddl_case_id","disp_name","year"])
    data = data.join(acts, on="ddl_case_id")
    data = data.groupby(['act_s','section_s','year']).count()
    ans = pd.concat([ans,data])
print(ans)

shape: (0, 4)
┌───────┬───────────┬──────┬───────┐
│ act_s ┆ section_s ┆ year ┆ count │
│ ---   ┆ ---       ┆ ---  ┆ ---   │
│ str   ┆ str       ┆ i64  ┆ u32   │
╞═══════╪═══════════╪══════╪═══════╡
└───────┴───────────┴──────┴───────┘
shape: (1097, 4)
┌───────────────────────┬───────────┬──────┬───────┐
│ act_s                 ┆ section_s ┆ year ┆ count │
│ ---                   ┆ ---       ┆ ---  ┆ ---   │
│ str                   ┆ str       ┆ i64  ┆ u32   │
╞═══════════════════════╪═══════════╪══════╪═══════╡
│ The Indian Penal Code ┆ 399       ┆ 2010 ┆ 931   │
│ The Indian Penal Code ┆ 254       ┆ 2010 ┆ 58    │
│ The Indian Penal Code ┆ 128       ┆ 2010 ┆ 72    │
│ The Indian Penal Code ┆ 454       ┆ 2010 ┆ 2263  │
│ ...                   ┆ ...       ┆ ...  ┆ ...   │
│ The Indian Penal Code ┆ 366       ┆ 2018 ┆ 75631 │
│ The Indian Penal Code ┆ 301       ┆ 2018 ┆ 12    │
│ The Indian Penal Code ┆ 256       ┆ 2018 ┆ 30    │
│ The Indian Penal Code ┆ 242       ┆ 2018 ┆ 9     │
└────

For the trend of non-bailable cases where people won - 

In [4]:
import polars as pd
import pandas
acts = pd.read_csv(paths["acts_sections"], dtypes={ 'section' : int})
acts = acts.filter(pd.col("bailable_ipc") == 'non-bailable')
act_key = pd.read_csv(paths["act_key"], dtypes={'count' : float, 'section' : int})
acts = acts.join(act_key, on="act", how="inner")

section_key = pd.read_csv(paths["section_key"])
acts = acts.join(section_key, on="section", how="inner")
ans = pandas.DataFrame({
                    'act_s': pandas.Series(dtype='str'),
                    'section_s': pandas.Series(dtype='str'),
                    'year': pandas.Series(dtype='int'),
                    'count': pandas.Series(dtype='uint32')
                    })
ans = pd.from_pandas(ans)
# print(ans)
for x in cases:
    case = pd.read_csv(paths[x])
    year = case['year'][1]
    data = case.select(["ddl_case_id","disp_name","year"])
    
    data = data.filter(
        (pd.col('disp_name') == 4) |
        (pd.col('disp_name') == 22) |
        (pd.col('disp_name') == 41) |
        (pd.col('disp_name') == 51) 
    )

    data = data.join(acts, on="ddl_case_id")

    data = data.groupby(['act_s','section_s','year']).count()

    ans = pd.concat([ans,data])
print(ans)
ans.write_csv("26a.csv")

shape: (0, 4)
┌───────┬───────────┬──────┬───────┐
│ act_s ┆ section_s ┆ year ┆ count │
│ ---   ┆ ---       ┆ ---  ┆ ---   │
│ str   ┆ str       ┆ i64  ┆ u32   │
╞═══════╪═══════════╪══════╪═══════╡
└───────┴───────────┴──────┴───────┘
shape: (1050, 4)
┌───────────────────────┬───────────┬──────┬───────┐
│ act_s                 ┆ section_s ┆ year ┆ count │
│ ---                   ┆ ---       ┆ ---  ┆ ---   │
│ str                   ┆ str       ┆ i64  ┆ u32   │
╞═══════════════════════╪═══════════╪══════╪═══════╡
│ The Indian Penal Code ┆ 307       ┆ 2010 ┆ 2511  │
│ The Indian Penal Code ┆ 372       ┆ 2010 ┆ 24    │
│ The Indian Penal Code ┆ 240       ┆ 2010 ┆ 9     │
│ The Indian Penal Code ┆ 407       ┆ 2010 ┆ 67    │
│ ...                   ┆ ...       ┆ ...  ┆ ...   │
│ The Indian Penal Code ┆ 411       ┆ 2018 ┆ 669   │
│ The Indian Penal Code ┆ 369       ┆ 2018 ┆ 2     │
│ The Indian Penal Code ┆ 267       ┆ 2018 ┆ 2     │
│ The Indian Penal Code ┆ 328       ┆ 2018 ┆ 261   │
└────

### Higher Latency in Non - Bailable cases

I consider the duration of a case to be the time between when the case was filed and when the case reached a decision. I calculate this for all all cases in a year and then take the average. I also filter out non bailable cases and apply the same procedure on them.

In [5]:
import polars as pd
ans = -1
flag = 0

for x in cases:
    case = pd.read_csv(paths[x],parse_dates=True)
    # print(case)
    case = case.filter(pd.col('date_of_decision').is_null() == False)
    case = case.filter(pd.col('date_of_filing').is_null() == False)
    case = case.with_column((pd.col('date_of_decision') - pd.col('date_of_filing')).apply(lambda x: x.days).alias("duration"))
    data = case.select(["ddl_case_id","disp_name","year","duration"])
    data = data.filter(pd.col("duration") != 0)
    data = data.select(["year","duration"])
    data = data.groupby("year").agg(pd.col("duration").mean().alias("avg_length"))
    data = data.select(["year","avg_length"])
    if flag == 0:
        flag = 1
        ans = data
    else:
        ans = pd.concat([ans,data])
print(ans)

shape: (4281327, 19)
┌───────────┬──────┬──────────┬───────────┬─────┬────────────┬────────────┬────────────┬────────────┐
│ ddl_case_ ┆ year ┆ state_co ┆ dist_code ┆ ... ┆ date_of_de ┆ date_first ┆ date_last_ ┆ date_next_ │
│ id        ┆ ---  ┆ de       ┆ ---       ┆     ┆ cision     ┆ _list      ┆ list       ┆ list       │
│ ---       ┆ i64  ┆ ---      ┆ i64       ┆     ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│ str       ┆      ┆ i64      ┆           ┆     ┆ date       ┆ date       ┆ date       ┆ date       │
╞═══════════╪══════╪══════════╪═══════════╪═════╪════════════╪════════════╪════════════╪════════════╡
│ 01-01-01- ┆ 2010 ┆ 1        ┆ 1         ┆ ... ┆ 2011-06-19 ┆ 2011-06-08 ┆ 2011-06-20 ┆ 2011-06-24 │
│ 200308002 ┆      ┆          ┆           ┆     ┆            ┆            ┆            ┆            │
│ 162010    ┆      ┆          ┆           ┆     ┆            ┆            ┆            ┆            │
│ 01-01-01- ┆ 2010 ┆ 1        ┆ 1         ┆ ... ┆ 2010-11-21 

In [6]:
import polars as pd
acts = pd.read_csv(paths["acts_sections"], dtypes={ 'section' : int})
acts = acts.filter(pd.col("bailable_ipc") == 'non-bailable')
ans = -1
flag = 0

for x in cases:
    case = pd.read_csv(paths[x],parse_dates=True)
    print(case)
    case = case.filter(pd.col('date_of_decision').is_null() == False)
    case = case.filter(pd.col('date_of_filing').is_null() == False)
    case = case.with_column((pd.col('date_of_decision') - pd.col('date_of_filing')).apply(lambda x: x.days).alias("duration"))
    data = case.select(["ddl_case_id","disp_name","year","duration"])
    data = data.filter(pd.col("duration") != 0)
    data = data.join(acts, on="ddl_case_id", how="inner")
    data = data.select(["year","duration"])
    data = data.groupby("year").agg(pd.col("duration").mean().alias("avg_length"))
    data = data.select(["year","avg_length"])
    if flag == 0:
        flag = 1
        ans = data
    else:
        ans = pd.concat([ans,data])
print(ans)


shape: (4281327, 19)
┌───────────┬──────┬──────────┬───────────┬─────┬────────────┬────────────┬────────────┬────────────┐
│ ddl_case_ ┆ year ┆ state_co ┆ dist_code ┆ ... ┆ date_of_de ┆ date_first ┆ date_last_ ┆ date_next_ │
│ id        ┆ ---  ┆ de       ┆ ---       ┆     ┆ cision     ┆ _list      ┆ list       ┆ list       │
│ ---       ┆ i64  ┆ ---      ┆ i64       ┆     ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│ str       ┆      ┆ i64      ┆           ┆     ┆ date       ┆ date       ┆ date       ┆ date       │
╞═══════════╪══════╪══════════╪═══════════╪═════╪════════════╪════════════╪════════════╪════════════╡
│ 01-01-01- ┆ 2010 ┆ 1        ┆ 1         ┆ ... ┆ 2011-06-19 ┆ 2011-06-08 ┆ 2011-06-20 ┆ 2011-06-24 │
│ 200308002 ┆      ┆          ┆           ┆     ┆            ┆            ┆            ┆            │
│ 162010    ┆      ┆          ┆           ┆     ┆            ┆            ┆            ┆            │
│ 01-01-01- ┆ 2010 ┆ 1        ┆ 1         ┆ ... ┆ 2010-11-21 

### Raising and Falling Crimes - Classifying them

I use the data from 'Flaws in Indian Penal Code'. Only here I plot each crime individually instead of everything together. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv("28a.csv") # From part 2 of Flaws in Indian Penal COde
data = data.drop(["act_s"], axis=1)

print(data)
pd.pivot_table(data.reset_index(),
               index='year', columns='section_s', values='count'
              ).plot(subplots=True,figsize=(7.5, 600))
              
plt.show()

### Popular Crimes

I join cases table with the acts table. I filter out only criminal cases. I then group by the section name and them find the count for each section. This is then repeated for all years of cases

In [3]:
import polars as pd
ans = -1
for x in cases:
    case = pd.read_csv(paths[x])
    year = case['year'][1]
    case = case.select(['ddl_case_id','state_code','year'])
    acts = pd.read_csv(paths["acts_sections"], dtypes={ 'section' : int})
    acts = acts.filter(pd.col("criminal") == 1)
    act_key = pd.read_csv(paths["act_key"], dtypes={'count' : float, 'section' : int})
    acts = acts.join(act_key, on="act", how="inner")

    section_key = pd.read_csv(paths["section_key"])
    section_key = section_key.filter(pd.col("section_s").is_not_null())
    data = acts.join(section_key, on="section", how="inner")
    data = data.join(case,on="ddl_case_id")

    data = data.select(['section_s'])
    data = data.groupby('section_s').count()
    data = data.rename({'count' : str(year)})

    if x != 'cases_2010':
        ans = ans.join(data,on ='section_s')
    else:
        ans = data
print(ans)

To get the section's name in IPC, join this table too. 

In [None]:
data1 = pd.read_csv("section_name_key.csv")
ans = ans.join(data1,on='section_s')

print(ans)

### Women in Judiciary

For finding percentage of female judges, I simply count the number of entries in judges table where female_judge was 1. For every year I count the number of active female judges. I figure out if they are active through their end_date.

In [None]:
import polars as pd
print("year,female_judges,total_judges,percent")
data = pd.read_csv(paths['judges'],parse_dates=True)

at = 0
tt = 0

for year in years:
    data = data.filter(pd.col('end_date') > (datetime.date(year,1,1)))
    total = data.shape[0]
    tt += total
    data1 = data.filter(pd.col("female_judge") == '1 female')
    a = data1.shape[0]
    at += a

    print(year,",",a,",",total,",",(a*100)/total,sep="")

print("2019",",",at,",",tt,",",(at*100)/tt,sep="")

Amongst all entries in the judges table, I group them according to their state_name which I get upon joining with cases_state_key. I then calculate the required percentage for each state.

In [None]:
import polars as pd
data = pd.read_csv(paths["judges"],parse_dates=True)
state_key = pd.read_csv(paths['cases_state_key'])
data = data.join(state_key,on='state_code')

data1 = data.groupby('state_name').count()
data1 = data1.rename({'count' : 'total_judges'})

data = data.filter(pd.col('female_judge') == '1 female')
data = data.groupby('state_name').count()
data = data.rename({'count' : 'female_judges'})

data = data.join(data1,on='state_name')
data = data.with_column(((pd.col(('female_judges')) * 100 ) / pd.col('total_judges')).alias("percentage"))
data = data.drop(['female_judges','total_judges'])

print(data)

### Women in Courts

Finding percentage of female defendants over the years can be simply done by iterating of year-wise cases and calculating the required percentage.

In [None]:
import polars as pd
print("year,female_defendants,male_defendants,total,female_percentage")
for x in cases:
    data = pd.read_csv(paths[x])
    total = data.shape[0]
    year = data['year'][1]
    data1 = data.filter(pd.col("female_defendant") == '1 female')
    data2 = data.filter(pd.col("female_defendant") == '0 male')
    a = data1.shape[0]
    b = data2.shape[0]
    print(year,",",a,",",b,",",a + b,",",(a * 100)/(a + b),sep="")

To calculate the percentage of women who won, just iterate through all the cases year-wise and filter out cases with female defendants and find the appropriate percentage.

In [None]:
import polars as pd
print("year,female_defendant,lost,percentage")
for x in cases:
    data = pd.read_csv(paths[x])
    year = data['year'][1]

    data = data.filter(pd.col("female_defendant") == '1 female')
    a = data.shape[0]
    data = data.filter(
        (pd.col('disp_name') == 4) |
        (pd.col('disp_name') == 22) |
        (pd.col('disp_name') == 41) |
        (pd.col('disp_name') == 51) 
    )
    b = data.shape[0]

    print(year,",",a,",",b,",",(b * 100)/(a),sep="")