In [1]:
import glob
import pathlib as pl
import re

import pandas as pd

from notebooks.preprocessing.data_manipulation import transform_one_df

### 1) Combine all the pks datasets

In [5]:
# Output directory for the processed (combined) tables.
PATH_TO_PROCESSED_DATA = pl.Path('../../../datasets/processed')

# Directory holding the raw police-statistics workbooks (one file per year and table type).
PATH_TO_RAW_PKS = '../../../datasets/raw/police_stats'

# glob.glob returns matches in arbitrary (filesystem-dependent) order. The year is
# embedded in each file name, so sorting the paths puts every list in ascending
# year order regardless of platform.
cases_basic_table_files = sorted(glob.glob(f'{PATH_TO_RAW_PKS}/pks20[0-9][0-9]CasesBasicTable_excel.xlsx'))
cases_development_files = sorted(glob.glob(f'{PATH_TO_RAW_PKS}/pks20[0-9][0-9]CasesDevelopment_excel.xlsx'))
suspects_files = sorted(glob.glob(f'{PATH_TO_RAW_PKS}/pks20[0-9][0-9]Suspects_excel.xlsx'))

# Sanity check: number of workbooks found per table type.
len(cases_basic_table_files), len(cases_development_files), len(suspects_files)

(10, 10, 10)

 - First, combine the distinct yearly tables into 3 tables (one per table type) that each span all the years.

#### 1.1 PKS basic table

In [6]:
# Offence labels to keep from the basic table. The dataset contains hundreds of
# offence types and there is no time to analyse all of them, so only a hand-picked
# subset is selected here. More cleaning follows later, when the modelling starts.
#
# Within each group only some of the available variants are listed.
#
# NOTE: every entry must end with a comma. Python silently concatenates adjacent
# string literals, so a missing comma merges two labels into one string that
# matches nothing.
interesting_values_basic_table = [
    "total offences",

    # Murder and its variants (presumably as defined by German law).
    "offences against life",
    "murder (sect. 211 PC)",
    "robbery attended with murder",
    "sexual murder",

    # Rape and its variants.
    "offences against sexual self-determination",
    "offences against sexual self-determination with use of violence or exploiting a state of dependence (sects. 174, 174a, 174b, 174c, 177, 178 PC)",
    "rape by sudden attack (individual offender) (sect. 177 (2) no. 1, (3 and 4) PC)",
    "rape by sudden attack (group of offenders) (sect. 177 (2) no. 2 PC)",

    # Sexual abuse.
    "sexual abuse (sects. 176, 176a, 176b, 179, 182, 183, 183a PC)",
    "sexual abuse of children (sects. 176, 176a, 176b PC)",

    "pimping (sect. 181a PC)",

    # Robbery.
    "robbery, extortion resembling robbery, and assault on motorists with intent to rob (sects. 249-252, 255, 316a PC)",
    "serious robbery of financial institutions (banks/savings banks) (sect. 250 PC)",
    "handbag robbery",

    "bodily injury (sects. 223-227, 229, 231 PC)",

    "kidnapping, child abduction, trafficking in children (sects. 234-236 PC)",

    "stalking (sect. 238 PC)",

    # Theft.
    "total number of thefts, namely:",
    "simple theft of bicycles",
    "simple theft of motor vehicles",
    "simple theft of non-cash means of payment",

    "serious shoplifting",

    "daytime burglary of a residence",

    "property and forgery offences",
    "fraud (sects. 263, 263a, 264, 264a, 265, 265a, 265b PC)",

    # Drugs.
    "unauthorised trafficking in, and smuggling of drugs under sect. 29 NCA",
    "unauthorised trafficking, production of, dispensing, and possession of drugs (Sect. 29a (1) no. 2 NCA) (significant amount)",

    "street crime",
]

# Excel column letter -> readable label for the basic-table columns that are kept.
# The letters are joined into the `usecols` argument of `pd.read_excel`; the labels
# become the DataFrame's column names (insertion order matters for both).
useful_columns_basic_pks = dict(
    B="offence or offence category",
    C="recorded cases",
    G="less than 20,000 inhabitants",
    H="20,000 up to 100,000 inhabitants",
    I="100,000 up to 500,000 inhabitants",
    J="500,000 inhabitants and up",
    Q="male",
    R="female",
    S="number of non-German suspects",
)

# Try the transformation on a single workbook first (the first matched basic-table
# file, labelled as year 2014). The 8 leading sheet rows (the original header) are
# skipped, only the selected Excel columns are read, and the readable labels are
# assigned as the column names.
specifier_key = "offence or offence category"

df = pd.read_excel(
    cases_basic_table_files[0],
    header=None,
    skiprows=8,
    usecols=','.join(useful_columns_basic_pks),
)
df.columns = list(useful_columns_basic_pks.values())

df_transformed = transform_one_df(df, specifier_key, interesting_values_basic_table, 2014)
df_transformed


Unnamed: 0,year,total offences: recorded cases,"total offences: less than 20,000 inhabitants","total offences: 20,000 up to 100,000 inhabitants","total offences: 100,000 up to 500,000 inhabitants","total offences: 500,000 inhabitants and up",total offences: male,total offences: female,total offences: number of non-German suspects,offences against life: recorded cases,...,"unauthorised trafficking, production of, dispensing, and possession of drugs (Sect. 29a (1) no. 2 NCA) (significant amount): female","unauthorised trafficking, production of, dispensing, and possession of drugs (Sect. 29a (1) no. 2 NCA) (significant amount): number of non-German suspects",street crime: recorded cases,"street crime: less than 20,000 inhabitants","street crime: 20,000 up to 100,000 inhabitants","street crime: 100,000 up to 500,000 inhabitants","street crime: 500,000 inhabitants and up",street crime: male,street crime: female,street crime: number of non-German suspects
0,2014,6082064.0,1404723.0,1602590.0,1261205.0,1755141.0,1597241.0,552263.0,617392.0,2962.0,...,1186.0,2638.0,1342905.0,266329.0,350467.0,287668.0,434587.0,173095.0,22656.0,48851.0


In [8]:
def combine_(table, df2):
    """Append ``df2`` below ``table`` and return the result.

    If ``table`` is ``None`` (nothing accumulated yet), ``df2`` itself is returned
    unchanged. Otherwise the two frames are concatenated row-wise and the row
    index of the result is renumbered from 0.
    """
    if table is None:
        return df2
    stacked = pd.concat([table, df2], axis=0, ignore_index=True)
    return stacked

def combine_pks_tables(fnames: list[str], interesting_rows: list, interesting_cols: dict, specifier_key: str) -> pd.DataFrame:
    """Read one workbook per year and stack the transformed frames into one table.

    Parameters
    ----------
    fnames
        Paths of the yearly Excel workbooks to read. The year is taken from the
        ``pks<YYYY>`` part of each file name, so the order of ``fnames`` does not
        affect the year labels. If a name has no such part, the year falls back
        to ``2014 + position`` in ``fnames``.
    interesting_rows
        Forwarded unchanged to ``transform_one_df`` (presumably the offence labels
        to keep - confirm against that helper).
    interesting_cols
        Mapping of Excel column letter -> column label. The keys select which
        sheet columns are read; the values become the column names.
    specifier_key
        Forwarded unchanged to ``transform_one_df`` (presumably the name of the
        column holding the offence label - confirm against that helper).

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of the per-file results of ``transform_one_df``,
        in the order of ``fnames``, with the row index renumbered from 0.

    Raises
    ------
    ValueError
        If ``fnames`` is empty.
    """
    if not fnames:
        raise ValueError("combine_pks_tables: no input files given (did the glob pattern match anything?)")

    # Loop-invariant: which Excel columns to read and what to call them.
    excel_columns = ','.join(interesting_cols)
    column_labels = list(interesting_cols.values())

    yearly_frames = []
    for i, fname in enumerate(fnames):
        print(fname)

        # Take the year from the last path component instead of trusting list order.
        year_match = re.search(r'pks(\d{4})[^/\\]*$', str(fname))
        year = int(year_match.group(1)) if year_match else 2014 + i

        # Read the file that was actually passed in, not a module-level list.
        # The 8 leading sheet rows are skipped and no header row is inferred.
        df_i = pd.read_excel(fname, header=None, skiprows=8, usecols=excel_columns)
        df_i.columns = column_labels

        yearly_frames.append(transform_one_df(df_i, specifier_key, interesting_rows, year))

    # A single concat avoids repeatedly copying the growing table; ignore_index
    # already yields a clean 0..n-1 index.
    return pd.concat(yearly_frames, axis=0, ignore_index=True)




Note: some of the older datasets do not contain all the data, so the ends of their rows are filled with NaN values.
This is not a problem, as these are really niche signal-variable subtypes. The important types, such as "offences against life", "murder", or "robbery attended with murder", are at the top. These are really the data we need.

In [10]:

# Combine every matched basic-table workbook into one table, write it to the
# processed-data directory as 'final.csv', and display it.
basic_table = combine_pks_tables(
    fnames=cases_basic_table_files,
    interesting_rows=interesting_values_basic_table,
    interesting_cols=useful_columns_basic_pks,
    specifier_key="offence or offence category",
)
basic_table.to_csv(PATH_TO_PROCESSED_DATA.joinpath('final.csv'))
basic_table

../../../datasets/raw/police_stats\pks2014CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2015CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2016CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2017CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2018CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2019CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2020CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2021CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2022CasesBasicTable_excel.xlsx
../../../datasets/raw/police_stats\pks2023CasesBasicTable_excel.xlsx


Unnamed: 0,year,total offences: recorded cases,"total offences: less than 20,000 inhabitants","total offences: 20,000 up to 100,000 inhabitants","total offences: 100,000 up to 500,000 inhabitants","total offences: 500,000 inhabitants and up",total offences: male,total offences: female,total offences: number of non-German suspects,offences against life: recorded cases,...,street crime: female,street crime: number of non-German suspects,pimping (sect. 181a PC): recorded cases,"pimping (sect. 181a PC): less than 20,000 inhabitants","pimping (sect. 181a PC): 20,000 up to 100,000 inhabitants","pimping (sect. 181a PC): 100,000 up to 500,000 inhabitants","pimping (sect. 181a PC): 500,000 inhabitants and up",pimping (sect. 181a PC): male,pimping (sect. 181a PC): female,pimping (sect. 181a PC): number of non-German suspects
0,2014,6082064.0,1404723.0,1602590.0,1261205.0,1755141.0,1597241.0,552263.0,617392.0,2962.0,...,22656.0,48851.0,,,,,,,,
1,2015,6330649.0,1463274.0,1675207.0,1297402.0,1812853.0,1781388.0,587648.0,911864.0,2991.0,...,21552.0,53834.0,,,,,,,,
2,2016,6372526.0,1590491.0,1663537.0,1222724.0,1813547.0,1767739.0,593067.0,953744.0,3242.0,...,21958.0,59423.0,,,,,,,,
3,2017,5761984.0,1291748.0,1542409.0,1131226.0,1682438.0,1586137.0,526578.0,736265.0,3227.0,...,21177.0,61756.0,,,,,,,,
4,2018,5555520.0,1263346.0,1483814.0,1106157.0,1600300.0,1541130.0,510136.0,708380.0,3254.0,...,20470.0,60934.0,,,,,,,,
5,2019,5436401.0,1243451.0,1451426.0,1061629.0,1569157.0,1514667.0,504544.0,699261.0,3054.0,...,20800.0,58496.0,136.0,19.0,32.0,28.0,54.0,120.0,24.0,83.0
6,2020,5310621.0,1224799.0,1409479.0,1034848.0,1517467.0,1481252.0,488365.0,663199.0,3289.0,...,20532.0,55959.0,137.0,28.0,21.0,29.0,51.0,125.0,18.0,82.0
7,2021,5047860.0,1170651.0,1350051.0,973017.0,1419707.0,1419594.0,472409.0,639127.0,2980.0,...,19693.0,51911.0,113.0,15.0,27.0,29.0,40.0,101.0,13.0,64.0
8,2022,5628584.0,1304904.0,1508727.0,1099424.0,1572975.0,1565240.0,528542.0,783876.0,3077.0,...,21199.0,61251.0,,,,,,,,
9,2023,5940667.0,1331638.0,1588511.0,1200417.0,1696162.0,1675541.0,571226.0,923269.0,3083.0,...,22339.0,70075.0,,,,,,,,


#### 1.2 PKS Cases Development
**Note:** this table actually overlaps with the basic table, which means we don't need to transform it at all - we simply won't use it.

In [6]:
# Peek at the first Cases Development workbook: skip the 16 leading sheet rows,
# infer no header row, and read only Excel columns C, D and H.
df = pd.read_excel(
    cases_development_files[0],
    header=None,
    skiprows=16,
    usecols='C,D,H',
)
df

Unnamed: 0,offence or offence category,recorded cases,clearance rate %
0,total offences,6082064,54.9
1,offences against life,2962,93.2
2,murder (sect. 211 PC),664,95.3
3,other types of murder,602,95.5
4,robbery attended with murder,44,93.2
...,...,...,...
1024,environmental offences pursuant to supplementa...,14890,75.3
1025,street crime,1342905,16.5
1026,damage to property by graffiti in total,95160,18.8
1027,illegal entry/illegal stay under the Aliens Ac...,135743,99.8


#### 1.3 Suspects

A suspect is not necessarily a criminal - we only suspect them of doing something against the law - so we do not need this table either.
So in the end we only need the basic table from the pks datasets.

It also contains the sex of the felon. The only thing it does not contain is their age.

In [12]:
# Peek at the first Suspects workbook: skip the 9 leading sheet rows, infer no
# header row, and read only Excel columns B, C, D, O and X.
df = pd.read_excel(
    suspects_files[0],
    header=None,
    skiprows=9,
    usecols='B,C,D,O,X',
)
df

Unnamed: 0,offence or offence category,sex,total suspects number,children under 21,adults 21>
0,total offences,m %,7.430742e+01,73.020783,7.464900e+01
1,total offences,f %,2.569258e+01,26.979217,2.535100e+01
2,total offences,x %,1.000000e+02,20.978607,7.902139e+01
3,total offences,m,1.597241e+06,329277.000000,1.267964e+06
4,total offences,f,5.522630e+05,121659.000000,4.306040e+05
...,...,...,...,...,...
3085,illegal entry/illegal stay under the Aliens Ac...,f,3.405200e+04,7525.000000,2.652700e+04
3086,illegal entry/illegal stay under the Aliens Ac...,x,1.314580e+05,33110.000000,9.834800e+04
3087,hightec/computer crime in the narrower sense,m,9.106000e+03,1274.000000,7.832000e+03
3088,hightec/computer crime in the narrower sense,f,2.664000e+03,353.000000,2.311000e+03
