In [1]:
import pandas as pd 
import numpy as np

## Introduction

#### Output crimi_2021
- (combined dataframes)
- criminality for 2021 (ratio, meaning crimes proportional to population within subdistrict)
- density as a number
- population (needed to recalculate crimes ratio)
- area of subdistrict (in ha)
- green spaces

In [2]:
# LOR table: information about the districts and their surface.
# The "area" column is given in hectares (ha).

lor_path = '../Notebook/data_clean/lor.xlsx'
df = pd.read_excel(lor_path)
df.head(n=2)

Unnamed: 0,lor,subdistrict,area
0,11001,Tiergarten Süd,517
1,11002,Regierungsviertel,270


In [3]:
# Dimensions of the LOR table as (rows, columns).
df.shape

(143, 3)

In [4]:
# Total number of missing values across all columns of the LOR table.
df.isna().sum().sum()

0

In [5]:
# Crime table for 2021 (one column per crime category, see the preview below).

crime_path = '../Notebook/data_clean/crime_2021.xlsx'
crime = pd.read_excel(crime_path)
crime.head(n=2)

Unnamed: 0,lor,subdistrict,district,crime_total_2021,robbery_2021,theft_bike_2021,theft_other_2021,bulglary_2021,graffiti_2021,drug_offences_2021,hard_crime_2021,other_crime_2021
0,11001,Tiergarten Süd,Mitte,3838,35,235,1250,24,44,200,444,1606
1,11002,Regierungsviertel,Mitte,5956,33,246,1786,16,108,172,563,3032


In [6]:
# "lor" and "subdistrict" are already provided by the LOR table (`df`); remove them here
# so they are not duplicated when the tables are later concatenated column-wise.
# errors="ignore" keeps the cell re-runnable: `crime` is overwritten by its own result,
# so a second execution would otherwise raise KeyError on the already-removed labels.
crime = crime.drop(columns=["lor", "subdistrict"], errors="ignore")
crime.head(2)

Unnamed: 0,district,crime_total_2021,robbery_2021,theft_bike_2021,theft_other_2021,bulglary_2021,graffiti_2021,drug_offences_2021,hard_crime_2021,other_crime_2021
0,Mitte,3838,35,235,1250,24,44,200,444,1606
1,Mitte,5956,33,246,1786,16,108,172,563,3032


In [7]:
# Crime table without the "district" label column; `crime` itself is left unchanged.
crime_2 = crime.drop(columns="district")
crime_2.head(n=2)

Unnamed: 0,crime_total_2021,robbery_2021,theft_bike_2021,theft_other_2021,bulglary_2021,graffiti_2021,drug_offences_2021,hard_crime_2021,other_crime_2021
0,3838,35,235,1250,24,44,200,444,1606
1,5956,33,246,1786,16,108,172,563,3032


In [8]:
# density for 2021

In [9]:
# Density table with one column per year (den_2013 ... den_2021, see the preview below).
density_path = '../Notebook/data_clean/density.xlsx'
density = pd.read_excel(density_path)
density.head(n=2)

Unnamed: 0,lor,subdistrict,den_2013,den_2014,den_2015,den_2016,den_2017,den_2018,den_2019,den_2020,den_2021
0,11001,Tiergarten Süd,25.91,26.96,27.95,28.36,28.23,28.62,28.98,29.12,29.5
1,11002,Regierungsviertel,34.9,36.7,38.16,41.72,43.97,46.04,47.19,46.99,48.41


In [10]:
# Population table; only the 2021 column (pop_2021) is used further below.

population_path = '../Notebook/data_clean/population.xlsx'
population = pd.read_excel(population_path)
population.head(n=2)

Unnamed: 0,lor,subdistrict,pop_2013,pop_2014,pop_2015,pop_2016,pop_2017,pop_2018,pop_2019,pop_2020,pop_2021
0,11001,Tiergarten Süd,13395.0,13938.0,14451.0,14663.0,14594.0,14798.0,14985.0,15057.0,15249
1,11002,Regierungsviertel,9422.0,9909.0,10302.0,11265.0,11872.0,12431.0,12742.0,12688.0,13071


#### Concatenating into one dataframe
- (without green spaces yet)

In [11]:
# crime_2021_n : crime_2021 new
#
# pd.concat(..., axis=1) lines rows up by index label, not by the "lor" key, so the
# combined table is only correct when every input lists the same subdistricts in the
# same order. Check that explicitly instead of silently attaching values to the wrong row.
_parts = {"crime": crime, "crime_2": crime_2, "density": density, "population": population}

# A differing index (including a different number of rows) would make concat pad with NaN.
_bad_index = [name for name, part in _parts.items() if not part.index.equals(df.index)]
if _bad_index:
    raise ValueError(f"Row index differs from the LOR table (df) in: {_bad_index}")

# density and population carry a "lor" column, so their row order can be verified against
# df directly; compared as strings so an int/str dtype difference is not reported as a mismatch.
_lor_keys = df["lor"].astype(str).tolist()
_bad_order = [name for name in ("density", "population")
              if _parts[name]["lor"].astype(str).tolist() != _lor_keys]
if _bad_order:
    raise ValueError(f"'lor' keys are not in the same order as in df for: {_bad_order}")

crime_2021_n = pd.concat([df, crime["district"], density["den_2021"], population["pop_2021"], crime_2], axis=1)

In [12]:
# Preview the first two rows of the combined table.
crime_2021_n.head(2)

Unnamed: 0,lor,subdistrict,area,district,den_2021,pop_2021,crime_total_2021,robbery_2021,theft_bike_2021,theft_other_2021,bulglary_2021,graffiti_2021,drug_offences_2021,hard_crime_2021,other_crime_2021
0,11001,Tiergarten Süd,517,Mitte,29.5,15249,3838,35,235,1250,24,44,200,444,1606
1,11002,Regierungsviertel,270,Mitte,48.41,13071,5956,33,246,1786,16,108,172,563,3032


#### Calculating crime ratio
- So far, crimes are recorded as total counts (observations). In this step they are converted to a ratio, so that observations from different districts can be compared more fairly.
- crime_ratio = (crimes / population) * 100,000
- crime_ratio is the number of crimes per 100,000 inhabitants.
- From now on I will work only with the crime ratios for 2021. This information is stored in columns named crime_total, robbery, etc.

In [13]:
# Convert absolute crime counts into rates per 100,000 inhabitants:
#     rate = (count / pop_2021) * 100000, rounded to whole numbers.
# One loop replaces nine copy-pasted statements, so the formula lives in one place.
# Each rate is stored under the count column's name without the "_2021" suffix.
# NOTE: "bulglary" (sic) is kept exactly as spelled in the source column "bulglary_2021".
RATE_BASE = 100000
crime_columns = [
    "crime_total",
    "robbery",
    "theft_bike",
    "theft_other",
    "bulglary",
    "graffiti",
    "drug_offences",
    "hard_crime",
    "other_crime",
]
for crime_column in crime_columns:
    counts = crime_2021_n[f"{crime_column}_2021"]
    crime_2021_n[crime_column] = ((counts / crime_2021_n["pop_2021"]) * RATE_BASE).round(0)

In [14]:
# Preview the table with the newly added per-100,000 rate columns next to the raw counts.
crime_2021_n.head(2)

Unnamed: 0,lor,subdistrict,area,district,den_2021,pop_2021,crime_total_2021,robbery_2021,theft_bike_2021,theft_other_2021,...,other_crime_2021,crime_total,robbery,theft_bike,theft_other,bulglary,graffiti,drug_offences,hard_crime,other_crime
0,11001,Tiergarten Süd,517,Mitte,29.5,15249,3838,35,235,1250,...,1606,25169.0,230.0,1541.0,8197.0,157.0,289.0,1312.0,2912.0,10532.0
1,11002,Regierungsviertel,270,Mitte,48.41,13071,5956,33,246,1786,...,3032,45567.0,252.0,1882.0,13664.0,122.0,826.0,1316.0,4307.0,23196.0


In [15]:
# Removing columns with numbers of crimes

In [16]:
# The raw per-category counts are no longer needed once the rates exist; keep only the rates.
count_columns = [
    "crime_total_2021",
    "robbery_2021",
    "theft_bike_2021",
    "theft_other_2021",
    "bulglary_2021",
    "graffiti_2021",
    "drug_offences_2021",
    "hard_crime_2021",
    "other_crime_2021",
]
crime_2021_n = crime_2021_n.drop(columns=count_columns)

In [17]:
# Preview the table after the raw count columns were removed.
crime_2021_n.head(2)

Unnamed: 0,lor,subdistrict,area,district,den_2021,pop_2021,crime_total,robbery,theft_bike,theft_other,bulglary,graffiti,drug_offences,hard_crime,other_crime
0,11001,Tiergarten Süd,517,Mitte,29.5,15249,25169.0,230.0,1541.0,8197.0,157.0,289.0,1312.0,2912.0,10532.0
1,11002,Regierungsviertel,270,Mitte,48.41,13071,45567.0,252.0,1882.0,13664.0,122.0,826.0,1316.0,4307.0,23196.0


In [18]:
# Change names and order of columns
# check order of columns

In [19]:
# Give the density and population columns year-free names.
new_names = {'den_2021': 'density', 'pop_2021': 'popul'}
crime_2021_n = crime_2021_n.rename(columns=new_names)

In [20]:
# Preview the first ten rows to check the renamed columns and their order.
crime_2021_n.head(10)

Unnamed: 0,lor,subdistrict,area,district,density,popul,crime_total,robbery,theft_bike,theft_other,bulglary,graffiti,drug_offences,hard_crime,other_crime
0,11001,Tiergarten Süd,517,Mitte,29.5,15249,25169.0,230.0,1541.0,8197.0,157.0,289.0,1312.0,2912.0,10532.0
1,11002,Regierungsviertel,270,Mitte,48.41,13071,45567.0,252.0,1882.0,13664.0,122.0,826.0,1316.0,4307.0,23196.0
2,11003,Alexanderplatz,623,Mitte,95.66,59594,26454.0,258.0,1248.0,9451.0,129.0,530.0,1438.0,3101.0,10298.0
3,11004,Brunnenstraße Süd,176,Mitte,179.07,31516,10788.0,133.0,1149.0,3544.0,102.0,324.0,400.0,1088.0,4049.0
4,12005,Moabit West,431,Mitte,105.97,45674,13207.0,105.0,849.0,3339.0,151.0,217.0,1141.0,1738.0,5666.0
5,12006,Moabit Ost,392,Mitte,107.12,41991,22752.0,155.0,1041.0,6542.0,171.0,181.0,1322.0,2403.0,10938.0
6,13007,Osloer Straße,237,Mitte,159.15,37719,14823.0,207.0,581.0,5477.0,114.0,260.0,565.0,2317.0,5302.0
7,13008,Brunnenstraße Nord,332,Mitte,119.4,39640,15651.0,182.0,658.0,5252.0,159.0,520.0,838.0,2311.0,5732.0
8,14009,Parkviertel,713,Mitte,63.74,45450,12079.0,123.0,513.0,4095.0,119.0,99.0,433.0,2029.0,4669.0
9,14010,Wedding Zentrum,250,Mitte,223.15,55788,13722.0,226.0,731.0,4069.0,86.0,289.0,1007.0,2276.0,5037.0


In [22]:
# Save the final table to the current working directory; index=False leaves the row index out of the file.
crime_2021_n.to_excel('crime_2021_for_model.xlsx', index=False)