# Cleaning an Analysing data

Goal: the data needs to be clean so that it ca be used for the next steps
Requirements:
- Remove “U-bahn”
- Add “n” to the end of the location if it’s missing (Gesundbrunnen, Neukoeln, Tiergarten, etc. )
- timestamp (hh:mm:ss) - change e.g. "Mitternacht" and "After Mitternacht"
- date format (yyyy-mm-dd)
- Modify the last three unrelated crime : they can be deleted if A. it's a sole incident (not combined with other crimes) AND B. the number of the crime is relatively minor



In [50]:
# load csv into df
import pandas as pd
import re
import numpy as np
import os
from datetime import datetime
from datetime import timedelta
from time import strftime
from sklearn.preprocessing import MultiLabelBinarizer

In [51]:
# folder_path = '/Users/ellenlee/code/hclpush/finding-conan/raw-data/structured-data/csv-output'
# data = []
# for file in os.listdir(folder_path):
#     if file.endswith('.csv'):
#         file_path = os.path.join(folder_path, file)
#         df = pd.read_csv(file_path)
#         data.append(df)
# concatenated_df = pd.concat(data, ignore_index=True)



In [52]:
# cur_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
# output_path = f'/Users/ellenlee/code/hclpush/finding-conan/raw-data/structured-data/concacted_labeled_cases_{cur_datetime}.csv'
# concatenated_df.to_csv(output_path, index=False)

In [53]:
# df_original = pd.read_csv(output_path)
# df = df_original.copy()

In [54]:
output_path = 'concacted_labeled_cases_2023-06-13_13-45.csv'
concatenated_df = pd.read_csv(output_path)

In [55]:
df = concatenated_df.copy()

In [56]:
df.head()

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,1
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,1
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14.01,1.15 Uhr,Male,Male,1,3
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20.06,Unknown,Male,Male,1,2
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13.01,2.40 Uhr,Unknown,Male,Unknown,1


In [57]:
df.shape

(2405, 11)

In [58]:
# Number of unique_case_id before cleaning = 2303
df['unique_case_id'].nunique()

2303

## One Hot Encoding

### To be archived

In [59]:
# crime_columns = ['Homicide', 'Hate Crime - Disability', 'Hate Crime - Gender', 'Hate Crime - Gender Identity', 'Hate Crime - Religious',
#                   'Hate Crime - Sexual orientation', 'Hate Crime - Racial/Ethnicity', 'Hate Crime - Ethnicity', 'Verbal Abuse/Verbal Assault',
#                   'Property Damage', 'Drug Offenses', 'General Assault', 'Sexual Assault', 'Sexual Harassment', 'Property Crimes',
#                   'Domestic Violence', 'Missing Person', 'Traffic Incident', 'General Assault', 'Unclassified']

In [60]:
# for index, row in df.iterrows():
#     types_of_crime = str(row['type_of_crime']).split(", ")
#     for column in crime_columns:
#         df.loc[index, column] = 0
#     for crime in types_of_crime:
#         if crime in crime_columns:
#             df.loc[index, crime] = 1
# # df.to_csv('concatenated.csv', index=False)

### Try

In [61]:
df.head(2)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,1
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,1


# Clean the data 

In [62]:
# Create a new DataFrame to store the cleaned data
# cleaned_df = pd.DataFrame()
cleaned_df = df.copy()

In [63]:
cleaned_df.head(2)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,1
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,1


### Drop duplicates

In [64]:
cleaned_df.drop_duplicates(subset=['official_case_id'], keep='last', inplace=True)
cleaned_df.shape # We have 2301 unqique rows/cases

(2301, 11)

### One-hot encoding

In [65]:
def transform_strs_to_lst(strings):
    return [ele.strip() for ele in strings.split(',')]

In [66]:
cleaned_df['lst_type_of_crime'] = cleaned_df['type_of_crime'].apply(transform_strs_to_lst)
cleaned_df.head()

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders,lst_type_of_crime
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,1,[Property Damage]
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,1,[Homicide]
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14.01,1.15 Uhr,Male,Male,1,3,"[General Assault, Verbal Abuse, Property Damag..."
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20.06,Unknown,Male,Male,1,2,"[Property Crime, General Assault]"
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13.01,2.40 Uhr,Unknown,Male,Unknown,1,[Property Crime]


In [67]:
mlb = MultiLabelBinarizer()
ohe_df = cleaned_df.join(pd.DataFrame(mlb.fit_transform(cleaned_df.pop('lst_type_of_crime')),
                          columns=mlb.classes_,
                          index=cleaned_df.index))


In [68]:
ohe_df.head(2)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Verstoßes gegen das Waffengesetz,Verstöße gegen das Sprengstoffgesetz,Verwahrungsbruch,Violation of Assembly Freedom Law,Violation of Infection Protection Measures,War Crimes,Widerstand gegen Vollstreckungsbeamte,Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte,and Manufacturing of Illegal Substances,üble Nachrede und Verleumdung
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,...,0,0,0,0,0,0,0,0,0,0
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,...,0,0,0,0,0,0,0,0,0,0


In [69]:
ohe_df.shape

(2301, 148)

In [70]:
#ohe_df['Unclassified'].sum()

In [71]:
list(ohe_df.columns)

['unique_case_id',
 'official_case_id',
 'type_of_crime',
 'location',
 'year',
 'date',
 'time',
 'victim_sex',
 'offender_sex',
 'number_of_victims',
 'number_of_offenders',
 'Arso',
 'Arson',
 'Assault',
 'Assault on a Police Officer',
 'Attempted Arson',
 'Attempted Bodily Harm',
 'Attempted Burglary',
 'Attempted Evasion of Arrest',
 'Attempted Homicide',
 'Attempted Murder',
 'Attempted Nötigung',
 'Attempted Property Crime',
 'Attempted Property Damage',
 'Attempted Robbery',
 'Attempted Suicide',
 'Attempted Theft',
 'Attempted Vehicle Theft',
 'Bedrohung mit Waffen',
 'Begünstigung',
 'Beleidigung',
 'Bestechlichkeit',
 'Breach of Confidentiality',
 'Breach of Peace',
 'Bribery',
 'Burglary',
 'Computer Sabotage',
 'Distribution',
 'Driving Under the Influence',
 'Driving Without a License',
 'Driving without a License',
 'Driving without a valid license',
 'Drug Offenses',
 'Evading Police',
 'Evading Police Control',
 'Extortion',
 'Falschbeurkundung im Amt',
 'False Impriso

In [72]:
filtered_df_Unclassified = ohe_df[ohe_df['Hate Crime - Racial/Ethnicity'] >= 1]

# Print the filtered DataFrame
filtered_df_Unclassified

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Verstoßes gegen das Waffengesetz,Verstöße gegen das Sprengstoffgesetz,Verwahrungsbruch,Violation of Assembly Freedom Law,Violation of Infection Protection Measures,War Crimes,Widerstand gegen Vollstreckungsbeamte,Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte,and Manufacturing of Illegal Substances,üble Nachrede und Verleumdung
159,000013_1234844,1234844,"Property Damage, Hate Crime - Racial/Ethnicity","Köllnische Straße, Rudower Straße, Niederschön...",2022,11.08 - 13.08,11.30 Uhr - 21.00 Uhr,Unknown,Unknown,79,...,0,0,0,0,0,0,0,0,0,0
191,000011_1177075,1177075,"Verbal Abuse, Hate Crime - Racial/Ethnicity",Alt-Moabit Straße,2022,16.02,14.20 Uhr,Female,"Female, Male",1,...,0,0,0,0,0,0,0,0,0,0
224,000012_1198794,1198794,"General Assault, Verbal Abuse, Hate Crime - Ra...",U-Bahnhof Heinrich-Heine-Straße,2022,23.04,20.10 Uhr,Male,Male,1,...,0,0,0,0,0,0,0,0,0,0
289,000013_1261936,1261936,"General Assault, Verbal Abuse, Hate Crime - Ra...","Hobrechtsfelder Chaussee, Buch",2022,6.11,13.30 Uhr,Male,Male,2,...,0,0,0,0,0,0,0,0,0,0
318,000042_1259714,1259714,"General Assault, Verbal Abuse, Hate Crime - Ra...",Tempelhof-Schöneberg,2022,29.1,22.40 Uhr,Female,Male,1,...,0,0,0,0,0,0,0,0,0,0
364,000088_1256227,1256227,"Verbal Abuse, Hate Crime - Racial/Ethnicity",Mitte,2022,19.1,Unknown,Unknown,Male,Unknown,...,0,0,0,0,0,0,0,0,0,0
378,000002_1178494,1178494,"Hate Crime - Racial/Ethnicity, Verbal Abuse",Perleberger Straße,2022,19.02,17.30 Uhr,Female,Female,1,...,0,0,0,0,0,0,0,0,0,0
402,000010_1226645,1226645,"Verbal Abuse, General Assault, Hate Crime - Ra...",Friedrichstraße,2022,15.07,16.40 Uhr,Female,Male,1,...,0,0,0,0,0,0,0,0,0,0
432,000040_1224983,1224983,"Verbal Abuse, Hate Crime - Racial/Ethnicity",Dunckerstraße,2022,09.07,16.40 Uhr,Female,Male,2,...,0,0,0,0,0,0,0,0,0,0
433,000041_1224980,1224980,"Hate Crime - Racial/Ethnicity, General Assault...","Langhansstraße, Weißensee",2022,09.07,Kurz nach Mitternacht,Female,Female,1,...,0,0,0,0,0,0,0,0,0,0


In [73]:
cleaned_df_mlh = ohe_df.copy()

In [74]:
cleaned_df_mlh.head(2)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Verstoßes gegen das Waffengesetz,Verstöße gegen das Sprengstoffgesetz,Verwahrungsbruch,Violation of Assembly Freedom Law,Violation of Infection Protection Measures,War Crimes,Widerstand gegen Vollstreckungsbeamte,Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte,and Manufacturing of Illegal Substances,üble Nachrede und Verleumdung
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,...,0,0,0,0,0,0,0,0,0,0
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# List of columns to mergs
columns_to_merge_toUC = [
    'Driving Without a License',
    'Driving without a License',
    'Attempted Evasion of Arrest',
    'Attempted Nötigung',
    'Attempted Suicide',
    'Begünstigung',
    'Bestechlichkeit',
    'Breach of Confidentiality',
    'Breach of Peace',
    'Bribery', "Bribery",
    "Computer Sabotage",
    "Distribution",
    "Driving without a valid license",
    "Evading Police",
    "Evading Police Control",
    "Falschbeurkundung im Amt",
    "False Imprisonment",
    "Forgery",
    "Fraud",
    "Gefährlicher Eingriff in den Straßenverkehr",
    "Gefangenenbefreiung",
    "Hate Crime - Political",
    "Hausfriedensbruch",
    "Hehlerei",
    "Hit and Run",
    "Human Trafficking",
    "Illegal Possession of a Weapon",
    "Illegal Racing",
    "Landfriedensbruch",
    "Misuse of Emergency Services",
    "Possession",
    "Possession of a Firearm",
    "Possession of a Prohibited Weapon",
    "Possession of a Weapon",
    "Possession of Illegal Firearms",
    "Possession of Illegal Items",
    "Possession of Illegal Weapon",
    "Possession of Illegal Weapons",
    "Possession of Weapon",
    "Possession of Weapons",
    "Property Crime (Speeding)",
    "Property Damage (Firearm)",
    "Property Damage (Messer wurden sichergestellt)",
    "Property Damage (Messerstich)",
    "Property Damage (Modellwaffe aus Kunststoff)",
    "Property Damage (Pfefferspray)",
    "Property Damage (Reizstoffsprühgerät)",
    "Property Damage (Schusswaffe)",
    "Property Damage (use of a weapon)",
    "Public Intoxication",
    "Räuberische Erpressung",
    "Resisting Arrest",
    "Speeding",
    "Störung des öffentlichen Friedens durch Androhung von Straftaten",
    "Trick Theft",
    "Unknown",
    "Unlawful Possession of a Weapon",
    "Urkundenfälschung",
    "Urkundenfälschung (Impfpässe)",
    "Use of False Health Certificates",
    "Use of Symbols of Unconstitutional and Terrorist Organizations",
    "Verkehrsverstöße",
    "Verstöße gegen das Sprengstoffgesetz",
    "Verstoßes gegen das Waffengesetz",
    "Verwahrungsbruch",
    "Violation of Assembly Freedom Law",
    "Violation of Infection Protection Measures",
    "War Crimes",
    "Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte",
    'Unclassified', 
    'Bedrohung mit Waffen', 
    'Widerstand gegen Vollstreckungsbeamte',
    'Unclassified' 
    ]

columns_to_merge_to_drugO = ['and Manufacturing of Illegal Substances',
                             'Driving Under the Influence',
                             'Possession of Illegal Substances',
                             'Drug Offenses']

columns_to_merge_to_General_Assault = ['Assault', 
                                       'Assault on a Police Officer', 
                                       'Attempted Bodily Harm', 
                                       'Freiheitsberaubung',
                                       'Körperverletzung',
                                       'Körperverletzung im Amt',
                                       'General Assault']

columns_to_merge_to_Hate_CrimeGender = ['Hate Crime - Gender Identity (Anti-Transgender)',
                                        'Hate Crime Gender Identity',
                                        'Hate Crime - Gender Identity']

columns_to_merge_to_Hate_CrimeRE = ['Hate Crime - Ethnicity',
                                    'Hate Crime - Racial/Ethnicity (unknown)',
                                    'Hate Crime Ethnicity',
                                    'Hate Crime Racial',
                                    'Hate Crime - Racial/Ethnicity']

columns_to_merge_to_Hate_Reli = ['Hate Crime - Religious (Anti-Jewish)',
                                 'Hate Crime - Religious (unknown)', 
                                 'Hate Crime Religious',
                                 'Hate Crime - Religious']

columns_to_merge_toSEX = ['Hate Crime Sexual Orientatio', 
                          'Homophobia',
                          'Hate Crime - Sexual Orientation']

columns_to_merge_toHomicide = ['Attempted Murder', 
                               'Attempted Homicide',
                               'Homicide']

columns_to_merge_toPropertyC = ['Property Crimes', 
                               'Property Crime (Arson)',
                               'Property Crime (Theft)', 
                               'Property Crimes (Attempted Robbery)', 
                               'Attempted Burglary', 
                               'Attempted Property Crime',
                               'Attempted Robbery', 
                               'Attempted Theft', 
                               'Attempted Vehicle Theft', 
                               'Burglary', 
                               'Theft',
                               'Trespassing', 
                               'Unauthorized Entry',
                               'Robbery', 
                               'Robbery with a Firearm',
                               'Property Crime']

columns_to_merge_toPropertyD = ['Arso', 
                                'Arson',
                                'Attempted Arson', 
                                'Attempted Property Damage',
                                'Property Theft',
                                'Sachbeschädigung', 
                                'Vandalism',
                                'Property Damage']

columns_to_merge_toSA = ['Sexual Assault']

columns_to_merge_toSH = ['Sexual Harassment']

columns_to_merge_toVerbalAS = ['Threatening', 
                              'Threatening with a Weapon', 
                              'Threatening with Weapons',
                              'Threats', 
                              'Verbal Abuse', 
                              'Beleidigung', 
                              'Extortion', 
                              'üble Nachrede und Verleumdung',
                               #'Verbal Assault'
                              ]



In [76]:
cleaned_df_mlh.head(2)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Verstoßes gegen das Waffengesetz,Verstöße gegen das Sprengstoffgesetz,Verwahrungsbruch,Violation of Assembly Freedom Law,Violation of Infection Protection Measures,War Crimes,Widerstand gegen Vollstreckungsbeamte,Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte,and Manufacturing of Illegal Substances,üble Nachrede und Verleumdung
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,...,0,0,0,0,0,0,0,0,0,0
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,...,0,0,0,0,0,0,0,0,0,0


In [77]:
cleaned_df_mlh["Unclassified"] = cleaned_df_mlh[columns_to_merge_toUC].sum(axis = 1)

In [78]:
cleaned_df_mlh["Drug Offenses"] = cleaned_df_mlh[columns_to_merge_to_drugO].sum(axis = 1)

In [79]:
cleaned_df_mlh["General Assault"] = cleaned_df_mlh[columns_to_merge_to_General_Assault].sum(axis = 1)

In [80]:
cleaned_df_mlh["Hate Crime - Gender Identity"] = cleaned_df_mlh[columns_to_merge_to_Hate_CrimeGender].sum(axis = 1)

In [81]:
cleaned_df_mlh["Hate Crime - Racial/Ethnicity"] = cleaned_df_mlh[columns_to_merge_to_Hate_CrimeRE].sum(axis = 1)

In [82]:
cleaned_df_mlh["Hate Crime - Religious"] = cleaned_df_mlh[columns_to_merge_to_Hate_Reli].sum(axis = 1)

In [83]:
cleaned_df_mlh["Hate Crime - Sexual Orientation"] = cleaned_df_mlh[columns_to_merge_toSEX].sum(axis = 1)

In [84]:
cleaned_df_mlh["Homicide"] = cleaned_df_mlh[columns_to_merge_toHomicide].sum(axis = 1)

In [85]:
cleaned_df_mlh["Property Crime"] = cleaned_df_mlh[columns_to_merge_toPropertyC].sum(axis = 1)

In [86]:
cleaned_df_mlh["Property Damage"] = cleaned_df_mlh[columns_to_merge_toPropertyD].sum(axis = 1)

In [87]:
cleaned_df_mlh["Sexual Assault"] = cleaned_df_mlh[columns_to_merge_toSA].sum(axis = 1)

In [88]:
cleaned_df_mlh["Sexual Harassment"] = cleaned_df_mlh[columns_to_merge_toSH].sum(axis = 1)

In [89]:
cleaned_df_mlh["Verbal Assault"] = cleaned_df_mlh[columns_to_merge_toVerbalAS].sum(axis = 1)

In [90]:
cleaned_df_mlh.head(1)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Verstöße gegen das Sprengstoffgesetz,Verwahrungsbruch,Violation of Assembly Freedom Law,Violation of Infection Protection Measures,War Crimes,Widerstand gegen Vollstreckungsbeamte,Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte,and Manufacturing of Illegal Substances,üble Nachrede und Verleumdung,Verbal Assault
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# List of columns to drop
columns_to_merge_toUC = [
    'Driving Without a License',
    'Driving without a License',
    'Attempted Evasion of Arrest',
    'Attempted Nötigung',
    'Attempted Suicide',
    'Begünstigung',
    'Bestechlichkeit',
    'Breach of Confidentiality',
    'Breach of Peace',
    'Bribery', "Bribery",
    "Computer Sabotage",
    "Distribution",
    "Driving without a valid license",
    "Evading Police",
    "Evading Police Control",
    "Falschbeurkundung im Amt",
    "False Imprisonment",
    "Forgery",
    "Fraud",
    "Gefährlicher Eingriff in den Straßenverkehr",
    "Gefangenenbefreiung",
    "Hate Crime - Political",
    "Hausfriedensbruch",
    "Hehlerei",
    "Hit and Run",
    "Human Trafficking",
    "Illegal Possession of a Weapon",
    "Illegal Racing",
    "Landfriedensbruch",
    "Misuse of Emergency Services",
    "Possession",
    "Possession of a Firearm",
    "Possession of a Prohibited Weapon",
    "Possession of a Weapon",
    "Possession of Illegal Firearms",
    "Possession of Illegal Items",
    "Possession of Illegal Weapon",
    "Possession of Illegal Weapons",
    "Possession of Weapon",
    "Possession of Weapons",
    "Property Crime (Speeding)",
    "Property Damage (Firearm)",
    "Property Damage (Messer wurden sichergestellt)",
    "Property Damage (Messerstich)",
    "Property Damage (Modellwaffe aus Kunststoff)",
    "Property Damage (Pfefferspray)",
    "Property Damage (Reizstoffsprühgerät)",
    "Property Damage (Schusswaffe)",
    "Property Damage (use of a weapon)",
    "Public Intoxication",
    "Räuberische Erpressung",
    "Resisting Arrest",
    "Speeding",
    "Störung des öffentlichen Friedens durch Androhung von Straftaten",
    "Trick Theft",
    "Unknown",
    "Unlawful Possession of a Weapon",
    "Urkundenfälschung",
    "Urkundenfälschung (Impfpässe)",
    "Use of False Health Certificates",
    "Use of Symbols of Unconstitutional and Terrorist Organizations",
    "Verkehrsverstöße",
    "Verstöße gegen das Sprengstoffgesetz",
    "Verstoßes gegen das Waffengesetz",
    "Verwahrungsbruch",
    "Violation of Assembly Freedom Law",
    "Violation of Infection Protection Measures",
    "War Crimes",
    "Widerstand gegen und tätlichen Angriffs auf Vollstreckungsbeamte",
    'Bedrohung mit Waffen', 
    'Widerstand gegen Vollstreckungsbeamte',
    ]

columns_to_merge_to_drugO = ['and Manufacturing of Illegal Substances',
                             'Driving Under the Influence',
                             'Possession of Illegal Substances']

columns_to_merge_to_General_Assault = ['Assault', 
                         'Assault on a Police Officer', 
                         'Attempted Bodily Harm', 
                         'Freiheitsberaubung',
                         'Körperverletzung',
                         'Körperverletzung im Amt']

columns_to_merge_to_Hate_CrimeGender = ['Hate Crime - Gender Identity (Anti-Transgender)',
                                        'Hate Crime Gender Identity']

columns_to_merge_to_Hate_CrimeRE = ['Hate Crime - Ethnicity',
                                    'Hate Crime - Racial/Ethnicity (unknown)',
                                    'Hate Crime Ethnicity',
                                    'Hate Crime Racial']

columns_to_merge_to_Hate_Reli = ['Hate Crime - Religious (Anti-Jewish)',
                                 'Hate Crime - Religious (unknown)', 
                                 'Hate Crime Religious']

columns_to_merge_toSEX = ['Hate Crime Sexual Orientatio', 
                          'Homophobia']

columns_to_merge_toHomocide = ['Attempted Murder', 
                               'Attempted Homicide']

columns_to_merge_toPropertyC = ['Property Crimes', 
                               'Property Crime (Arson)',
                               'Property Crime (Theft)', 
                               'Property Crimes (Attempted Robbery)', 
                               'Attempted Burglary', 
                               'Attempted Property Crime',
                               'Attempted Robbery', 
                               'Attempted Theft', 
                               'Attempted Vehicle Theft', 
                               'Burglary', 
                               'Theft',
                               'Trespassing', 
                               'Unauthorized Entry',
                               'Robbery', 
                               'Robbery with a Firearm']

columns_to_merge_toPropertyD = ['Arso', 
                                'Arson',
                                'Attempted Arson', 
                                'Attempted Property Damage',
                                'Property Theft',
                                'Sachbeschädigung', 
                                'Vandalism']


columns_to_merge_toVerbalAS = ['Threatening', 
                              'Threatening with a Weapon', 
                              'Threatening with Weapons',
                              'Threats', 
                              'Verbal Abuse', 
                              'Beleidigung', 
                              'Extortion', 
                              'üble Nachrede und Verleumdung'
                              ]



In [92]:
cleaned_df_mlh.drop(columns=columns_to_merge_toUC, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_to_drugO, inplace=True) 
cleaned_df_mlh.drop(columns=columns_to_merge_to_General_Assault, inplace=True) 
cleaned_df_mlh.drop(columns=columns_to_merge_to_Hate_CrimeGender, inplace=True) 
cleaned_df_mlh.drop(columns=columns_to_merge_to_Hate_CrimeRE, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_to_Hate_Reli, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_toSEX, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_toHomocide, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_toPropertyC, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_toPropertyD, inplace=True)
#cleaned_df_mlh.drop(columns=columns_to_merge_toVerbalAb, inplace=True)
cleaned_df_mlh.drop(columns=columns_to_merge_toVerbalAS, inplace=True)
#cleaned_df_mlh.drop(columns=columns_to_merge_toSA, inplace=True)

In [94]:
cleaned_df_mlh.head(4)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Racial/Ethnicity,Hate Crime - Religious,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,2.15 Uhr,"Female, Male",Male,3,...,0,0,0,0,0,1,0,0,0,0
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00 Uhr,Male,Male,1,...,0,0,0,1,0,0,0,0,0,0
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14.01,1.15 Uhr,Male,Male,1,...,0,0,0,0,0,1,0,0,0,1
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20.06,Unknown,Male,Male,1,...,0,0,0,0,1,0,0,0,0,0


In [97]:
cleaned_filtered_df = cleaned_df_mlh.copy()

In [99]:
cleaned_filtered_df[cleaned_filtered_df['Property Crime'] >= 1]

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Racial/Ethnicity,Hate Crime - Religious,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20.06,Unknown,Male,Male,1,...,0,0,0,0,1,0,0,0,0,0
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13.01,2.40 Uhr,Unknown,Male,Unknown,...,0,0,0,0,1,0,0,0,0,0
5,000001_1318967,1318967,"General Assault, Property Crime, Verbal Abuse","Erwin-Barth-Platz, Charlottenburg-Wilmersdorf",2023,29.04,13.40 Uhr,Male,Male,1,...,0,0,0,0,1,0,0,0,0,1
7,000003_1317835,1317835,"Property Crime, Hehlerei","Wilhelmstadt, Sandstraße",2023,27.04,22 Uhr,Unknow,Male,1,...,0,0,0,0,1,0,0,0,1,0
12,000008_1317038,1317038,"Property Crime, General Assault","Keithstraße, Tempelhof-Schöneberg",2023,25.04,17 Uhr,"Male, Female",Male,2,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,000002_1327802,1327802,"Property Crimes, Drug Offenses",Neuköll,2023,24.05,23.00 Uhr,Unknow,Male,Unknow,...,0,0,0,0,1,0,0,0,0,0
2398,000005_1327344,1327344,"General Assault, Verbal Abuse, Robbery",Marzahn-Hellersdorf,2023,23.05,23.45 Uhr,Male,Male,1,...,0,0,0,0,1,0,0,0,0,1
2399,000006_1327305,1327305,"Robbery with a Firearm, Property Crimes",Reinickendorf,2023,23.05,7.30 Uhr,Male,Unknow,1,...,0,0,0,0,2,0,0,0,0,0
2402,000009_1327174,1327174,"General Assault, Verbal Abuse, Property Crimes...",Steglitz-Zehlendorf,2023,23.05,12.30 Uhr,Male,Male,2,...,0,0,0,0,1,0,0,0,0,1


In [628]:
#list(cleaned_df_mlh.columns)

In [629]:
#cleaned_df = ohe_df.copy


### Combine' Hate Crime - Racial/Ethnicity' and 'Hate Crime - Ethnicity'

In [100]:
cleaned_df = cleaned_filtered_df.copy()

In [631]:
#not needed anymore
# conditions = [(cleaned_df['Hate Crime - Racial/Ethnicity'] == 0) & (cleaned_df['Hate Crime - Ethnicity'] == 0),
#               (cleaned_df['Hate Crime - Racial/Ethnicity'] == 1) & (cleaned_df['Hate Crime - Ethnicity'] == 0),
#               (cleaned_df['Hate Crime - Racial/Ethnicity'] == 0) & (cleaned_df['Hate Crime - Ethnicity'] == 1),
#               (cleaned_df['Hate Crime - Racial/Ethnicity'] == 1) & (cleaned_df['Hate Crime - Ethnicity'] == 1)]
# choices = [0, 1, 1, 1]
# cleaned_df['updated_re'] = np.select(conditions, choices)

In [632]:
# Quality check
#cleaned_df.loc[(cleaned_df['Hate Crime - Racial/Ethnicity'] == 1) & (cleaned_df['Hate Crime - Ethnicity'] == 0)].head() 

In [633]:
# Replace the original column
#cleaned_df = cleaned_df.drop(columns=['Hate Crime - Racial/Ethnicity', 'Hate Crime - Ethnicity'])
#cleaned_df['Hate Crime - Racial/Ethnicity'] = cleaned_df['updated_re']
# cleaned_df.loc[cleaned_df['updated_re'] ==0] # Quality check

In [634]:
#cleaned_df = cleaned_df.drop(columns=['updated_re'])

In [635]:
#cleaned_df.columns

In [101]:
# Task 1: Convert 'Mitternacht' to '00:00'
cleaned_df['time'] = df['time'].replace('Mitternacht', '00.00')

In [102]:
# Task 2: Convert 'kurz nach Mitternacht' to '00:05'
cleaned_df['time'] = df['time'].replace('kurz nach Mitternacht', '00.05')

# Task 3: Convert 'Nachmittag' to '15:00'
cleaned_df['time'] = cleaned_df['time'].replace('Nachmittag', '15.00')

# Task 3: Convert 'Nacht' to '00:00', 'Mitta' to 15.00, 'Nach' zu 00.00 and 'Vormi' zu 11.00
cleaned_df['time'] = cleaned_df['time'].replace('Nacht', '00.00')
cleaned_df['time'] = cleaned_df['time'].replace('Mitta', '15.00')
cleaned_df['time'] = cleaned_df['time'].replace('Nach', '00.00')
cleaned_df['time'] = cleaned_df['time'].replace('Vormi', '11.00')

# Task 4: Remove the second time in a timespan
cleaned_df['time'] = cleaned_df['time'].apply(lambda x: x.split('-')[0].strip() if isinstance(x, str) and '-' in x else x)

In [103]:
# Task 5: Remove 'Uhr' from the time values
cleaned_df['time'] = cleaned_df['time'].str.replace(' Uhr', '')

In [104]:
# Task 6 Add '.00' to times without minutes
cleaned_df['time'] = cleaned_df['time'].apply(lambda x: x + '.00' if isinstance(x, str) and re.match(r'^\d+(:\d+)?$', x) else x)

In [105]:
# Task 7: Remove the second time in a timespan
cleaned_df['date'] = cleaned_df['date'].apply(lambda x: x.split('-')[0].strip() if isinstance(x, str) and '-' in x else x)
cleaned_df['date'] = cleaned_df['date'].apply(lambda x: x.split('&')[0].strip() if isinstance(x, str) and '&' in x else x)
cleaned_df['date'] = cleaned_df['date'].apply(lambda x: x.split(',')[0].strip() if isinstance(x, str) and ',' in x else x)

In [106]:
# Task 8: split and add 0 date
cleaned_df['date'] = cleaned_df['date'].apply(lambda x: ('0' + x) if len(x.split('.')[0]) < 2 else (x.split('.')[0]+'.0'+x.split('.')[1]) if len(x.split('.')[1]) < 2 else ('0'+x.split('.')[0]+'.0'+x.split('.')[1]) if (len(x.split('.')[0]) < 2 and len(x.split('.')[1]) < 2) else x)

In [107]:
# Task 9: split and add 0 tim3
cleaned_df['time'] = cleaned_df['time'].apply(lambda x: ('0' + x) if len(x.split('.')) > 1 and len(x.split('.')[0]) < 2 else (x.split('.')[0]+'.0'+x.split('.')[1]) if len(x.split('.')) > 1 and len(x.split('.')[1]) < 2 else ('0'+x.split('.')[0]+'.0'+x.split('.')[1]) if len(x.split('.')) > 1 and len(x.split('.')[0]) < 2 and len(x.split('.')[1]) < 2 else x)



In [108]:
# Check status
cleaned_df['unique_case_id'].nunique()
cleaned_df.head(10)
cleaned_df.shape

(2301, 26)

In [109]:
#add back sexual harassment, unclassifid
cleaned_df.columns

Index(['unique_case_id', 'official_case_id', 'type_of_crime', 'location',
       'year', 'date', 'time', 'victim_sex', 'offender_sex',
       'number_of_victims', 'number_of_offenders', 'Drug Offenses',
       'General Assault', 'Hate Crime - Disability', 'Hate Crime - Gender',
       'Hate Crime - Gender Identity', 'Hate Crime - Racial/Ethnicity',
       'Hate Crime - Religious', 'Hate Crime - Sexual Orientation', 'Homicide',
       'Property Crime', 'Property Damage', 'Sexual Assault',
       'Sexual Harassment', 'Unclassified', 'Verbal Assault'],
      dtype='object')

## Timestamp and date

In [110]:
cleaned_df.head(5)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Racial/Ethnicity,Hate Crime - Religious,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14.01,02.15,"Female, Male",Male,3,...,0,0,0,0,0,1,0,0,0,0
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,2.02,06.00,Male,Male,1,...,0,0,0,1,0,0,0,0,0,0
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14.01,01.15,Male,Male,1,...,0,0,0,0,0,1,0,0,0,1
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20.06,Unknown,Male,Male,1,...,0,0,0,0,1,0,0,0,0,0
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13.01,02.40,Unknown,Male,Unknown,...,0,0,0,0,1,0,0,0,0,0


In [111]:
# Replace "." with "-" in the 'date' column
cleaned_df['date'] = cleaned_df['date'].str.replace('.', '-')

# Create datetime column
cleaned_df['datetime'] = pd.to_datetime(cleaned_df.year + '-' + 
                                        cleaned_df.date + ' ' +
                                        cleaned_df.time, 
                                        format='%Y-%d-%m %H.%M',
                                        errors='coerce' # replace any invalid dates with NaT, but be aware some have date
                                       )
# Create date column
cleaned_df['year_date'] = pd.to_datetime(cleaned_df.year + '-' + 
                                        cleaned_df.date,
                                        format='%Y-%d-%m', 
                                        errors='coerce' # replace any invalid dates with NaT, but be aware some have date
                                       )


In [112]:
cleaned_df.head()

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault,datetime,year_date
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14-01,02.15,"Female, Male",Male,3,...,0,0,0,1,0,0,0,0,2022-01-14 02:15:00,2022-01-14
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,02-02,06.00,Male,Male,1,...,0,1,0,0,0,0,0,0,2022-02-02 06:00:00,2022-02-02
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14-01,01.15,Male,Male,1,...,0,0,0,1,0,0,0,1,2022-01-14 01:15:00,2022-01-14
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20-06,Unknown,Male,Male,1,...,0,0,1,0,0,0,0,0,NaT,2022-06-20
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13-01,02.40,Unknown,Male,Unknown,...,0,0,1,0,0,0,0,0,2022-01-13 02:40:00,2022-01-13


In [113]:
# Get number of NA for datetime (some might have date data)
cleaned_df['datetime'].isna().sum()

219

# Add the missing "n"

In [114]:
# Examine whether n is missing
print('Number of Gesundbrunne? ' + f"{cleaned_df['location'].str.contains('Gesundbrunne$', regex=True).sum()}")
print('Number of Neuköll? ' + f"{cleaned_df['location'].str.contains('Neuköll$', regex=True).sum()}")
print('Number of Tiergarte? ' + f"{cleaned_df['location'].str.contains('Tiergarte$', regex=True).sum()}")
print('Number of Berli? ' + f"{cleaned_df['location'].str.contains('Berli$', regex=True).sum()}")

Number of Gesundbrunne? 3
Number of Neuköll? 7
Number of Tiergarte? 3
Number of Berli? 2


In [115]:
import re
# List of words to modify
words_to_modify = ['Gesundbrunne', 'Neuköll', 'Tiergarte', 'Berli']

# Define the regular expression pattern
pattern = r'\b(' + '|'.join(words_to_modify) + r')\b'

# Update the values in the 'location' column using regular expressions
cleaned_df['location'] = cleaned_df['location'].apply(lambda x: re.sub(pattern, r'\1n', x))

# Print the updated dataframe
cleaned_df.head(5)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault,datetime,year_date
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14-01,02.15,"Female, Male",Male,3,...,0,0,0,1,0,0,0,0,2022-01-14 02:15:00,2022-01-14
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,02-02,06.00,Male,Male,1,...,0,1,0,0,0,0,0,0,2022-02-02 06:00:00,2022-02-02
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14-01,01.15,Male,Male,1,...,0,0,0,1,0,0,0,1,2022-01-14 01:15:00,2022-01-14
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20-06,Unknown,Male,Male,1,...,0,0,1,0,0,0,0,0,NaT,2022-06-20
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13-01,02.40,Unknown,Male,Unknown,...,0,0,1,0,0,0,0,0,2022-01-13 02:40:00,2022-01-13


In [116]:
# Examine whether still exists
print('Number of Gesundbrunne? ' + f"{cleaned_df['location'].str.contains('Gesundbrunne$', regex=True).sum()}")
print('Number of Neuköll? ' + f"{cleaned_df['location'].str.contains('Neuköll$', regex=True).sum()}")
print('Number of Tiergarte? ' + f"{cleaned_df['location'].str.contains('Tiergarte$', regex=True).sum()}")
print('Number of Berli? ' + f"{cleaned_df['location'].str.contains('Berli$', regex=True).sum()}")

Number of Gesundbrunne? 0
Number of Neuköll? 0
Number of Tiergarte? 0
Number of Berli? 0


# Remove U-Bahnhof

In [117]:
# Remove 'U-Bahnhof' from values in the 'location' column
cleaned_df['location'] = cleaned_df['location'].str.replace('U-Bahnhof', '')
cleaned_df['location'] = cleaned_df['location'].str.replace('S-Bahnhof', '')
cleaned_df['location'] = cleaned_df['location'].str.replace('U-Bahn', '')
cleaned_df['location'] = cleaned_df['location'].str.replace('S-Bahn', '')

In [118]:
# Print the updated dataframe
cleaned_df.head(20)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault,datetime,year_date
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14-01,02.15,"Female, Male",Male,3,...,0,0,0,1,0,0,0,0,2022-01-14 02:15:00,2022-01-14
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,02-02,06.00,Male,Male,1,...,0,1,0,0,0,0,0,0,2022-02-02 06:00:00,2022-02-02
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14-01,01.15,Male,Male,1,...,0,0,0,1,0,0,0,1,2022-01-14 01:15:00,2022-01-14
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20-06,Unknown,Male,Male,1,...,0,0,1,0,0,0,0,0,NaT,2022-06-20
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13-01,02.40,Unknown,Male,Unknown,...,0,0,1,0,0,0,0,0,2022-01-13 02:40:00,2022-01-13
5,000001_1318967,1318967,"General Assault, Property Crime, Verbal Abuse","Erwin-Barth-Platz, Charlottenburg-Wilmersdorf",2023,29-04,13.40,Male,Male,1,...,0,0,1,0,0,0,0,1,2023-04-29 13:40:00,2023-04-29
6,000002_1318965,1318965,"Property Damage, General Assault",Seydelstraße bis zur Alte-Jakob-Straße,2023,29-04,Mitternacht,Unknow,Male,Unknow,...,0,0,0,1,0,0,0,0,NaT,2023-04-29
7,000003_1317835,1317835,"Property Crime, Hehlerei","Wilhelmstadt, Sandstraße",2023,27-04,22.00,Unknow,Male,1,...,0,0,1,0,0,0,1,0,2023-04-27 22:00:00,2023-04-27
8,000004_1317465,1317465,"Property Damage, Hate Crime - Gender Identity",Friedhof in Lichtenberg,2023,26-04,17.45,Transgender,Unknow,1,...,0,0,0,1,0,0,0,0,2023-04-26 17:45:00,2023-04-26
9,000005_1317344,1317344,Property Damage,"Wedding, Togostraße",2023,26-04,23.00,Female,Unknow,1,...,0,0,0,1,0,0,0,0,2023-04-26 23:00:00,2023-04-26


# Save Cleaned data in new CSV

In [119]:
# Copy Dataframe
final_df = cleaned_df.copy()

In [120]:
final_df.head(5)

Unnamed: 0,unique_case_id,official_case_id,type_of_crime,location,year,date,time,victim_sex,offender_sex,number_of_victims,...,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault,datetime,year_date
0,000001_1167412,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022,14-01,02.15,"Female, Male",Male,3,...,0,0,0,1,0,0,0,0,2022-01-14 02:15:00,2022-01-14
1,000002_1167410,1167410,Homicide,Kühnemannstraße,2022,02-02,06.00,Male,Male,1,...,0,1,0,0,0,0,0,0,2022-02-02 06:00:00,2022-02-02
2,000003_1167332,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022,14-01,01.15,Male,Male,1,...,0,0,0,1,0,0,0,1,2022-01-14 01:15:00,2022-01-14
3,000004_1167270,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",2022,20-06,Unknown,Male,Male,1,...,0,0,1,0,0,0,0,0,NaT,2022-06-20
4,000005_1167075,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022,13-01,02.40,Unknown,Male,Unknown,...,0,0,1,0,0,0,0,0,2022-01-13 02:40:00,2022-01-13


In [121]:
# drop not needed columns: unique_case_id
final_df = final_df.drop(columns=['unique_case_id', 'year', 'date'])
final_df.head()

Unnamed: 0,official_case_id,type_of_crime,location,time,victim_sex,offender_sex,number_of_victims,number_of_offenders,Drug Offenses,General Assault,...,Hate Crime - Sexual Orientation,Homicide,Property Crime,Property Damage,Sexual Assault,Sexual Harassment,Unclassified,Verbal Assault,datetime,year_date
0,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",02.15,"Female, Male",Male,3,1,0,0,...,0,0,0,1,0,0,0,0,2022-01-14 02:15:00,2022-01-14
1,1167410,Homicide,Kühnemannstraße,06.00,Male,Male,1,1,0,0,...,0,1,0,0,0,0,0,0,2022-02-02 06:00:00,2022-02-02
2,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,01.15,Male,Male,1,3,1,1,...,0,0,0,1,0,0,0,1,2022-01-14 01:15:00,2022-01-14
3,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",Unknown,Male,Male,1,2,0,1,...,0,0,1,0,0,0,0,0,NaT,2022-06-20
4,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",02.40,Unknown,Male,Unknown,1,0,0,...,0,0,1,0,0,0,0,0,2022-01-13 02:40:00,2022-01-13


In [122]:
final_df.columns = map(str.lower, final_df.columns)
final_df.columns = [col.replace(' ', '_').replace('_-_', '-') for col in final_df.columns]

In [123]:
final_df.columns

Index(['official_case_id', 'type_of_crime', 'location', 'time', 'victim_sex',
       'offender_sex', 'number_of_victims', 'number_of_offenders',
       'drug_offenses', 'general_assault', 'hate_crime-disability',
       'hate_crime-gender', 'hate_crime-gender_identity',
       'hate_crime-racial/ethnicity', 'hate_crime-religious',
       'hate_crime-sexual_orientation', 'homicide', 'property_crime',
       'property_damage', 'sexual_assault', 'sexual_harassment',
       'unclassified', 'verbal_assault', 'datetime', 'year_date'],
      dtype='object')

In [124]:
arranged_col_lst = [
    'official_case_id', 'type_of_crime', 'location', 
    'datetime', 'year_date',
    'time', 
    'victim_sex', 'offender_sex', 'number_of_victims', 'number_of_offenders', 'homicide',
   'hate_crime-disability', 'hate_crime-gender',
   'hate_crime-gender_identity', 'hate_crime-religious',
   'hate_crime-sexual_orientation', 'hate_crime-racial/ethnicity',
    'verbal_assault',
   'property_damage', 'drug_offenses', 'general_assault', 'sexual_assault',
   'sexual_harassment', 'property_crime', 
#         'domestic_violence','missing_person', 'traffic_incident', 
        'unclassified'
       
                    ]

final_df = final_df[arranged_col_lst] 

In [126]:
final_df['verbal_assault'].sum()

542

In [127]:
final_df

Unnamed: 0,official_case_id,type_of_crime,location,datetime,year_date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders,...,hate_crime-sexual_orientation,hate_crime-racial/ethnicity,verbal_assault,property_damage,drug_offenses,general_assault,sexual_assault,sexual_harassment,property_crime,unclassified
0,1167412,Property Damage,"Blankenburger Pflasterweg, Heinersdorfer Straße",2022-01-14 02:15:00,2022-01-14,02.15,"Female, Male",Male,3,1,...,0,0,0,1,0,0,0,0,0,0
1,1167410,Homicide,Kühnemannstraße,2022-02-02 06:00:00,2022-02-02,06.00,Male,Male,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1167332,"General Assault, Verbal Abuse, Property Damage...",Rathausstraße,2022-01-14 01:15:00,2022-01-14,01.15,Male,Male,1,3,...,0,0,1,1,1,1,0,0,0,0
3,1167270,"Property Crime, General Assault","Gropiusstadt, Neukölln",NaT,2022-06-20,Unknown,Male,Male,1,2,...,0,0,0,0,0,1,0,0,1,0
4,1167075,Property Crime,"Hauptstraße, Tempelhof-Schöneberg",2022-01-13 02:40:00,2022-01-13,02.40,Unknown,Male,Unknown,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2400,1327301,"Sexual Assault, Verbal Abuse",Lichtenberg,NaT,2023-05-23,Unknow,Unknow,Male,Unknow,1,...,0,0,1,0,0,0,1,0,0,0
2401,1327284,"General Assault, Verbal Abuse, Hate Crime Racial","Max-Josef-Metzger-Platz, Wedding",2023-05-23 18:30:00,2023-05-23,18.30,Male and Female,Male,7,1,...,0,1,1,0,0,1,0,0,0,0
2402,1327174,"General Assault, Verbal Abuse, Property Crimes...",Steglitz-Zehlendorf,2023-05-23 12:30:00,2023-05-23,12.30,Male,Male,2,1,...,0,0,1,0,0,2,0,0,1,0
2403,1327172,"General Assault, Property Damage","Dolgenseestraße, Lichtenberg",2023-05-23 13:00:00,2023-05-23,13.00,Male and Female,Unknow,40,Unknow,...,0,0,0,1,0,1,0,0,0,0


In [128]:
cur_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
final_df.to_csv(f"/home/marjal/code/hclpush/finding-conan/data-preparation/final-crime-data_{cur_datetime}.csv")


## Remove Berlin-wide cases

In [129]:
wip_df = final_df.copy()

In [130]:
# How many Berlinweit do we have
wip_df.loc[wip_df['location'] == 'Berlinweit'].shape[0] 

35

In [131]:
# How many Berlin do we have
wip_df.loc[wip_df['location'] == 'Berlin'].shape[0]

1

In [132]:
wip_df = wip_df.loc[wip_df['location'] != 'Berlinweit']

In [133]:
# How many Berlinweit do we have
wip_df.loc[wip_df['location'] == 'Berlinweit'].shape[0] 

0

In [134]:
# How many Berlin do we have
wip_df = wip_df.loc[wip_df['location'] != 'Berlin']

In [135]:
# How many Berlinweit do we have
wip_df.loc[wip_df['location'] == 'Berlinweit'].shape[0] 

0

In [136]:
wip_df.shape

(2265, 25)

## Adding timeslot

In [137]:
# Sort df
wip_df = wip_df.sort_values(by=['datetime'])

# Remove data outside 2021/9/13 and 2022/5/24
start_time = datetime(2021, 9, 13, 0, 0)  # Starting datetime
end_time = datetime(2023, 5, 25, 0, 0)  # Ending datetime
wip_df = wip_df.loc[(wip_df['datetime'] >= start_time) & (wip_df['datetime'] <= end_time)].reset_index(drop=True)
wip_df.head()

Unnamed: 0,official_case_id,type_of_crime,location,datetime,year_date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders,...,hate_crime-sexual_orientation,hate_crime-racial/ethnicity,verbal_assault,property_damage,drug_offenses,general_assault,sexual_assault,sexual_harassment,property_crime,unclassified
0,1125354,Property Damage,"Bugenhagenstraße, Moabit",2021-09-13 00:30:00,2021-09-13,0.3,Unknown,Male,0,3,...,0,0,0,1,0,0,0,0,0,0
1,1125574,Property Damage,"Mierendorffplatz, Charlottenburg-Wilmersdorf",2021-09-13 05:00:00,2021-09-13,5.0,Unknown,Unknown,0,Unknown,...,0,0,0,1,0,0,0,0,0,0
2,1125707,"General Assault, Property Crime","Parkanlage am Brunnenplatz, Gropiusstraße, Bad...",2021-09-13 08:30:00,2021-09-13,8.3,Male,Male,1,4,...,0,0,0,0,0,1,0,0,1,0
3,1125352,Property Damage,Platz der Republik,2021-09-13 13:40:00,2021-09-13,13.4,Unknown,Male,Unknown,1,...,0,0,0,1,0,0,0,0,0,0
4,1125567,"General Assault, Resisting Arrest",Axel-Springer-Straße,2021-09-13 17:15:00,2021-09-13,17.15,Male,Male,1,2,...,0,0,0,0,0,1,0,0,0,1


In [138]:
def get_time_slot(tval):
    """ Return the label associated with the timestamp """
    labels = ['Midnight', 'Early Morning', 'Morning', 'Afternoon', 'Evening', 'Night']
    slot_start = [(0, 0), (4, 0), (8, 0), (12, 0), (16, 0), (20, 0)]
    for lidx, tme in enumerate(slot_start):
        if tme[0] > tval.hour:
            return labels[lidx-1]
        elif tval.hour == tme[0] and tme[1] <= tval.minute:
            return labels[lidx]
    return labels[-1]  

In [139]:
wip_df['time_slot'] = wip_df.apply(lambda row: get_time_slot(row.datetime), axis=1)

In [140]:
wip_df[['datetime', 'time_slot']]

Unnamed: 0,datetime,time_slot
0,2021-09-13 00:30:00,Midnight
1,2021-09-13 05:00:00,Early Morning
2,2021-09-13 08:30:00,Morning
3,2021-09-13 13:40:00,Afternoon
4,2021-09-13 17:15:00,Evening
...,...,...
1939,2023-05-23 19:00:00,Evening
1940,2023-05-23 23:45:00,Night
1941,2023-05-24 17:00:00,Evening
1942,2023-05-24 17:45:00,Evening


In [141]:
wip_df.shape

(1944, 26)

In [142]:
# cur_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
# wip_df.to_csv(f"/Users/ellenlee/code/hclpush/finding-conan/raw-data/structured-data/final-crime-data_{cur_datetime}.csv")


In [143]:
wip_df.head(5)

Unnamed: 0,official_case_id,type_of_crime,location,datetime,year_date,time,victim_sex,offender_sex,number_of_victims,number_of_offenders,...,hate_crime-racial/ethnicity,verbal_assault,property_damage,drug_offenses,general_assault,sexual_assault,sexual_harassment,property_crime,unclassified,time_slot
0,1125354,Property Damage,"Bugenhagenstraße, Moabit",2021-09-13 00:30:00,2021-09-13,0.3,Unknown,Male,0,3,...,0,0,1,0,0,0,0,0,0,Midnight
1,1125574,Property Damage,"Mierendorffplatz, Charlottenburg-Wilmersdorf",2021-09-13 05:00:00,2021-09-13,5.0,Unknown,Unknown,0,Unknown,...,0,0,1,0,0,0,0,0,0,Early Morning
2,1125707,"General Assault, Property Crime","Parkanlage am Brunnenplatz, Gropiusstraße, Bad...",2021-09-13 08:30:00,2021-09-13,8.3,Male,Male,1,4,...,0,0,0,0,1,0,0,1,0,Morning
3,1125352,Property Damage,Platz der Republik,2021-09-13 13:40:00,2021-09-13,13.4,Unknown,Male,Unknown,1,...,0,0,1,0,0,0,0,0,0,Afternoon
4,1125567,"General Assault, Resisting Arrest",Axel-Springer-Straße,2021-09-13 17:15:00,2021-09-13,17.15,Male,Male,1,2,...,0,0,0,0,1,0,0,0,1,Evening


In [145]:
cur_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
wip_df.to_csv(f"/home/marjal/code/hclpush/finding-conan/data-preparation/final-crime-data_{cur_datetime}.csv")