## Data cleaning - reduce complicated and less interesting cases

*Issue 1: Some files contain cases with additional follow-up reports*

**Example:**

     "title": "Die Mordkommission sucht nach unbekannter verdächtiger Person – Belohnung ausgelobt",
    "place": "Charlottenburg-Wilmersdorf",
    "date": "25.03.2023",
    "subtitle": "Gemeinsame Meldung der Polizei und der Staatsanwaltschaft Berlin",
    "data": [
        {
            "number": "Gemeinsame Meldung der Polizei und der Staatsanwaltschaft Berlin",
            "description": ""
        },
        {
            "number": "0430",
            "description": "Die 50-jährige Inhaberin eines Kosmetikstudios am Walter-Benjamin-Platz in Berlin-Charlottenburg, Oksana Romberg, wurde am Donnerstag, 1. April 2021, gegen 18 Uhr tot in ihrer Wohnung am Kurfürstendamm gefunden. Sie wurde dort am 31. März 2021 Opfer eines Tötungsdelikts. Der Täter setzte bei der Tatbegehung neben körperlicher Gewalt auch mittelgroße Kieselsteine ein und erbeutete unter anderem ein goldfarbenes Handy iPhone 12 Pro Max sowie einen Schlüsselbund mit einem roten, herzförmigen Anhänger.\n    Im Rahmen der Ermittlungen konnte das Video einer Überwachungskamera zu einem Innenhof des Walter-Benjamin-Platzes gesichert werden, auf dem am 31. März 2021, gegen 17.30 Uhr eine verdächtige Person in der Nähe des Tatorts zu sehen ist."
        },
        {
            "number": "Belohnung in Höhe von bis zu 5.000 Euro",
            "description": "Die Staatsanwaltschaft Berlin hat eine  für Hinweise zur Gewinnung von Beweismitteln, durch die der Täter überführt werden kann, ausgesetzt. Dabei ist die Belohnung ausschließlich für Personen aus der Bevölkerung bestimmt, zu deren Berufspflichten nicht die Verfolgung von Straftaten gehört. Personen, die an der Tat beteiligt waren, sind von einer Zuteilung ebenso ausgeschlossen. Die Verteilung der Belohnung findet unter Ausschluss des Rechtsweges statt."
        }
    ]


*Issue 2: We have too many cases, and some are less relevant for our use case than others, e.g. car accidents or missing persons. Therefore, we created a file with keywords that can be found in the headlines and that we would like to use to exclude cases/JSON files.*

!! **Goal: We want to exclude all cases with multiple "cases"**

**Plan:**
- 1. Loop over the folder to access the JSON files
- 2. Create new folders A and B
- 3. Count how often the key "number" occurs: if it occurs at most once, sort the file into folder A, otherwise (>= 2) into folder B
- 4. Do this for all JSON files
- 5. Second step: check for all JSON files in folders A and B whether the key "title" contains one of our "don't want" keywords
- 6. Create a new folder named "irrelevant" in each of the folders A and B
- 7. If a JSON file contains a keyword in its title, it is sorted into the "irrelevant" folder
     

-------

In [1]:
# Test for the years 2017-2023 separately # this did not work, so in the end I added all JSON files from all years to one folder

### **combined data cleaning - keywords and recurring cases - with detailed descriptions** 

In [3]:
# separation of recurring files - works
import os  # Used for file and directory operations
import json  # Used for JSON file handling
import shutil  # Used for moving files between directories

def sort_json_files(source_folder=None, folder_a='Initial_Case', folder_b='Recurring_Cases'):
    """Split case files into single-entry and multi-entry (recurring) cases.

    Every ``*.json`` file directly inside ``source_folder`` is loaded and the
    entries of its top-level ``"data"`` list that carry a ``"number"`` key are
    counted. Files with more than one such entry are moved to ``folder_b``;
    files with zero or one are moved to ``folder_a``. Files whose JSON has no
    ``"data"`` list are left where they are.

    Args:
        source_folder: Folder containing the JSON files. Defaults to
            ``<cwd>/../../raw-data/unstructured-data/all-years-in-one``.
        folder_a: Destination for cases with at most one ``"number"`` entry.
        folder_b: Destination for cases with several ``"number"`` entries.

    Raises:
        FileNotFoundError: If ``source_folder`` does not exist.
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    if source_folder is None:
        project_root = os.path.dirname(os.path.dirname(os.getcwd()))
        source_folder = os.path.join(project_root,
                                     'raw-data',
                                     'unstructured-data',
                                     'all-years-in-one')

    # Create the destination folders if they don't exist
    os.makedirs(folder_a, exist_ok=True)
    os.makedirs(folder_b, exist_ok=True)

    for file_name in os.listdir(source_folder):
        file_path = os.path.join(source_folder, file_name)
        if not (os.path.isfile(file_path) and file_name.endswith('.json')):
            continue

        # Read with an explicit encoding: the reports contain umlauts and the
        # platform default encoding is not UTF-8 everywhere.
        with open(file_path, encoding='utf-8') as file:
            data = json.load(file)
        # The file is closed before it is moved; moving a file that is still
        # open fails on Windows.

        if not (isinstance(data, dict) and isinstance(data.get('data'), list)):
            continue

        number_count = sum('number' in item for item in data['data'])
        destination_folder = folder_b if number_count > 1 else folder_a
        shutil.move(file_path, os.path.join(destination_folder, file_name))

# Run the split: moves the JSON files out of the source folder into
# 'Initial_Case' (at most one "number" entry) or 'Recurring_Cases' (several).
sort_json_files()

In [4]:
# keyword separation - works
import os
import json
import shutil

def separate_files(folder_a='Initial_Case', folder_b='Recurring_Cases',
                   folder_irrelevant='irrelevant_cases'):
    """Move cases with an excluded keyword in their title into a subfolder.

    For each of ``folder_a`` and ``folder_b``, a subfolder named
    ``folder_irrelevant`` is created (if missing). Every ``*.json`` file
    directly inside the folder is then loaded, and if it has a ``"title"``
    for which ``contains_keywords()`` returns True, the file is moved into
    that subfolder. All other files stay where they are.

    Args:
        folder_a: Folder with the non-recurring cases.
        folder_b: Folder with the recurring cases.
        folder_irrelevant: Name of the subfolder created inside each folder.

    Raises:
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    # Both folders are processed the same way, one after the other.
    for folder in (folder_a, folder_b):
        irrelevant_path = os.path.join(folder, folder_irrelevant)
        os.makedirs(irrelevant_path, exist_ok=True)

        for file_name in os.listdir(folder):
            file_path = os.path.join(folder, file_name)
            # isfile() also skips the 'irrelevant' subfolder itself.
            if not (os.path.isfile(file_path) and file_name.endswith('.json')):
                continue

            # Explicit encoding: the titles contain umlauts and the platform
            # default encoding is not UTF-8 everywhere.
            with open(file_path, encoding='utf-8') as file:
                data = json.load(file)
            # The file is closed before it is moved; moving a file that is
            # still open fails on Windows.

            if (isinstance(data, dict) and 'title' in data
                    and contains_keywords(data['title'])):
                shutil.move(file_path, os.path.join(irrelevant_path, file_name))

def contains_keywords(title, keywords=None):
    """Return True if ``title`` contains any of the exclusion keywords.

    The check is a case-insensitive substring match, so a keyword also
    matches inside compound words and inflected forms (e.g. 'Fahrer' matches
    'Radfahrer' and 'Fahrerflucht').

    Args:
        title: Headline of a case. Anything that is not a string (e.g. a
            JSON ``null``) is treated as containing no keyword.
        keywords: Optional iterable of keywords to use instead of the
            built-in list.

    Returns:
        bool: True if at least one keyword occurs in ``title``.
    """
    if keywords is None:
        keywords = (
            'Vermisst',
            'Verkehrsunfall',
            'Autofahrer',
            'Pkw',
            'Führerschein',
            'Fahrzeugführer',
            'Verfolgungsfahrt',
            'Autorennen',
            'Bus',
            'Fahrer',
            'Versammlungslage',
            'Unfall',
            'Straßenbahn',
            'Prostitution',
            'Versammlung',
        )

    if not isinstance(title, str):
        return False

    # casefold() instead of lower(): lower() keeps 'ß' as is, so an upper-case
    # headline such as 'STRASSENBAHN' would never match 'Straßenbahn'.
    folded_title = title.casefold()
    return any(keyword.casefold() in folded_title for keyword in keywords)

# Run the keyword filter on 'Initial_Case' and 'Recurring_Cases': files whose
# title matches a keyword are moved into each folder's 'irrelevant_cases' subfolder.
separate_files()


*An explanation of the code in more detail:*

- The os package is used to perform file and directory operations, such as listing files in a directory, checking if a file or directory exists, and joining file paths.
- The json package is used to handle JSON data. It provides functions like json.load() to load JSON data from a file.
- The shutil package is used to perform file operations, such as moving files between directories. It provides the shutil.move() function for moving files.


The separate_files() function is called to initiate the separation process.
The code processes files in folder A:
 - a. It iterates over the files in folder A using os.listdir().
 - b. For each entry, it checks that it is a regular file (not a subfolder such as the "irrelevant" folder) and that its name ends with '.json'.
 - c. It opens the file using open(file_path) and reads the JSON data using json.load(file).
 - d. It checks if the 'title' key exists in the JSON data and if it contains any of the specified keywords using the contains_keywords() function.
 - e. If the file meets the criteria, it moves the file to the "irrelevant" folder within folder A using shutil.move().

The code then processes files in folder B:
- a. It follows a similar process as for folder A to iterate over the files, check for JSON files, load the JSON data, and check the 'title' key for the specified keywords.
- b. If the file meets the criteria, it moves the file to the "irrelevant" folder within folder B.


The contains_keywords() function takes a title string as input and checks if any of the specified keywords exist in the title. It uses a case-insensitive comparison to match the keywords.

By using these Python packages and functions, the code checks the 'title' key of every JSON file in folders A and B (the two folders created in the previous step, based on how many "number" entries a case has). Each file whose title contains one of the keywords is moved into the "irrelevant" subfolder of its folder; all other files stay where they are.

---------------------------------------------

**to do:**
- So far there is a separate "irrelevant" folder for A and one for B; it might be nicer to combine them - DECIDED NOT TO
- Rename folders DONE
- I did not separate per year - used data was 2017-2023 (>10.000 cases ended in folder A) - NO NEED
- I did not test for different word endings of the keywords yet
- check results - DONE
    


In [36]:
# different version

In [None]:
# keyword seperation - works
# import os  # Used for file and directory operations
# import json  # Used for JSON file handling
# import shutil  # Used for moving files between directories

# def separate_files():
#     folder_a = 'A'  # The name of folder A - not recurring 
#     folder_b = 'B'  # The name of folder B - recurring cases
#     folder_irrelevant = 'irrelevant'  # The name of the "irrelevant" folder

#     # Process files in folder A

#     # Iterate over the files in folder A
#     for file_name in os.listdir(folder_a):
#         file_path = os.path.join(folder_a, file_name)  # Get the full path of the file

#         # Check if the current item is a file and ends with '.json'
#         if os.path.isfile(file_path) and file_name.endswith('.json'):
#             with open(file_path) as file:
#                 data = json.load(file)  # Load the JSON data from the file
#                 if 'title' in data and contains_keywords(data['title']):
#                     # Move the file to the 'irrelevant' folder within folder A
#                     destination_path = os.path.join(folder_a, folder_irrelevant, file_name)
#                     shutil.move(file_path, destination_path)

#     # Process files in folder B

#     # Iterate over the files in folder B
#     for file_name in os.listdir(folder_b):
#         file_path = os.path.join(folder_b, file_name)  # Get the full path of the file

#         # Check if the current item is a file and ends with '.json'
#         if os.path.isfile(file_path) and file_name.endswith('.json'):
#             with open(file_path) as file:
#                 data = json.load(file)  # Load the JSON data from the file
#                 if 'title' in data and contains_keywords(data['title']):
#                     # Move the file to the 'irrelevant' folder within folder B
#                     destination_path = os.path.join(folder_b, folder_irrelevant, file_name)
#                     shutil.move(file_path, destination_path)

# def contains_keywords(title):
#     # List of specified keywords
#     keywords = [
#         'Vermisst',
#         'Verkehrsunfall',
#         'Autofahrer',
#         'Pkw',
#         'Führerschein',
#         'Fahrzeugführer',
#         'Verfolgungsfahrt',
#         'Autorennen',
#         'Bus',
#         'Fahrer',
#         'Versammlungslage',
#         'Unfall',
#         'Straßenbahn',
#       'Prostitution',
#         'Versammlung'
#     ]

#     # Check if any of the keywords are present in the title (case-insensitive)
#     return any(keyword.lower() in title.lower() for keyword in keywords)

# # Call the function to separate the files
# separate_files()
