# Cleaning RIPA Datasets
Jupyter can't handle importing all of the uncleaned RIPA data at once, so each yearly dataset needs to be cleaned individually first. The cleaned versions are then uploaded to GitHub and imported into my Data Exploration notebook.

In [1]:
# Import statements
import pandas as pd
import numpy as np

In [2]:
# Cleaning pipeline
def clean_ripa_data(df_name, year, chunksize=100_000):
    """
    Filter one raw RIPA csv down to discretionary stops and export the result.

    The file is read in chunks, only the columns listed in `relevant_columns`
    are parsed, and only rows whose REASON_FOR_STOP is one of the
    discretionary codes are kept. The cleaned table is written to
    'cleaned_ripa_orange_{year}.csv' in the current working directory.

    df_name: Name of the csv file, eg "ripa_orange_2022.csv"
    year: Year that the dataset is from, for naming purposes when exporting
    chunksize: Number of rows to read per chunk (defaults to 100000)

    Returns None. Raises KeyError if a relevant column is missing from the file.
    """

    # Keep only the columns relevant to our analysis
    relevant_columns = [
        "RAE_FULL",  # Race
        "REASON_FOR_STOP",
        "ADS_SEARCH_PERSON",
        "ADS_SEARCH_PROPERTY",
        "CED_NONE_CONTRABAND",
        "CED_DRUGS",
        "CED_WEAPON",
        "CED_ALCOHOL",
        "CED_STOLEN_PROP",
        "CED_FIREARM",
        "CED_AMMUNITION",
        "CED_DRUG_PARAPHERNALIA",
        "CED_MONEY",
        "CED_OTHER_CONTRABAND",
    ]
    wanted = set(relevant_columns)

    # Only keep rows that have a discretionary reason for stop
    discretionary_stops = [1, 2, 6, 8]

    # Load data in chunks
    cleaned_chunks = []

    # A callable `usecols` makes the parser skip every other column (less
    # memory, no dtype guessing on columns we throw away) while still letting
    # the column selection below raise KeyError if a relevant column is absent.
    reader = pd.read_csv(
        df_name,
        chunksize=chunksize,
        usecols=lambda column: column in wanted,
    )
    for chunk in reader:
        # `usecols` returns columns in file order; re-select to fix the order
        chunk = chunk[relevant_columns]

        # A chunk containing any non-numeric stop code is parsed as strings,
        # and "1" never matches the integer 1. Compare on a numeric view of the
        # column so those rows are not silently dropped; the stored values
        # themselves are left untouched.
        reason_codes = pd.to_numeric(chunk["REASON_FOR_STOP"], errors="coerce")
        chunk = chunk[reason_codes.isin(discretionary_stops)]
        cleaned_chunks.append(chunk)
    cleaned_df = pd.concat(cleaned_chunks, ignore_index=True)

    # Export cleaned table
    export_name = f'cleaned_ripa_orange_{year}.csv'
    cleaned_df.to_csv(export_name, index=False)

    print(f'Saved cleaned file as {export_name}')

In [4]:
# Years to clean in this run. Only 2022 is processed here; add "2021" or
# "2023" to the list to clean those raw files as well.
years_to_clean = ["2022"]
for ripa_year in years_to_clean:
    clean_ripa_data(f"ripa_orange_{ripa_year}.csv", ripa_year)

  for chunk in pd.read_csv(df_name, chunksize=100000):
  for chunk in pd.read_csv(df_name, chunksize=100000):


Saved cleaned file as cleaned_ripa_orange_2022.csv
