**Data Transformation Notebook**

<div style="font-family: system-ui; padding: 20px 30px 20px 20px; background-color: #FFFFFF; border-left: 8px solid #ED9255; border-radius: 8px; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);max-width:600px;color:#212121;">

- ðŸ‘¤ Name: Lethokuhle Sikosana
- ðŸŽ¯ Purpose: Conduct Statistical Analysis on SAPS Crime Data from 2008 - 2013

<span style="display:block;line-height:1.15em;color:#666666;font-size:0.9em;">
</span>

</div>

## Imports

In [27]:
import pandas as pd
import numpy as np

## Loading the Data

In [28]:
crime_data = pd.read_csv('data/sapacr-2008-2023-v1.1.csv', encoding='latin1')
#Check if converted to DataFrame correctly
crime_data.head(3)

Unnamed: 0,year,station,loc_mn,dc_mn,longitude,latitude,other_theft,arson,assault_gbh,attempted_murder,...,cash_transit_robbery,aggr_robbery,sexual_assault,sexual_offences,police_detected_sexoff,shoplifting,stock_theft,vehicle_theft,theft_from_vehicle,truck_hijacking
0,2008/2009,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,491,3,570,25,...,1,293,7,119,0,218,0,143,272,0
1,2009/2010,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,452,5,625,9,...,1,276,13,115,0,185,0,124,300,0
2,2010/2011,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,525,1,586,15,...,0,238,23,101,0,164,0,96,256,0


## Data Transformations

### Dropping Rows with Missing Data

In [29]:
crime_data = crime_data.dropna(how='any')

### Creating Crime Categories and Totalling Them

In [30]:
#Creating columns using the SAPS defined crime categories
crime_data['contact_related_crime'] = crime_data['arson'] + crime_data['malicious_damage']

crime_data['crimes_against_property'] = (
    crime_data[['burglary_res', 'burglary_nonres', 'vehicle_theft', 'theft_from_vehicle', 'stock_theft']].sum(axis=1)
)

crime_data['contact_crime'] = (
    crime_data[['murder', 'attempted_murder', 'sexual_offences', 'assault_gbh', 'common_assault', 'common_robbery', 'aggr_robbery']].sum(axis=1)
)

crime_data['other_serious_crimes'] = (
    crime_data[['other_theft', 'commercial_crime', 'shoplifting']].sum(axis=1)
)

crime_data['crimes_dependent_on_police_action_for_detention'] = (
    crime_data[['illegal_firearms', 'dui', 'drug_crime', 'police_detected_sexoff']].sum(axis=1)
)

#Creating a total crime column based on the sum of these categories (this is citizen reported crime)
crime_data['total_crime'] = crime_data[
    ['crimes_against_property', 'contact_crime', 'other_serious_crimes', 'crimes_dependent_on_police_action_for_detention']
].sum(axis=1)

#Creating a total crime excluding police dependent reports (essentially a measure of police activity)
crime_data['total_crime_excl_police'] = (
    crime_data['total_crime'] - crime_data['crimes_dependent_on_police_action_for_detention']
)

#Check if transformed DataFrame is correct
crime_data.head(3)

Unnamed: 0,year,station,loc_mn,dc_mn,longitude,latitude,other_theft,arson,assault_gbh,attempted_murder,...,vehicle_theft,theft_from_vehicle,truck_hijacking,contact_related_crime,crimes_against_property,contact_crime,other_serious_crimes,crimes_dependent_on_police_action_for_detention,total_crime,total_crime_excl_police
0,2008/2009,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,491,3,570,25,...,143,272,0,239,745,1606,870,360,3581,3221
1,2009/2010,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,452,5,625,9,...,124,300,0,257,690,1639,791,569,3689,3120
2,2010/2011,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,525,1,586,15,...,96,256,0,209,666,1543,936,346,3491,3145


### Adding the columns with the natural log of each crime category

In [31]:
# Transforming crime category columns to logs 
# Adding 1 in case values are 0 (log of zero is undefined)

crime_data['crimes_against_property_log'] = np.log(
    crime_data['crimes_against_property'] + 1
)
crime_data['contact_crime_log'] = np.log(
    crime_data['contact_crime'] + 1
)
crime_data['contact_related_crime_log'] = np.log(
    crime_data['contact_related_crime'] + 1
)
crime_data['other_serious_crimes_log'] = np.log(
    crime_data['other_serious_crimes'] + 1
)
crime_data['crimes_dependent_on_police_action_for_detention_log'] = np.log(
    crime_data['crimes_dependent_on_police_action_for_detention'] + 1
)
crime_data['total_crime_log'] = np.log(
    crime_data['total_crime'] + 1
)
crime_data['total_crime_excl_police_log'] = np.log(
    crime_data['total_crime_excl_police'] + 1
)

### Creating the Columns for the DiD Model

In [32]:
# Define treatment (Rustenburg Local Municipality = 1, Rest of South Africa = 0)
crime_data['treated_group'] = (crime_data['loc_mn'] == 'rustenburg').astype(int)

# Get main year of the fiscal year for readability
crime_data['year_numeric'] = crime_data['year'].str[:4].astype(int)

# Define treatment period (year 2012/2013 or later)
crime_data['treatment_year'] = (
    (crime_data['year_numeric'] >= 2012) &
    (crime_data['year_numeric'] <= 2015)
).astype(int)

crime_data.head(5)

Unnamed: 0,year,station,loc_mn,dc_mn,longitude,latitude,other_theft,arson,assault_gbh,attempted_murder,...,crimes_against_property_log,contact_crime_log,contact_related_crime_log,other_serious_crimes_log,crimes_dependent_on_police_action_for_detention_log,total_crime_log,total_crime_excl_police_log,treated_group,year_numeric,treatment_year
0,2008/2009,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,491,3,570,25,...,6.614726,7.382124,5.480639,6.769642,5.888878,8.183677,8.077758,0,2008,0
1,2009/2010,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,452,5,625,9,...,6.53814,7.402452,5.55296,6.674561,6.345636,8.213382,8.045909,0,2009,0
2,2010/2011,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,525,1,586,15,...,6.50279,7.342132,5.347108,6.842683,5.849325,8.15823,8.053887,0,2010,0
3,2011/2012,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,559,2,591,18,...,6.459904,7.339538,5.361292,6.766192,6.25575,8.178919,8.021256,0,2011,0
4,2012/2013,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,562,0,556,29,...,6.684612,7.395722,5.572154,6.715383,6.287859,8.239857,8.087025,0,2012,1


## Saving the New DataFrame for Other's Replication

In [33]:
#Save as a CSV for easier analysis later
crime_data.to_csv('data/expanded_crime_data.csv', 
    index=False)