**Statistical Analysis Notebook (Difference-in-Differences Regressions)**

<div style="font-family: system-ui; padding: 20px 30px 20px 20px; background-color: #FFFFFF; border-left: 8px solid #ED9255; border-radius: 8px; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);max-width:600px;color:#212121;">

- 📅 Date: 19 November 2025
- 👤 Name: Lethokuhle Sikosana
- 🎯 Purpose: Conduct statistical analysis on SAPS crime data from 2008 to 2013

<span style="display:block;line-height:1.15em;color:#666666;font-size:0.9em;">
</span>

</div>

## Imports

In [11]:
from pathlib import Path

import pandas as pd
import statsmodels.formula.api as smf

## Loading the Data

In [12]:
# Load the prepared station-level crime panel used by every regression below.
# The first CSV column is the row index that was written out when the file was
# saved; reading it as the index keeps it from coming back as a stray
# `Unnamed: 0` data column.
rustenburg_municipality_df = pd.read_csv(
    'data/rustenburg_municipality_crime_data.csv',
    index_col=0,
)
# Preview the first rows to confirm the file parsed into the expected columns.
rustenburg_municipality_df.head(5)

Unnamed: 0,year,station,total_crime,contact_related_crime,crimes_against_property,other_serious_crimes,contact_crime,crimes_dependent_on_police_action_for_detention,total_crime_excl_police,crimes_against_property_log,contact_crime_log,contact_related_crime_log,other_serious_crimes_log,crimes_dependent_on_police_action_for_detention_log,total_crime_log,total_crime_excl_police_log,treated,year_numeric,post,did
0,2008/2009,bethanie,1129,82,249,265,550,65,1064,5.521461,6.311735,4.418841,5.583496,4.189655,7.029973,6.97073,0,2008,0,0
1,2008/2009,boitekong,1666,98,411,149,741,365,1301,6.021023,6.609349,4.59512,5.010635,5.902633,7.418781,7.171657,0,2008,0,0
2,2008/2009,boons,244,15,88,81,60,15,229,4.488636,4.110874,2.772589,4.406719,2.772589,5.501258,5.438079,0,2008,0,0
3,2008/2009,lethabong,573,34,131,68,227,147,426,4.882802,5.429346,3.555348,4.234107,4.997212,6.352629,6.056784,0,2008,0,0
4,2008/2009,marikana,1640,119,354,451,657,178,1462,5.872118,6.489205,4.787492,6.113682,5.187386,7.403061,7.288244,1,2008,0,0


## DiD Regressions

## Model 1 (Police Model)

In [13]:
# Police model: OLS of the logged police-detected crime count on the DiD
# interaction term, the post/treated indicators, year and station dummies, and
# the logged counts of the other crime categories.
# NOTE(review): if `post` varies only by year and `treated` only by station,
# both are collinear with the C(year_numeric) / C(station) dummies and only the
# `did` coefficient is separately identified -- confirm before interpreting them.
police_model_formula = 'crimes_dependent_on_police_action_for_detention_log ~ did + post + treated + C(year_numeric) + C(station) + contact_related_crime_log + crimes_against_property_log + other_serious_crimes_log + contact_crime_log'
police_model_spec = smf.ols(
    formula=police_model_formula,
    data=rustenburg_municipality_df,
)
# HC1 requests heteroscedasticity-robust standard errors.
model1 = police_model_spec.fit(cov_type='HC1')

In [14]:
# Render the full regression table as plain text so it can be reviewed outside
# the notebook.
full_summary1 = model1.summary().as_text()

# Create the output folder on demand so the cell also works on a fresh checkout
# (Restart & Run All), and pin the encoding so the file does not depend on the
# platform default.
police_summary_path = Path('model-summaries/police_model.txt')
police_summary_path.parent.mkdir(parents=True, exist_ok=True)
with police_summary_path.open('w', encoding='utf-8') as summary_file:
    summary_file.write(full_summary1)



## Model 2 (Parallel Trends Check - Police Model)

In [15]:
# Restrict the panel to observations whose year_numeric is before 2012; this
# subset is reused by both parallel-trends checks.
is_pre_treatment = rustenburg_municipality_df['year_numeric'].lt(2012)
pre_treatment_df = rustenburg_municipality_df.loc[is_pre_treatment]

# Parallel-trends check for the police model: same outcome and controls as
# Model 1, with `treated*year_numeric` replacing the DiD terms. The interaction
# `treated:year_numeric` captures a differential linear trend for treated
# stations within this subset.
# NOTE(review): the `treated` and `year_numeric` main effects look collinear with
# the C(station) / C(year_numeric) dummies -- confirm only the interaction is read.
police_trend_formula = 'crimes_dependent_on_police_action_for_detention_log ~ treated*year_numeric + C(station) + C(year_numeric) + contact_related_crime_log + crimes_against_property_log + other_serious_crimes_log + contact_crime_log'
police_trend_spec = smf.ols(
    formula=police_trend_formula,
    data=pre_treatment_df,
)
# HC1 requests heteroscedasticity-robust standard errors.
model2 = police_trend_spec.fit(cov_type='HC1')

In [16]:
# Render the full regression table as plain text so it can be reviewed outside
# the notebook.
full_summary2 = model2.summary().as_text()

# Create the output folder on demand so the cell also works on a fresh checkout
# (Restart & Run All), and pin the encoding so the file does not depend on the
# platform default.
police_trend_summary_path = Path('model-summaries/parallel_trends(police_model).txt')
police_trend_summary_path.parent.mkdir(parents=True, exist_ok=True)
with police_trend_summary_path.open('w', encoding='utf-8') as summary_file:
    summary_file.write(full_summary2)



## Model 3 (Citizen Model)

In [17]:
# Citizen model: OLS of logged total crime excluding police-detected crime on
# the DiD interaction term, the post/treated indicators, year and station
# dummies, and the logged police-detected crime count as a control.
# NOTE(review): if `post` varies only by year and `treated` only by station,
# both are collinear with the C(year_numeric) / C(station) dummies and only the
# `did` coefficient is separately identified -- confirm before interpreting them.
citizen_model_formula = 'total_crime_excl_police_log ~ did + post + treated + C(year_numeric) + C(station) + crimes_dependent_on_police_action_for_detention_log'
citizen_model_spec = smf.ols(
    formula=citizen_model_formula,
    data=rustenburg_municipality_df,
)
# HC1 requests heteroscedasticity-robust standard errors.
model3 = citizen_model_spec.fit(cov_type='HC1')

In [18]:
# Render the full regression table as plain text so it can be reviewed outside
# the notebook.
full_summary3 = model3.summary().as_text()

# Create the output folder on demand so the cell also works on a fresh checkout
# (Restart & Run All), and pin the encoding so the file does not depend on the
# platform default.
citizen_summary_path = Path('model-summaries/citizen_model.txt')
citizen_summary_path.parent.mkdir(parents=True, exist_ok=True)
with citizen_summary_path.open('w', encoding='utf-8') as summary_file:
    summary_file.write(full_summary3)



## Model 4 (Parallel Trends Check - Citizen Model)

In [19]:
# Parallel-trends check for the citizen model, fitted on the pre-treatment
# subset only.
# The outcome is `total_crime_excl_police_log`, the same dependent variable as
# the citizen model (Model 3), so the pre-trend test applies to the outcome that
# model actually estimates. `total_crime_log` is not used here because, in the
# displayed rows, total_crime equals total_crime_excl_police plus the
# police-detected count, which would put the control variable inside the outcome.
# The interaction `treated:year_numeric` captures a differential linear trend
# for treated stations within this subset.
# NOTE(review): the `treated` and `year_numeric` main effects look collinear with
# the C(station) / C(year_numeric) dummies -- confirm only the interaction is read.
model4 = smf.ols(
    'total_crime_excl_police_log ~ treated*year_numeric + C(station) + C(year_numeric) + crimes_dependent_on_police_action_for_detention_log',
    data=pre_treatment_df
).fit(cov_type='HC1')  # HC1 requests heteroscedasticity-robust standard errors

In [20]:
# Render the full regression table as plain text so it can be reviewed outside
# the notebook.
full_summary4 = model4.summary().as_text()

# Create the output folder on demand so the cell also works on a fresh checkout
# (Restart & Run All), and pin the encoding so the file does not depend on the
# platform default.
citizen_trend_summary_path = Path('model-summaries/parallel_trends(citizen_model).txt')
citizen_trend_summary_path.parent.mkdir(parents=True, exist_ok=True)
with citizen_trend_summary_path.open('w', encoding='utf-8') as summary_file:
    summary_file.write(full_summary4)

