**Data Transformation Notebook**

<div style="font-family: system-ui; padding: 20px 30px 20px 20px; background-color: #FFFFFF; border-left: 8px solid #ED9255; border-radius: 8px; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);max-width:600px;color:#212121;">

- 👤 Name: Lethokuhle Sikosana
- 🎯 Purpose: Conduct Statistical Analysis on SAPS Crime Data from 2008 - 2013

<span style="display:block;line-height:1.15em;color:#666666;font-size:0.9em;">
</span>

</div>

## Imports

In [24]:
import pandas as pd
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

## Loading the Data

In [25]:
# Load the station-by-year crime panel that every model below is fitted on.
crime_data = pd.read_csv('data/expanded_crime_data.csv')
# Preview the first five rows as a sanity check on the parsed columns.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index from an earlier export — confirm it is not needed downstream.
crime_data.head()

Unnamed: 0,year,station,loc_mn,dc_mn,longitude,latitude,other_theft,arson,assault_gbh,attempted_murder,...,crimes_against_property_log,contact_crime_log,contact_related_crime_log,other_serious_crimes_log,crimes_dependent_on_police_action_for_detention_log,total_crime_log,total_crime_excl_police_log,treated_group,year_numeric,treatment_year
0,2008/2009,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,491,3,570,25,...,6.614726,7.382124,5.480639,6.769642,5.888878,8.183677,8.077758,0,2008,0
1,2009/2010,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,452,5,625,9,...,6.53814,7.402452,5.55296,6.674561,6.345636,8.213382,8.045909,0,2009,0
2,2010/2011,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,525,1,586,15,...,6.50279,7.342132,5.347108,6.842683,5.849325,8.15823,8.053887,0,2010,0
3,2011/2012,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,559,2,591,18,...,6.459904,7.339538,5.361292,6.766192,6.25575,8.178919,8.021256,0,2011,0
4,2012/2013,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,562,0,556,29,...,6.684612,7.395722,5.572154,6.715383,6.287859,8.239857,8.087025,0,2012,1


## DiD Regressions

## Model 1 (Police Model)

In [26]:
# Model 1 — police-model difference-in-differences.
# Outcome: log of crimes dependent on police action for detection.
# The term of interest is the treated_group:treatment_year interaction;
# C(station) and C(year_numeric) add station and year fixed effects, and the
# remaining regressors are the other logged crime categories.
#
# Standard errors are heteroskedasticity-robust (HC1), selected in .fit().
# Covariance options belong in .fit(), not in smf.ols(): statsmodels does not
# use cov_type/cov_kwds given to the model constructor when estimating, so
# they are not passed there. To cluster by station instead, call
# .fit(cov_type='cluster', cov_kwds={'groups': <station labels>}).
model1 = smf.ols(
    'crimes_dependent_on_police_action_for_detention_log ~ treated_group*treatment_year + treatment_year + treated_group + C(year_numeric) + C(station) + contact_related_crime_log + crimes_against_property_log + other_serious_crimes_log + contact_crime_log',
    data=crime_data,
).fit(cov_type='HC1')



## Model 2 (Parallel Trends Check - Police Model)

In [27]:
# Keep only the pre-treatment years (year_numeric before 2012); in the data
# preview, treatment_year switches to 1 in 2012.
pre_treatment_df = crime_data[
    crime_data['year_numeric'] < 2012
]

# Model 2 — parallel-trends check for the police model.
# Fitted on pre-treatment rows only; the treated_group:year_numeric term
# captures any differential linear trend between the groups before treatment.
#
# Standard errors are HC1, selected in .fit(). Covariance options are only
# honoured by .fit(); statsmodels does not use cov_type/cov_kwds passed to
# smf.ols() when estimating, so they are not passed there.
model2 = smf.ols(
    'crimes_dependent_on_police_action_for_detention_log ~ treated_group*year_numeric + C(station) + C(year_numeric) + contact_related_crime_log + crimes_against_property_log + other_serious_crimes_log + contact_crime_log',
    data=pre_treatment_df,
).fit(cov_type='HC1')



In [28]:
# Render the police-model parallel-trends regression as an HTML table.
stargazer = Stargazer([model2])
html2 = stargazer.render_html()

# Write with an explicit encoding so the output file does not depend on the
# platform's default text encoding.
with open('model-summaries/parallel_trends(police_model).html', "w", encoding="utf-8") as f:
    f.write(html2)



## Model 3 (Citizen Model)

In [29]:
# Model 3 — citizen-model difference-in-differences.
# Outcome: log of total crime excluding police-detected crime, with the log of
# police-detected crime entering as a control. The term of interest is the
# treated_group:treatment_year interaction; C(station) and C(year_numeric)
# add station and year fixed effects.
#
# Standard errors are HC1, selected in .fit(). Covariance options are only
# honoured by .fit(); statsmodels does not use cov_type/cov_kwds passed to
# smf.ols() when estimating, so they are not passed there.
model3 = smf.ols(
    'total_crime_excl_police_log ~ treated_group*treatment_year + treatment_year + treated_group + C(year_numeric) + C(station) + crimes_dependent_on_police_action_for_detention_log',
    data=crime_data,
).fit(cov_type='HC1')



## Model 4 (Parallel Trends Check - Citizen Model)

In [30]:
# Model 4 — parallel-trends check for the citizen model.
# Fitted on the pre-treatment subset (pre_treatment_df); the
# treated_group:year_numeric term captures any differential linear pre-trend.
#
# NOTE(review): the outcome here is total_crime_log, whereas the citizen DiD
# (Model 3) uses total_crime_excl_police_log — confirm which outcome this
# check is meant to use.
#
# Standard errors are HC1, selected in .fit(). Covariance options are only
# honoured by .fit(); statsmodels does not use cov_type/cov_kwds passed to
# smf.ols() when estimating, so they are not passed there.
model4 = smf.ols(
    'total_crime_log ~ treated_group*year_numeric + C(station) + C(year_numeric) + crimes_dependent_on_police_action_for_detention_log',
    data=pre_treatment_df,
).fit(cov_type='HC1')



In [31]:
# Render the citizen-model parallel-trends regression as an HTML table.
stargazer = Stargazer([model4])
html4 = stargazer.render_html()

# Write with an explicit encoding so the output file does not depend on the
# platform's default text encoding.
with open('model-summaries/parallel-trends(citizen_model).html', "w", encoding="utf-8") as f:
    f.write(html4)



## Model 5 (Interrupted Time Series - Citizen Model)

In [32]:
#Filtering to just Rustenburg Municipality
rustenburg_municipality_df = crime_data[
    crime_data['loc_mn'] == 'rustenburg'
]

model5 = smf.ols(
    'total_crime_log ~ treatment_year + C(station) + C(year_numeric) + crimes_dependent_on_police_action_for_detention_log',
    data=rustenburg_municipality_df,
    cov_type='cluster',
    cov_kwds={'groups': rustenburg_municipality_df['station']}
).fit(cov_type='HC1')



## Model 6 (Interrupted Time Series - Police Model)

In [33]:
# Model 6 — interrupted time series for the police model (Rustenburg only).
# Outcome: log of police-detected crime, with station and year fixed effects
# and the other logged crime categories as controls.
#
# NOTE(review): treatment_year appears to vary only by year (the data preview
# shows it switching to 1 in 2012 for an untreated station), so alongside
# C(year_numeric) its coefficient may not be separately identified — confirm.
#
# Standard errors are HC1, selected in .fit(). Covariance options are only
# honoured by .fit(); statsmodels does not use cov_type/cov_kwds passed to
# smf.ols() when estimating, so they are not passed there.
model6 = smf.ols(
    'crimes_dependent_on_police_action_for_detention_log ~ treatment_year + C(year_numeric) + C(station) + contact_related_crime_log + crimes_against_property_log + other_serious_crimes_log + contact_crime_log',
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')



## Creating the Regression Summary table

In [34]:
# Build one side-by-side table for the two DiD models and the two ITS models.
stargazer = Stargazer([model1, model3, model5, model6])

# Rename variables for readability
stargazer.rename_covariates({
    'treated_group': 'Treated group',
    'treatment_year': 'Treatment year',
    'treated_group:treatment_year': 'Treatment year x Treated group',

    # Controls
    'contact_related_crime_log': 'Contact-related crime (log)',
    'crimes_against_property_log': 'Property crime (log)',
    'other_serious_crimes_log': 'Other serious crime (log)',
    'contact_crime_log': 'Contact crime (log)',
    'crimes_dependent_on_police_action_for_detention_log': 'Police-detected crime (log)',
})

# Covariates across DID + ITS models. Only the terms listed here are shown,
# which keeps the station and year fixed-effect dummies out of the table.
combined_order = [
    # DID terms
    'treated_group',
    'treatment_year',
    'treated_group:treatment_year',

    # Controls
    'contact_related_crime_log',
    'crimes_against_property_log',
    'other_serious_crimes_log',
    'contact_crime_log',
    'crimes_dependent_on_police_action_for_detention_log'
]

stargazer.covariate_order(combined_order)

# Display formatting
stargazer.show_degrees_of_freedom(False)
stargazer.show_model_numbers(True)
stargazer.significant_digits(3)
stargazer.title("Regression Summary: Effects of Marikana Commission and Massacre")

# One column label per model, in the same order as the Stargazer([...]) list.
stargazer.custom_columns(
    ["Police Model (DiD)", "Citizen Model (DiD)", "Citizen Model (ITS)", "Police Model (ITS)"],
    [1, 1, 1, 1]
)

# Render HTML
html = stargazer.render_html()

# Add custom CSS
custom_css = """
<style>
table.stargazer {
    font-family: Helvetica, Arial, sans-serif;
    border-collapse: collapse;
    margin-left: auto;
    margin-right: auto;
    font-size: 13px;
}
table.stargazer th {
    padding: 6px 12px;
    text-align: center;
    font-weight: bold;
}
table.stargazer td {
    padding: 4px 12px;
    border-top: 1px solid #ddd;
}
table.stargazer tr:nth-child(even) {
    background-color: #fafafa;
}
caption {
    font-size: 16px;
    font-weight: bold;
    margin-bottom: 12px;
}
.note {
    font-size: 11px;
    margin-top: 12px;
    text-align: center;
    color: #555;
}
</style>
"""

# Footnote appended below the table. The "<" signs are written as &lt; so the
# text is valid HTML rather than relying on browsers tolerating a bare "<".
html = custom_css + html + """
<div class="note">
Robust standard errors (HC1).<br>
*p &lt; 0.10, **p &lt; 0.05, ***p &lt; 0.01
</div>
"""

# Save file (explicit encoding so the output does not depend on the platform
# default text encoding).
with open("model-summaries/multi_model_comparison.html", "w", encoding="utf-8") as f:
    f.write(html)



## Appendix

### Saving the Rustenburg Municipality DataFrame

In [35]:
# Persist the Rustenburg-only subset for reuse; index=False leaves the
# DataFrame's row index out of the CSV.
rustenburg_municipality_df.to_csv(
    'data/rustenburg_municipality_df.csv',
    index=False,
)