**Data Transformation Notebook**

<div style="font-family: system-ui; padding: 20px 30px 20px 20px; background-color: #FFFFFF; border-left: 8px solid #ED9255; border-radius: 8px; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);max-width:600px;color:#212121;">

- 👤 Name: Lethokuhle Sikosana
- 🎯 Purpose: Conduct Statistical Analysis on SAPS Crime Data from 2008 to 2013

<span style="display:block;line-height:1.15em;color:#666666;font-size:0.9em;">
</span>

</div>

## Imports

In [1]:
import pandas as pd
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

## Loading the Data

### All of South Africa

In [2]:
# Read the expanded crime panel from disk into a DataFrame.
crime_data = pd.read_csv(filepath_or_buffer='data/expanded_crime_data.csv')
# Display the first five rows as a quick check that the CSV parsed as expected.
crime_data.head()

Unnamed: 0,year,station,loc_mn,dc_mn,longitude,latitude,other_theft,arson,assault_gbh,attempted_murder,...,contact_crime_log,contact_related_crime_log,other_serious_crimes_log,crimes_dependent_on_police_action_for_detention_log,total_crime_log,total_crime_excl_police_log,treated_group,year_numeric,treatment_year,time_index
0,2008/2009,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,491,3,570,25,...,7.382124,5.480639,6.769642,5.888878,8.183677,8.077758,0,2008,0,0
1,2009/2010,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,452,5,625,9,...,7.402452,5.55296,6.674561,6.345636,8.213382,8.045909,0,2009,0,1
2,2010/2011,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,525,1,586,15,...,7.342132,5.347108,6.842683,5.849325,8.15823,8.053887,0,2010,0,2
3,2011/2012,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,559,2,591,18,...,7.339538,5.361292,6.766192,6.25575,8.178919,8.021256,0,2011,0,3
4,2012/2013,yeoville,city of johannesburg,city of johannesburg,28.06281,-26.1829,562,0,556,29,...,7.395722,5.572154,6.715383,6.287859,8.239857,8.087025,0,2012,1,4


### Rustenburg municipality

In [3]:
# Keep only the rows whose `loc_mn` value is 'rustenburg'.
rustenburg_municipality_df = crime_data.loc[crime_data['loc_mn'].eq('rustenburg')]

## Creating Pre-treatment Years DataFrame

In [4]:
# Keep only the rows with `year_numeric` strictly before 2012 (the pre-treatment years).
pre_treatment_df = crime_data.loc[crime_data['year_numeric'].lt(2012)]

In [5]:
# Build the treated-group x treatment-year interaction as an explicit column.
# NOTE(review): none of the model formulas visible in this notebook reference
# `treated_post` (they use `treated_group:treatment_year` directly) - confirm it is still needed.
crime_data['treated_post'] = crime_data['treated_group'].mul(crime_data['treatment_year'])

## Difference-in-Differences

### Model 1 (Police Model)

In [6]:
# Model 1: OLS of log police-detected crime on the DiD terms plus the four
# other log crime categories, fitted on the full panel.
# Standard errors are cluster-robust, clustered by station.
model1 = smf.ols(
    formula=(
        'crimes_dependent_on_police_action_for_detention_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + contact_related_crime_log + crimes_against_property_log'
        ' + other_serious_crimes_log + contact_crime_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

### Model 2 (Total Crime - Citizen Model)

In [7]:
# Model 2: OLS of log total crime (excluding police-detected crime) on the DiD
# terms, controlling for log police-detected crime.
# Standard errors are cluster-robust, clustered by station.
model2 = smf.ols(
    formula=(
        'total_crime_excl_police_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

### Model 3 (Contact Crime - Citizen Model)

In [8]:
# Model 3: OLS of log contact crime on the DiD terms, controlling for log
# police-detected crime. Standard errors are clustered by station.
model3 = smf.ols(
    formula=(
        'contact_crime_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

### Model 4 (Contact Related Crime - Citizen Model)

In [9]:
# Model 4: OLS of log contact-related crime on the DiD terms, controlling for
# log police-detected crime. Standard errors are clustered by station.
model4 = smf.ols(
    formula=(
        'contact_related_crime_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

### Model 5 (Property Crime - Citizen Model)

In [10]:
# Model 5: OLS of log property crime on the DiD terms, controlling for log
# police-detected crime. Standard errors are clustered by station.
model5 = smf.ols(
    formula=(
        'crimes_against_property_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

### Model 6 (Other Serious Crime - Citizen Model)

In [11]:
# Model 6: OLS of log other-serious crime on the DiD terms, controlling for
# log police-detected crime. Standard errors are clustered by station.
model6 = smf.ols(
    formula=(
        'other_serious_crimes_log ~ '
        'treated_group:C(station) + treatment_year:C(year) + treated_group:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=crime_data,
).fit(cov_type='cluster', cov_kwds=dict(groups=crime_data['station']))

## Interrupted Time Series

### Model 7 (Total Crime - Citizen Model)

In [12]:
# Model 7: interrupted-time-series OLS on the Rustenburg subset. Log total crime
# (excluding police-detected crime) is regressed on the time index, the
# treatment-year indicator and their interaction, controlling for log
# police-detected crime. Uses the HC1 robust covariance estimator.
model7 = smf.ols(
    formula=(
        'total_crime_excl_police_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

### Model 8 (Police Model)

In [13]:
# Model 8: interrupted-time-series OLS on the Rustenburg subset. Log
# police-detected crime is regressed on the time index, the treatment-year
# indicator and their interaction, controlling for the four other log crime
# categories. Uses the HC1 robust covariance estimator.
model8 = smf.ols(
    formula=(
        'crimes_dependent_on_police_action_for_detention_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + contact_related_crime_log + crimes_against_property_log'
        ' + other_serious_crimes_log + contact_crime_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

### Model 9 (Contact Crime - Citizen Model)

In [14]:
# Model 9: interrupted-time-series OLS of log contact crime on the Rustenburg
# subset, controlling for log police-detected crime. HC1 robust covariance.
model9 = smf.ols(
    formula=(
        'contact_crime_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

### Model 10 (Contact Related Crime - Citizen Model)

In [15]:
# Model 10: interrupted-time-series OLS of log contact-related crime on the
# Rustenburg subset, controlling for log police-detected crime. HC1 robust covariance.
model10 = smf.ols(
    formula=(
        'contact_related_crime_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

### Model 11 (Property Crime - Citizen Model)

In [16]:
# Model 11: interrupted-time-series OLS of log property crime on the
# Rustenburg subset, controlling for log police-detected crime. HC1 robust covariance.
model11 = smf.ols(
    formula=(
        'crimes_against_property_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

### Model 12 (Other Serious Crime - Citizen Model)

In [17]:
# Model 12: interrupted-time-series OLS of log other-serious crime on the
# Rustenburg subset, controlling for log police-detected crime. HC1 robust covariance.
model12 = smf.ols(
    formula=(
        'other_serious_crimes_log ~ '
        'time_index + treatment_year + time_index:treatment_year'
        ' + crimes_dependent_on_police_action_for_detention_log'
    ),
    data=rustenburg_municipality_df,
).fit(cov_type='HC1')

## Parallel Trends Check

### Model 13 (Police Model)

In [18]:
# Model 13: parallel-trends check for log police-detected crime, fitted on the
# pre-treatment rows only. The term of interest is `treated_group:time_index`.
# Standard errors are cluster-robust, clustered by station.
model13 = smf.ols(
    formula=(
        'crimes_dependent_on_police_action_for_detention_log ~ '
        'C(year) + treated_group:C(station) + treated_group:time_index'
    ),
    data=pre_treatment_df,
).fit(cov_type='cluster', cov_kwds=dict(groups=pre_treatment_df['station']))

### Model 14 (Total Crime - Citizen Model)

In [19]:
# Model 14: parallel-trends check for log total crime (excluding
# police-detected crime), fitted on the pre-treatment rows only.
# Standard errors are cluster-robust, clustered by station.
model14 = smf.ols(
    formula=(
        'total_crime_excl_police_log ~ '
        'C(year) + treated_group:C(station) + treated_group:time_index'
    ),
    data=pre_treatment_df,
).fit(cov_type='cluster', cov_kwds=dict(groups=pre_treatment_df['station']))

### Model 15 (Contact Crime - Citizen Model)

In [20]:
# Model 15: parallel-trends check for log contact crime, fitted on the
# pre-treatment rows only. Standard errors are clustered by station.
model15 = smf.ols(
    formula=(
        'contact_crime_log ~ '
        'C(year) + treated_group:C(station) + treated_group:time_index'
    ),
    data=pre_treatment_df,
).fit(cov_type='cluster', cov_kwds=dict(groups=pre_treatment_df['station']))

### Model 16 (Contact Related Crime - Citizen Model)

In [21]:
# Model 16: parallel-trends check for log contact-RELATED crime, fitted on the
# pre-treatment rows only. Standard errors are clustered by station.
# Fix: the outcome was previously `contact_crime_log`, which made this model a
# duplicate of Model 15 and mislabelled the "Contact Related Crime" column in
# the parallel-trends table.
model16 = smf.ols(
    'contact_related_crime_log ~ C(year) + treated_group:C(station) + treated_group:time_index',
    data=pre_treatment_df
).fit(
    cov_type='cluster',
    cov_kwds={'groups': pre_treatment_df['station']}
)

### Model 17 (Property Crime - Citizen Model)

In [22]:
# Model 17: parallel-trends check for log property crime, fitted on the
# pre-treatment rows only. Standard errors are clustered by station.
model17 = smf.ols(
    formula=(
        'crimes_against_property_log ~ '
        'C(year) + treated_group:C(station) + treated_group:time_index'
    ),
    data=pre_treatment_df,
).fit(cov_type='cluster', cov_kwds=dict(groups=pre_treatment_df['station']))

### Model 18 (Other Serious Crime - Citizen Model)

In [23]:
# Model 18: parallel-trends check for log other-serious crime, fitted on the
# pre-treatment rows only. Standard errors are clustered by station.
model18 = smf.ols(
    formula=(
        'other_serious_crimes_log ~ '
        'C(year) + treated_group:C(station) + treated_group:time_index'
    ),
    data=pre_treatment_df,
).fit(cov_type='cluster', cov_kwds=dict(groups=pre_treatment_df['station']))

## Creating the Regression Summaries

### DiD and ITS Models for all types of crime

In [24]:
# Side-by-side summary of the two all-crime DiD models (1, 2) and the two
# Rustenburg ITS models (7, 8), written out as a styled HTML table.
stargazer = Stargazer([model1, model2, model7, model8])

# Rename variables for readability
stargazer.rename_covariates({
    'treated_group': 'Treated group',
    'treatment_year': 'Treatment year',
    'treated_group:treatment_year': 'DiD Estimator',
    # This interaction belongs to the interrupted-time-series models, so it is
    # labelled ITS to match the column headers (it was previously labelled "RDD").
    'time_index:treatment_year': 'ITS Estimator',
    'contact_related_crime_log': 'Contact-related crime (log)',
    'crimes_against_property_log': 'Property crime (log)',
    'other_serious_crimes_log': 'Other serious crime (log)',
    'contact_crime_log': 'Contact crime (log)',
    'crimes_dependent_on_police_action_for_detention_log': 'Police-detected crime (log)'
})

# Covariates across DiD + ITS models (anything not listed here is hidden from the table)
combined_order = [
    'treated_group:treatment_year',
    'time_index:treatment_year',
    'contact_related_crime_log',
    'crimes_against_property_log',
    'other_serious_crimes_log',
    'contact_crime_log',
    'crimes_dependent_on_police_action_for_detention_log'
]

stargazer.covariate_order(combined_order)

# Display formatting
stargazer.show_degrees_of_freedom(False)
stargazer.show_model_numbers(True)
stargazer.significant_digits(3)
stargazer.title("Regression Summary: Effects of Marikana Commission and Massacre")

# Columns (same order as the model list above)
stargazer.custom_columns(
    ["Police Model (DiD)", "Citizen Model (DiD)", "Citizen Model (ITS)", "Police Model (ITS)"],
    [1, 1, 1, 1]
)

# Add fixed-effects rows to summary section.
# Only the DiD formulas (models 1 and 2) contain C(station) / C(year) terms; the
# ITS formulas (models 7 and 8) have no station or year dummies, so they are "No".
fixed_effects_flags = ["Yes", "Yes", "No", "No"]
stargazer.add_line("Station fixed effects", fixed_effects_flags)
stargazer.add_line("Year fixed effects", fixed_effects_flags)

# Render HTML
html = stargazer.render_html()

# Add custom CSS
custom_css = """
<style>
table.stargazer {
    font-family: Helvetica, Arial, sans-serif;
    border-collapse: collapse;
    margin-left: auto;
    margin-right: auto;
    font-size: 13px;
}
table.stargazer th {
    padding: 6px 12px;
    text-align: center;
    font-weight: bold;
}
table.stargazer td {
    padding: 4px 12px;
    border-top: 1px solid #ddd;
}
table.stargazer tr:nth-child(even) {
    background-color: #fafafa;
}
caption {
    font-size: 16px;
    font-weight: bold;
    margin-bottom: 12px;
}
.note {
    font-size: 11px;
    margin-top: 12px;
    text-align: center;
    color: #555;
}
</style>
"""

# Save file. The stylesheet is prepended so the saved table is actually styled
# (previously `custom_css` was built but never written), matching the
# parallel-trends export.
with open("model-summaries/multi_model_comparison.html", "w", encoding="utf-8") as f:
    f.write(custom_css + html)




### DiD and ITS for Citizen Models with different types of crime

In [25]:
# Summary of the citizen models by crime category: four DiD models (3-6)
# followed by four Rustenburg ITS models (9-12), written out as a styled HTML table.
stargazer2 = Stargazer([
    model3, model4, model5, model6,
    model9, model10, model11, model12
])

# Rename variables for readability
stargazer2.rename_covariates({
    'treated_group': 'Treated group',
    'treatment_year': 'Treatment year',
    'treated_group:treatment_year': 'DiD Estimator',
    # This interaction belongs to the interrupted-time-series models, so it is
    # labelled ITS to match the column headers (it was previously labelled "RDD").
    'time_index:treatment_year': 'ITS Estimator',
    'crimes_dependent_on_police_action_for_detention_log': 'Police-detected crime (log)'
})

# Covariates across DID + ITS models (anything not listed here is hidden from the table)
combined_order2 = [
    'treated_group:treatment_year',
    'time_index:treatment_year',
    'crimes_dependent_on_police_action_for_detention_log'
]

stargazer2.covariate_order(combined_order2)

# Display formatting
stargazer2.show_degrees_of_freedom(False)
stargazer2.show_model_numbers(True)
stargazer2.significant_digits(3)
stargazer2.title("Regression Summary: Effects of Marikana Commission and Massacre on Citizen Reporting")

# Columns for DID and ITS models (same order as the model list above)
stargazer2.custom_columns(
    [
        "Contact Crime (DiD)", "Contact Related Crime (DiD)", 
        "Property Crimes (DiD)", "Other Serious Crimes (DiD)",
        "Contact Crime (ITS)", "Contact Related Crime (ITS)", 
        "Property Crimes (ITS)", "Other Serious Crimes (ITS)"
    ],
    [1, 1, 1, 1, 1, 1, 1, 1]
)

# Add fixed-effects rows to summary section.
# Only the DiD formulas (models 3-6) contain C(station) / C(year) terms; the
# ITS formulas (models 9-12) have no station or year dummies, so they are "No".
fixed_effects_flags2 = ["Yes"] * 4 + ["No"] * 4
stargazer2.add_line("Station fixed effects", fixed_effects_flags2)
stargazer2.add_line("Year fixed effects", fixed_effects_flags2)

# Render HTML
html2 = stargazer2.render_html()

# Custom CSS for styling
custom_css = """
<style>
table.stargazer {
    font-family: Helvetica, Arial, sans-serif;
    border-collapse: collapse;
    margin-left: auto;
    margin-right: auto;
    font-size: 13px;
}
table.stargazer th {
    padding: 6px 12px;
    text-align: center;
    font-weight: bold;
}
table.stargazer td {
    padding: 4px 12px;
    border-top: 1px solid #ddd;
}
table.stargazer tr:nth-child(even) {
    background-color: #fafafa;
}
caption {
    font-size: 16px;
    font-weight: bold;
    margin-bottom: 12px;
}
.note {
    font-size: 11px;
    margin-top: 12px;
    text-align: center;
    color: #555;
}
</style>
"""

# Save HTML file. The stylesheet is prepended so the saved table is actually
# styled (previously `custom_css` was built but never written), matching the
# parallel-trends export.
with open("model-summaries/citizen_model_comparison.html", "w", encoding="utf-8") as f:
    f.write(custom_css + html2)



### Parallel Trends

In [26]:
# Parallel-trends summary table built from the six pre-treatment models (13-18).
stargazer3 = Stargazer([model13, model14, model15, model16, model17, model18])

# Give the treated-group x time interaction a readable label; no other term is renamed.
stargazer3.rename_covariates({'treated_group:time_index': 'Parallel Trends Estimator'})

# Restrict the table body to that single interaction term.
stargazer3.covariate_order(['treated_group:time_index'])

# Table-wide display options
stargazer3.show_degrees_of_freedom(False)
stargazer3.show_model_numbers(True)
stargazer3.significant_digits(3)
stargazer3.title("Parallel Trends Check: Effects of Marikana Commission and Massacre")

# One header per model, each spanning a single column
stargazer3.custom_columns(
    [
        "Police Model",
        "Total Crime - Citizen Model",
        "Contact Crime - Citizen Model",
        "Contact Related Crime - Citizen Model",
        "Property Crimes - Citizen Model",
        "Other Serious Crimes - Citizen Model",
    ],
    [1] * 6,
)

# Fixed-effects indicator rows: every one of the six columns is marked "Yes"
stargazer3.add_line(
    "Station fixed effects",
    ["Yes", "Yes", "Yes", "Yes", "Yes", "Yes"],
)
stargazer3.add_line(
    "Year fixed effects",
    ["Yes", "Yes", "Yes", "Yes", "Yes", "Yes"],
)

# Render the table to an HTML string
html3 = stargazer3.render_html()

# Stylesheet that is written ahead of the table markup
custom_css = """
<style>
table.stargazer {
    font-family: Helvetica, Arial, sans-serif;
    border-collapse: collapse;
    margin-left: auto;
    margin-right: auto;
    font-size: 13px;
}
table.stargazer th {
    padding: 6px 12px;
    text-align: center;
    font-weight: bold;
}
table.stargazer td {
    padding: 4px 12px;
    border-top: 1px solid #ddd;
}
table.stargazer tr:nth-child(even) {
    background-color: #fafafa;
}
caption {
    font-size: 16px;
    font-weight: bold;
    margin-bottom: 12px;
}
.note {
    font-size: 11px;
    margin-top: 12px;
    text-align: center;
    color: #555;
}
</style>
"""

# Write the stylesheet followed by the table to disk
with open("model-summaries/parallel-trends-check.html", "w") as f:
    f.write(custom_css)
    f.write(html3)





## Appendix

### Saving the Rustenburg Municipality DataFrame

In [27]:
# Persist the Rustenburg subset as CSV, without writing the DataFrame index as a column.
rustenburg_municipality_df.to_csv(
    path_or_buf='data/rustenburg_municipality_df.csv',
    index=False,
)