# Sensitivity Analysis (LGA, Year)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Constants
# Number of folds passed to KFold(n_splits=...) in the cross-validation cell.
N_SPLITS = 20
# Seed passed to KFold(random_state=...) so the shuffled fold assignment is reproducible.
RANDOM_STATE = 42

In [3]:
# Load and preprocess data
cleaned_offences = (
    pd.read_csv('../cleanedCsv/cleanedOffences.csv')
    .rename(columns={
        'Local Government Area': 'LGA',
        'Year': 'year',
    })
)

# Back out population from the rate definition:
#   rate = offence_count / population * 100,000
# A zero rate with a non-zero count would produce +/-inf, which dropna() does
# not remove, so map it to NaN here and let the dropna() below discard it.
cleaned_offences['population'] = (
    cleaned_offences['Offence Count']
    / cleaned_offences['Rate per 100,000 population']
    * 100000
).replace([np.inf, -np.inf], np.nan)

# Drop the columns that were only needed to derive population
cleaned_offences = cleaned_offences.drop(columns=[
    'Offence Count',
    'Rate per 100,000 population',
])

# Drop total rows because we only need per-LGA rows
cleaned_offences = cleaned_offences[cleaned_offences['LGA'] != 'total']

# Build the modelling table in one pass instead of concatenating per-LGA groups
# in a loop (quadratic, and dtype-fragile because of the empty seed frame).
# Row order is kept identical to the former groupby loop: sorted by LGA, with
# the original row order preserved inside each LGA (stable sort), so the
# shuffled KFold assignment downstream does not change.
population_data = (
    cleaned_offences
    .dropna()
    # 'Unnamed: 0' is the index column written by a previous to_csv();
    # tolerate CSVs that were saved without it.
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .sort_values('LGA', kind='mergesort')
    .reset_index(drop=True)
)

# Keep 'year' and 'LGA' as the leading columns, as before
leading_columns = ['year', 'LGA']
population_data = population_data[
    leading_columns
    + [col for col in population_data.columns if col not in leading_columns]
]

# Features we want to use
# NOTE: a lagged 'last_year_population' feature was tried previously and is
# currently disabled; only 'year' is used.
features = [
    'year',
]

In [4]:
# Linear Regression using KFold cross validation.
# A single model is fitted on all LGAs pooled together: 'year' is a shared
# numeric feature and each LGA contributes its own one-hot indicator column.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

model_data = population_data.copy()

# One-hot encode LGA column
model_data_encoded = pd.get_dummies(model_data, columns=['LGA'])

# Read the indicator column names back from the encoded frame rather than
# rebuilding them by hand, so they always match what get_dummies produced.
unique_lga_names = np.array(
    [col for col in model_data_encoded.columns if col not in model_data.columns],
    dtype=object,
)

# Combine one-hot encoded LGA columns with other features
features_new = [
    *features,
    *unique_lga_names
]

# NOTE(review): every LGA gets an indicator and the model also fits an
# intercept, so the design matrix is collinear. Predictions should be
# unaffected, but individual coefficients are not interpretable — confirm
# before reading them.
model = LinearRegression()

rmse_values = []

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

X = model_data_encoded[features_new]
y = model_data_encoded['population'] # From population_data dataframe

for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # fit() re-estimates from scratch, so reusing one model object is safe
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_values.append(rmse)

# Reported value is the mean of the per-fold RMSEs
print("Root Mean Squared Error (RMSE):", np.mean(rmse_values))

Root Mean Squared Error (RMSE): 8065.108991073481
