In [2]:
import pandas as pd
import statsmodels as st
import numpy as np
import matplotlib as plt
import sklearn


In [8]:
data = pd.read_csv('./final.csv', index_col=0)
data['crime'] = data['Rate per 100,000 population']
data = data.drop(columns=['Rate per 100,000 population'])

def normalize(col):
    col = ''.join(col.split())
    col = ''.join(e for e in col if e.isalnum())
    out: str = col.replace(',','_').lower()
    if out[0].isdigit():
        out = '_' + out
    return out

data.rename(columns=normalize, inplace=True)
data.describe()

Unnamed: 0,year,egm,medianhouseprice,offencecount,traveltimetogpominutes,areakm2,ariamin,ariamax,ariaavg,commercialkm2,...,presentationstoemergencydepartments201213,traveltimetonearestpublichospitalwithemergencydepartment,presentationstoemergencydepartmentsduetoinjury,category45emergencydepartmentpresentations,numberofdwellings,population,locationx,locationy,absremotenesscategory,crime
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,...,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,2017.0,44791590.0,697426.3,8807.719388,87.531777,2427.304028,0.638564,0.915607,0.765623,0.015513,...,0.270842,25.794808,0.248449,0.567065,40813.517857,101211.071429,-0.204235,26.559082,0.589286,8604.032054
std,2.002556,36486470.0,466870.3,6836.585681,89.737139,4388.218811,0.926171,1.24963,1.076033,0.024319,...,0.117438,23.200132,0.039385,0.076904,24837.496782,67489.684405,103.654912,82.711984,0.702344,3506.884396
min,2014.0,1892293.0,158750.0,387.0,4.897709,20.82293,0.0,0.0,0.0,5.2e-05,...,0.050232,3.930699,0.140255,0.39925,4874.0,9873.0,-310.285714,-81.599301,0.0,3076.800763
25%,2015.0,11820500.0,352072.2,3061.75,20.246923,79.778887,0.0,0.0,0.0,0.000368,...,0.180694,8.626692,0.218529,0.513066,18526.75,41610.0,-23.545417,-15.651445,0.0,6471.102274
50%,2017.0,31080510.0,585351.3,8011.0,52.602954,667.579973,0.064858,0.193099,0.117857,0.002763,...,0.252941,16.07915,0.256317,0.567085,40520.0,94681.5,5.389039,1.222753,0.0,8194.577278
75%,2019.0,68851120.0,903731.5,12515.5,131.271874,3206.892301,1.088661,1.512202,1.384535,0.025111,...,0.375373,34.781852,0.278871,0.616169,59403.0,151932.5,27.746864,40.975396,1.0,10228.073289
max,2020.0,143045700.0,2841161.0,37886.0,384.960766,23359.313312,3.272194,4.383425,3.73719,0.127473,...,0.55326,96.843507,0.322547,0.725373,107828.0,298909.0,274.239407,343.714443,2.0,25932.263717


In [24]:
from statsmodels.formula.api import ols
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error


state = 12

# this part is using communities, finding factors that contribute
# to high crime rate, no LGA category
def feature_selection(train_set, n=3):
    num = train_set.select_dtypes(include=[np.number])
    corr = num.corr()
    highest = list(corr.nlargest(n+1, columns=['crime']).index)
    highest.remove('crime')
    return highest

def simple_model(train_set, test_set):
    # hyperparameter tune k in feature selection
    train = train_set.sample(frac=0.6, random_state=state)
    test = train_set.drop(train.index)

    pre = []

    best_k = -1
    best_mse = 1e9
    for k in range(20, 40):
        columns = set(feature_selection(train, k))

        for p in pre:
            columns.add(p)

        text = f'crime ~ {" + ".join(columns)}'
        model = ols(text, data=train).fit_regularized(alpha=0.2, L1_wt=1)
        pred = model.predict(test)
        mse = mean_squared_error(test['crime'], pred)

        if mse < best_mse:
            best_mse = mse
            best_k = k

    columns = set(feature_selection(train_set, best_k))
    for p in pre:
        columns.add(p)

    text = f'crime ~ {" + ".join(columns)}'
    model = ols(text, data=train_set).fit_regularized(alpha=0.2, L1_wt=1)
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)

    return mse

def all_model(train_set, test_set):
    columns = [
        'ariamin',
        'publichospitals', 
        'homelessness', 
        'mentalhealth',
        'unemployedpersons', 
        'equivalenthouseholdincome600week',
        'dwellingswithnomotorvehicle',
        'egm', 'medianhouseprice',
    ]

    text = f'crime ~ C(lga) + {" + ".join(columns)}'
    model = ols(text, data=train_set).fit()
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)

    return mse


def null_model(train_set, test_set):
    model = ols('crime ~ 1', data=train_set).fit()
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)
    return mse

actual = data
actual = actual.drop(columns=['offencecount'], axis=1)
n = 56
fold = KFold(n, shuffle=True, random_state=state)

models = {
    'null': null_model,
    # 'regularized': simple_model,
    'pre_selected': all_model,
}

total_mse = {k: 0 for k in models}
for train, test in fold.split(actual):
    train_set = actual.iloc[train]
    test_set = actual.iloc[test]

    for k, v in models.items():
        mse = v(train_set, test_set)
        total_mse[k] += mse


for k, v in total_mse.items():
    print(f"model {k:15}, RMSE: {np.sqrt(v/n):14.2f}")

model null           , RMSE:        3510.39
model pre_selected   , RMSE:         840.14


In [18]:
# def interaction_model(train_set, test_set):
#     columns = list(feature_selection(train_set))
#     columns.remove('crime')

#     columns = [c + ":LGA" for c in columns]

#     text = f'crime ~ C(LGA) + {" + ".join(columns)}'
#     model = ols(text, data=train_set).fit()
#     pred = model.predict(test_set)
#     mse = mean_squared_error(test_set['crime'], pred)
#     return mse

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import normalized_mutual_info_score

import warnings


state = 21

def MI_analysis(df, n):
    wd = df.copy()
    wd = wd.drop(columns=['lga'])

    warnings.filterwarnings("ignore")
    clusters = KBinsDiscretizer(3, encode='ordinal', strategy='quantile')
    wd[wd.columns] = clusters.fit_transform(wd[wd.columns])
    
    out = {}
    for col in wd.columns:
        if col == 'crime':
            continue

        mi = normalized_mutual_info_score(wd['crime'], wd[col])
        out[col] = mi

    out = {k:v for k, v in sorted(out.items(), key=lambda v: v[1], reverse=True)}
    return list(out.keys())[:n]


def null_model(train_set, test_set):
    text = f"crime ~ C(lga)"
    model = ols(text, data=train_set).fit()
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)
    return mse

def regularize_model(train_set, test_set):
    num: pd.DataFrame = train_set.select_dtypes(include=[np.number])
    columns = list(num.columns)
    columns.remove('crime')
    # columns = ['last_crime', 'medianhouseprice', 'last_house', 'egm', 'last_egm', 'distance', 'year']
 
    text = f'crime ~ C(lga) + {" + ".join(columns)} - 1'
    model = ols(text, data=train_set).fit_regularized(alpha=2, L1_wt=0)
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)
    return mse

actual = data[data['year'].isin(list(range(2015, 2021)))]
actual = actual.copy()

def feature_model(train_set, test_set):
    num: pd.DataFrame = train_set.select_dtypes(include=[np.number])
    columns = list(num.columns)
    columns.remove('crime')

    # hyperparameter tune k in feature selection
    # start_state = state
    # while True:
    train: pd.DataFrame = train_set.sample(frac=0.7, random_state=state)
    test = train_set.drop(train.index)
        # if len(train['lga'].unique()) == len(test['lga'].unique()):
        #     break
        # start_state += 1

    best_k = -1
    best_mse = 1e19
    for alpha in np.linspace(0.3, 1, 10):
        text = f'crime ~  C(lga) + {" + ".join(columns)}'
        model = ols(text, data=train).fit_regularized(alpha=alpha, L1_wt=0, maxiter=10)
        pred = model.predict(test)
        mse = mean_squared_error(test['crime'], pred)

        if mse < best_mse:
            best_mse = mse
            best_k = alpha

    text = f'crime ~ C(lga) + {" + ".join(columns)}'
    model = ols(text, data=train_set).fit_regularized(alpha=best_k, L1_wt=0)
    pred = model.predict(test_set)
    mse = mean_squared_error(test_set['crime'], pred)

    return mse

# insert last year
for i, row in actual.iterrows():
    last = data[(data['year'] == row['year']-1) & (data['lga'] == row['lga'])].copy()
    distance = np.sqrt(row['locationx'] ** 2 + row['locationy'] ** 2)
    actual.loc[i, 'distance'] = distance
    last_2 = data[(data['year'] == row['year']-2) & (data['lga'] == row['lga'])].copy()
    actual.loc[i, 'last_crime'] = last['crime'].values[0]
    # actual.loc[i, 'last2_crime'] = last_2['crime'].values[0]
    actual.loc[i, 'last_house'] = last['medianhouseprice'].values[0]
    # actual.loc[i, 'last2_house'] = last_2['medianhouseprice'].values[0]
    actual.loc[i, 'last_egm'] = last['egm'].values[0]
    # actual.loc[i, 'last2_egm'] = last_2['egm'].values[0]


actual = actual.drop(columns=['offencecount', 'population', 'locationx', 'locationy'], axis=1)
n = 80
fold = KFold(n, shuffle=True, random_state=state)

models = {
    'null': null_model,
    # 'regularize': regularize_model,
    'feature': feature_model,
}



In [19]:

total_mse = {k: 0 for k in models}
total = 0
for train, test in fold.split(actual):
    total += 1
    print(f"\rprogress {total / n:.2f}", end='')
    for k, v in total_mse.items():
        print(f", model {k:10}, RMSE: {np.sqrt(v/total):6.2f}", end='')
    
    train_set = actual.iloc[train]
    test_set = actual.iloc[test]

    for k, v in models.items():
        mse = v(train_set, test_set)
        total_mse[k] += mse


progress 1.00, model null      , RMSE: 818.84, model feature   , RMSE: 916.935

In [458]:
print()
for k, v in total_mse.items():
    print(f"model {k:15}, RMSE: {np.sqrt(v/n):14.2f}")


model null           , RMSE:        1367.24
model feature        , RMSE:        1309.01


In [446]:
from multiprocessing import Pool

def compute(tup):
    train,test = tup
    train_set = actual.iloc[train]
    test_set = actual.iloc[test]

    total_mse = {k: 0 for k in models}
    for k, v in models.items():
        mse = v(train_set, test_set)
        total_mse[k] += mse
    return total_mse

with Pool(18) as p:
    out = p.map(compute, fold.split(actual))
    total_mse = {k: 0 for k in models}
    for o in out:
        for i in o:
            total_mse[i] += o[i]

# total = 0
# for train, test in fold.split(actual):
#     total += 1
#     print(f"\rprogress {total / n:.2f}", end='')
#     for k, v in total_mse.items():
#         print(f", model {k:10}, RMSE: {np.sqrt(v/total):6.2f}", end='')
    
#     train_set = actual.iloc[train]
#     test_set = actual.iloc[test]

#     for k, v in models.items():
#         mse = v(train_set, test_set)
#         total_mse[k] += mse

print()
for k, v in total_mse.items():
    print(f"model {k:15}, RMSE: {np.sqrt(v/n):14.2f}")


KeyboardInterrupt: 

KeyError: 0

In [101]:
actual[actual['LGA'] == 'yarra']

Unnamed: 0,LGA,Year,EGM,MedianHousePrice,TraveltimetoGPOminutes,Areakm2,ARIAmin,ARIAmax,ARIAavg,Commercialkm2,...,Presentationstoemergencydepartments201213,Traveltimetonearestpublichospitalwithemergencydepartment,Presentationstoemergencydepartmentsduetoinjury,Category45emergencydepartmentpresentations,NumberofDwellings,Population,Locationx,Locationy,ABSremotenesscategory,crime
50,yarra,2014,30077711.86,905990.1,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,14741.826019
106,yarra,2015,31084714.55,1008032.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,14405.658174
162,yarra,2016,32992353.39,1142465.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,15075.247056
218,yarra,2017,30801195.8,1333348.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,14053.52211
274,yarra,2018,31076310.57,1275831.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,14327.148601
330,yarra,2019,30265707.48,1247478.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,13987.404707
386,yarra,2020,22747249.32,1296071.0,6.485248,20.82293,0.0,0.0,0.0,0.127473,...,0.234022,5.309344,0.221164,0.620777,37684.0,82266.0,2.342926,1.607594,0,14692.433428


In [102]:
def house_model_predict(train_set):
    columns = ['C(LGA)', 'EGM', 'MedianHousePrice', 'Year']

    text = f'crime ~ {" + ".join(columns)}'
    model = ols(text, data=train_set).fit()
    return model

model = house_model_predict(actual)
sample = pd.DataFrame({
    'LGA': 'yarra',
    'EGM': 33_000_000,
    'MedianHousePrice': 1.4e+06,
    'Year': 2025
}, index=[0])
model.predict(sample)
model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.952
Dependent Variable:,crime,AIC:,6377.264
Date:,2024-09-23 20:06,BIC:,6611.5684
No. Observations:,392,Log-Likelihood:,-3129.6
Df Model:,58,F-statistic:,134.1
Df Residuals:,333,Prob (F-statistic):,2.2e-198
R-squared:,0.959,Scale:,592880.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,-129805.9202,62790.3624,-2.0673,0.0395,-253321.6864,-6290.1541
C(LGA)[T.ballarat],7016.6386,584.6875,12.0007,0.0000,5866.4919,8166.7854
C(LGA)[T.banyule],4097.7657,625.3537,6.5527,0.0000,2867.6239,5327.9074
C(LGA)[T.basscoast],3702.8341,424.4026,8.7248,0.0000,2867.9861,4537.6822
C(LGA)[T.bawbaw],4559.6409,428.7218,10.6354,0.0000,3716.2966,5402.9853
C(LGA)[T.bayside],2245.0720,792.8830,2.8315,0.0049,685.3811,3804.7629
C(LGA)[T.benalla],5209.0434,417.3504,12.4812,0.0000,4388.0678,6030.0189
C(LGA)[T.boroondara],2200.8045,965.0775,2.2804,0.0232,302.3875,4099.2216
C(LGA)[T.brimbank],5582.9449,1059.1097,5.2714,0.0000,3499.5561,7666.3338

0,1,2,3
Omnibus:,23.68,Durbin-Watson:,1.596
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76.591
Skew:,0.044,Prob(JB):,0.0
Kurtosis:,5.164,Condition No.:,93261358962.0
