In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import json

In [2]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole
# session; it is reported as a false positive by code in centers.py.
pd.options.mode.chained_assignment = None

In [3]:
from scipy.optimize import linear_sum_assignment

import os
import sys
from center.centers import get_centroids, get_medoids, get_modes, get_std, centers_df_to_dict
from match.hungarian import match_clusters, min_cluster_assignment
from distance.emd import get_emd
from redlining.redlining import get_holc_grade, holc_grade_counts_to_dict, get_all_redlining_value_counts, \
    run_chi_square_test, normalize_count_dict, grade_keys

In [4]:
from clustering.distance.distances import haversine_np, meters_to_hav, RADIUS_OF_EARTH_AT_SPACE_NEEDLE

In [30]:
import statsmodels.api as sm

In [5]:
import inspect
# Debugging aid: grab the source text of min_cluster_assignment so it can be
# inspected from the notebook. The print is left disabled.
lines = inspect.getsource(min_cluster_assignment)
# print(lines)

In [6]:
%load_ext autoreload

In [7]:
%autoreload 2

In [8]:
%reload_ext autoreload

In [9]:
## Helper Methods and Variables

In [10]:
# CSV of offenses read into `df` below; it already contains the `cluster_labs` column.
data_path = '../clustering/test.csv'

In [35]:
# Location of the per-sector cluster-movement CSV. The export cell that writes
# this file is currently commented out (and spells the path out literally).
cluster_movement_path  = 'test_output/cluster_sector_movement2.csv'

In [12]:
## Load the Data

In [13]:
# Load the offense table.
# NOTE(review): the file carries previously saved index columns
# ('Unnamed: 0', 'Unnamed: 0.1' in the preview below) — consider index_col / dropping them.
df = pd.read_csv(data_path)

In [14]:
# Keep only rows assigned to a cluster; label -1 is presumably the
# "no cluster" / noise marker — TODO confirm against the clustering step.
df_filtered = df.loc[df['cluster_labs'].ne(-1)]

In [15]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Offense ID,Offense Start DateTime,Offense End DateTime,Group A B,Crime Against Category,Offense Parent Group,Offense,Offense Code,Precinct,...,100 Block Address,Longitude,Latitude,Year,Month,Day,Time,long_rad,lat_rad,cluster_labs
0,267341,7697612406,2008-10-15 09:00:00,10/15/2013 09:00:00 AM,A,PROPERTY,EMBEZZLEMENT,Embezzlement,270,N,...,9XX BLOCK OF NE BOAT ST,-122.318116,47.653656,2008,10,2,09:00:00,-2.134854,0.831713,-1


In [16]:
df.columns

Index(['Unnamed: 0', 'Offense ID', 'Offense Start DateTime',
       'Offense End DateTime', 'Group A B', 'Crime Against Category',
       'Offense Parent Group', 'Offense', 'Offense Code', 'Precinct', 'Sector',
       'Beat', 'MCPP', '100 Block Address', 'Longitude', 'Latitude', 'Year',
       'Month', 'Day', 'Time', 'long_rad', 'lat_rad', 'cluster_labs'],
      dtype='object')

In [17]:
# Distinct sector codes, in order of first appearance in the data.
sectors = df['Sector'].unique()
sectors

array(['U', 'B', 'E', 'M', 'K', 'D', 'Q', 'R', 'L', 'N', 'J', 'W', 'S',
       'F', 'C', 'G', 'O'], dtype=object)

In [18]:
# Distinct years present in the data, in ascending order.
years = np.sort(df['Year'].unique())

In [36]:
## Calculate Cluster Movements

In [18]:
def get_cluster_movement(df: pd.DataFrame, years: np.ndarray):
    """Mean earth mover's distance between clusters of consecutive years.

    For each pair ``(years[i], years[i + 1])`` the rows of ``df`` belonging to
    the two years are handed to ``match_clusters``. Every matched pair of
    clusters contributes one ``get_emd`` value. Every cluster left without a
    match is compared against the rows labelled ``-1`` of the other year and
    also contributes one value. The mean of all contributions is stored for
    the pair.

    Args:
        df: Frame with at least the ``Year`` and ``cluster_labs`` columns.
        years: Year values, in the order in which they should be paired.

    Returns:
        dict mapping ``(year1, year2)`` to the mean distance. A pair with
        nothing to compare (no matched and no unmatched clusters) maps to
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    distances = {}
    # NOTE(review): iteration starts at index 1, so the (years[0], years[1])
    # pair is never evaluated — confirm this is intentional.
    for i in range(1, len(years) - 1):
        year1, year2 = years[i], years[i + 1]
        df1 = df[df['Year'] == year1]
        df2 = df[df['Year'] == year2]

        cluster1_ind, cluster2_ind, row_ind, col_ind = match_clusters(df1, df2)
        # match_clusters hands back label -> index maps; invert them so the
        # matched row/column indices can be translated back to cluster labels.
        ind1_cluster1 = {v: k for k, v in cluster1_ind.items()}
        ind2_cluster2 = {v: k for k, v in cluster2_ind.items()}
        cluster1_no_match = set(cluster1_ind) - {ind1_cluster1[r] for r in row_ind}
        cluster2_no_match = set(cluster2_ind) - {ind2_cluster2[c] for c in col_ind}

        # One EMD per matched pair ...
        emds = [
            get_emd(df1[df1['cluster_labs'] == ind1_cluster1[row]],
                    df2[df2['cluster_labs'] == ind2_cluster2[col]])
            for row, col in zip(row_ind, col_ind)
        ]
        # ... and one per unmatched cluster, measured against the other
        # year's rows labelled -1.
        emds.extend(
            get_emd(df1[df1['cluster_labs'] == c1], df2[df2['cluster_labs'] == -1])
            for c1 in cluster1_no_match
        )
        emds.extend(
            get_emd(df1[df1['cluster_labs'] == -1], df2[df2['cluster_labs'] == c2])
            for c2 in cluster2_no_match
        )

        # print(f'Years: {year1} and {year2}. Distance: {sum(emds)/len(emds)}')
        # Guard the empty case: a year pair without any cluster used to crash
        # the whole run with a division by zero.
        distances[(year1, year2)] = sum(emds) / len(emds) if emds else np.nan
    return distances

In [19]:
def get_sectors_cluster_movement(df: pd.DataFrame, sectors: np.ndarray, years: np.ndarray):
    """Run ``get_cluster_movement`` separately for every sector.

    Args:
        df: Frame with a ``Sector`` column plus whatever
            ``get_cluster_movement`` needs.
        sectors: Sector codes to process, in order.
        years: Year values forwarded unchanged to ``get_cluster_movement``.

    Returns:
        dict mapping each sector code to the result of
        ``get_cluster_movement`` on that sector's rows. Progress is printed
        before and after each sector.
    """
    def _movement_for(sector_code):
        # Announce, compute on the sector's rows only, then confirm.
        print(f'Sector {sector_code} movement calculation initiated')
        sector_rows = df[df['Sector'] == sector_code]
        movement = get_cluster_movement(sector_rows, years)
        print(f'Sector {sector_code} movement calculation completed')
        return movement

    return {sector_code: _movement_for(sector_code) for sector_code in sectors}

In [20]:
# get_cluster_movement(df[df['Sector']=='B'], years)

In [27]:
# sector_movement = get_sectors_cluster_movement(df, sectors, years)

In [29]:
# sector_movement

In [30]:
# sector_movement_df = pd.DataFrame(columns=['Sector', 'Year1', 'Year2', 'Movement'])

In [31]:
# for sector in sector_movement.keys():
#     movement = sector_movement[sector]
#     for window in movement.keys():
#         sector_movement_df.loc[len(sector_movement_df)] = [sector, window[0], window[1], movement[window]]

In [32]:
# sector_movement_df

In [33]:
# sector_movement_df.to_csv('test_output/cluster_sector_movement2.csv')

In [37]:
## Test redlining by sector and relate it to cluster movement

In [109]:
# Reference HOLC grade distribution: the `count` column of the saved CSV
# (indexed by `grade`), converted with holc_grade_counts_to_dict.
all_holc_value_counts = holc_grade_counts_to_dict(pd.read_csv('redlining/all_redlining_value_counts.csv', index_col='grade')['count'])

In [110]:
all_holc_value_counts

defaultdict(int,
            {'None': 523732,
             'B': 194429,
             'C': 172657,
             'D': 142867,
             'A': 10280})

In [125]:
from scipy.stats import chisquare
def run_chi_square_test2(holc_counts_dict1, holc_counts_dict2):
    """Chi-square goodness-of-fit of one grade distribution against another.

    The 'None' grade is left out. The remaining grades are taken in the
    order given by ``grade_keys``.

    Args:
        holc_counts_dict1: grade -> observed value.
        holc_counts_dict2: grade -> expected value.

    Returns:
        ``(chi2, p)`` as produced by ``scipy.stats.chisquare``.
    """
    observed = []
    for grade in grade_keys:
        if grade != 'None':
            observed.append(holc_counts_dict1[grade])

    expected = []
    for grade in grade_keys:
        if grade != 'None':
            expected.append(holc_counts_dict2[grade])

    statistic, p_value = chisquare(f_obs=observed, f_exp=expected)
    return statistic, p_value

In [129]:
def test_redlining_by_sector(df, sectors, all_counts):
    for sector in sectors:
        holc_df0 = get_holc_grade(df[df['Sector']==sector], redlining_data_path='redlining/redlining_seattle.json')
        holc_counts = holc_df0['grade'].value_counts(dropna=False)
        holc_counts_dict = holc_grade_counts_to_dict(holc_counts)
        all_counts_dict = all_counts
        if (holc_counts_dict.get('None') is not None): del holc_counts_dict['None']
        if (all_counts_dict.get('None') is not None): del all_counts_dict['None']
        holc_counts_dict = normalize_count_dict(holc_counts_dict)
        all_counts_dict = normalize_count_dict(all_counts_dict)
        print(f'Starting Sector {sector}')
        print(holc_counts_dict, '\n', all_counts_dict)
        chi2, p = run_chi_square_test2(holc_counts_dict, all_counts_dict)
        print(f'Sector {sector}: {chi2} {p}\n')

In [130]:
# Compare every sector's HOLC grade mix (rows labelled -1 already removed) with the reference counts.
test_redlining_by_sector(df_filtered, sectors, all_holc_value_counts)

Starting Sector U
{'B': 92.33719300833395, 'C': 7.32354893428719, 'A': 0.30238218157681246, 'D': 0.0368758758020503} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector U: 129.7964715769144 5.983205864080834e-28

Starting Sector B
{'B': 62.93729372937294, 'C': 31.2013201320132, 'D': 5.8283828382838285, 'A': 0.033003300330033} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector B: 36.55783711568133 5.7072050754800594e-08

Starting Sector E
{'D': 51.75831956573047, 'C': 48.182676421996696, 'A': 0.029502006136417278, 'B': 0.029502006136417278} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector E: 67.50150762805785 1.4626057444261168e-14

Starting Sector M
{'A': 25.0, 'B': 25.0, 'C': 25.0, 'D': 25.0} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector M: 274.6031271769

In [138]:
# Same comparison, restricted to offenses whose Year is 2022.
test_redlining_by_sector(df_filtered[df_filtered['Year']==2022], sectors, all_holc_value_counts)

Starting Sector U
{'B': 93.09182813816344, 'C': 5.8972198820556025, 'A': 0.5897219882055602, 'D': 0.42122999157540014} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector U: 133.10855607061484 1.15641592390224e-28

Starting Sector B
{'B': 73.22557976106816, 'C': 20.379479971890373, 'D': 6.043569922698524, 'A': 0.35137034434293746} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector B: 57.377052256875494 2.1351141040560273e-12

Starting Sector E
{'C': 61.13615870153291, 'D': 37.962128043282235, 'A': 0.4508566275924256, 'B': 0.4508566275924256} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector E: 65.2036427003047 4.537311491748417e-14

Starting Sector M
{'A': 25.0, 'B': 25.0, 'C': 25.0, 'D': 25.0} 
 {'B': 37.37344612894607, 'C': 33.188398275388145, 'D': 27.46211793561731, 'A': 1.9760376600484784}
Sector M: 274.6031271769

In [137]:
# NOTE: The test above needs to be rerun on a clustering that uses all the data points in a sector, not just those of a single year.

In [132]:
## Run a linear-regression-based redlining test

In [19]:
# for grade in grade_keys:
#     sector_movement_df[grade] = np.nan

In [20]:
# sector_movement_df.head()

In [21]:
# sector_movement_avg_df = sector_movement_df.groupby(['Sector']).agg({'Movement': 'mean'}).reset_index()

In [22]:
# sector_movement_avg_df.head()

In [23]:
# for grade in grade_keys:
#     sector_movement_avg_df[grade] = np.nan

In [24]:
# for index, row in sector_movement_avg_df.iterrows():
#     sector = row['Sector']
#     holc_df0 = get_holc_grade(df[df['Sector']==sector], redlining_data_path='redlining/redlining_seattle.json')
#     holc_counts = holc_df0['grade'].value_counts(dropna=False)
#     holc_counts_dict = holc_grade_counts_to_dict(holc_counts)
#     holc_counts_dict = normalize_count_dict(holc_counts_dict)
#     for grade in grade_keys:
#         sector_movement_avg_df.loc[index, grade] = holc_counts_dict[grade]
#     print(f'Sector {sector} done')

In [26]:
# Load the cached per-sector table: a `Movement` value plus one column per
# HOLC grade (presumably produced by the commented-out cells above — TODO confirm).
sector_movement_avg_df = pd.read_csv('test_output/sector_movement_avg_df.csv')

In [27]:
sector_movement_avg_df

Unnamed: 0.1,Unnamed: 0,Sector,Movement,A,B,C,D,None
0,0,B,183376.5,0.006468,31.499567,23.897606,1.963549,42.632811
1,1,C,362706.8,7.898366,30.157967,24.510009,32.483597,4.950061
2,2,D,1077536.0,0.007095,5.836443,3.849811,1.495651,88.811
3,3,E,92668.12,0.006776,0.006776,31.923457,29.698193,38.364797
4,4,F,75678.12,0.009889,1.789988,7.92935,77.920845,12.349928
5,5,G,30798.61,0.01142,7.799831,3.992417,84.190211,4.006121
6,6,J,128164.5,0.412341,55.563867,4.486489,0.009123,39.52818
7,7,K,132812.2,0.006944,0.006944,0.006944,7.870833,92.108333
8,8,L,231986.1,0.007873,3.304939,0.007873,0.007873,96.671443
9,9,M,32548.48,0.006894,0.006894,0.006894,0.006894,99.972424


In [28]:
# sector_movement_avg_df.to_csv('test_output/sector_movement_avg_df.csv')

In [32]:
# Ordinary least squares: Movement regressed on the A, B, C and D columns plus
# an intercept (add_constant). The 'None' column is not used as a regressor.
results = sm.OLS(sector_movement_avg_df['Movement'], sm.add_constant(sector_movement_avg_df[['A', 'B', 'C', 'D']])).fit()

In [33]:
# Restriction matrix for the F-test: the identity has one row per fitted
# parameter, so the joint hypothesis covers the intercept as well as A-D.
# NOTE(review): confirm the intercept is meant to be part of the joint test;
# the usual overall-significance test leaves it out.
A = np.identity(len(results.params))

In [34]:
# Joint F-test of the linear restrictions encoded by A on the fitted parameters.
print(results.f_test(A))

<F test: F=2.4790421684635797, p=0.09174022438828049, df_denom=12, df_num=5>
