In [3]:
import pandas as pd
from util import *
from copy import deepcopy
import warnings
warnings.filterwarnings('ignore')

csv1 = './sample_data/sd_lodes_2017_raw_matrix.csv'
csv2 = './sample_data/sd_lodes_2019_raw_matrix.csv'
spssim_outflows_csv = './sample_data/spssim_results/local_spssim_test/spssim_local_outflows_lodes2017_lodes2019.csv'
spssim_inflows_csv = './sample_data/spssim_results/local_spssim_test/spssim_local_inflows_lodes2017_lodes2019.csv'

In [4]:
def calc_local_spssim(raw_matrix1_csv, raw_matrix2_csv, spssim_csv, c1=0, c2=0, inflow=True):
    # INFLOWS = Calculations down columns
    # OUTFLOWS = Calculations across rows --> Transpose to calculate down columns

    # raw matrix
    df1 = pd.read_csv(raw_matrix1_csv)
    df1 = df1.set_index('cbg_orig')
    for col in list(df1.columns):
        df1.loc[col, col] = None

    df2 = pd.read_csv(raw_matrix2_csv)
    df2 = df2.set_index('cbg_orig')
    for col in list(df2.columns):
        df2.loc[col, col] = None

    if inflow==False:
        df1 = df1.transpose()
        df2 = df2.transpose()

    # get flow probabilities
    df1.loc['row_total'] = df1.sum()
    flow1 = df1[:-1].div(df1.loc['row_total'])

    df2.loc['row_total'] = df2.sum()
    flow2 = df2[:-1].div(df2.loc['row_total'])

    # n, mean, var
    flow1.loc['n'] = flow1.count()
    flow1.loc['mean1'] = flow1[0:-1].mean()
    flow1.loc['var1'] = flow1[0:-2].var()

    flow2.loc['mean2'] = flow2.mean()
    flow2.loc['var2'] = flow2[0:-1].var()

    # prep for covar
    cov1 = flow1
    cov1 = cov1 - cov1.loc['mean1']
    cov1 = cov1[0:-3]

    cov2 = flow2
    cov2 = cov2 - cov2.loc['mean2']
    cov2 = cov2[0:-2]

    # COVARIANCE
    cov = pd.DataFrame(data=None, columns=cov1.columns, index=cov1.index)
    cov = cov1.mul(cov2)
    cov.loc['cov'] = cov.sum()/(cov.count() - 1)

    # calculate spssim
    spssim = pd.concat([flow1.tail(3), flow2.tail(2), cov.tail(1)])
    spssim.loc['c1'] = c1
    spssim.loc['c2'] = c2
    spssim.loc['local'] = ((2 * spssim.loc['mean1'] * spssim.loc['mean2'] + c1) * (2 * spssim.loc['cov'] + c2)) / ((spssim.loc['mean1'] ** 2 + spssim.loc['mean2'] ** 2 + c1) * (spssim.loc['var1'] + spssim.loc['var2'] + c2))
    spssim.loc['global'] = spssim.loc['local'].sum()/spssim.loc['local'].count()

    # transpose df
    spssim = spssim.transpose()
    spssim.to_csv(spssim_csv)
    return spssim


def calc_constants(results_dir_list):
    # read all results into one df
    for r in results_dir_list:
        tmp = combine_csv_files(r)
        if r == results_dir_list[0]:
            df = tmp.copy(deep=True)
        else:
            df = pd.concat([df, tmp], ignore_index=True)

    # find min local spssim to calculate constants
    min_local = df['local'].min()
    print('Minimum local SpSSIM = {}'.format(min_local))

    min_index = df.index[df['local'] == min_local].tolist()
    df1_mean = df['mean1'].loc[min_index[0]]
    df2_mean = df['mean2'].loc[min_index[0]]
    df1_var = df['var1'].loc[min_index[0]]
    df2_var = df['var2'].loc[min_index[0]]
    covar = df['cov'].loc[min_index[0]]

    # find constants c1 and c2 such that least similar index score = 0
    # spssim = ((2 * df1_mean * df2_mean + c1) * (2 * covar + c2)) / (df1_mean ** 2 + df2_mean ** 2 + c1) * (df1_var + df2_var + c2))
    c1a = -1 * (2 * df1_mean * df2_mean)
    c1b = -1 * (df1_mean ** 2 + df2_mean ** 2)
    c1 = max(c1a, c1b)
    if c1 < 0:
        c1 = 0

    c2a = -1 * (2 * covar)
    c2b = -1 * (df1_var + df2_var)
    c2 = max(c2a, c2b)
    if c2 < 0:
        c2 = 0

    spssim = ((2 * df1_mean * df2_mean + c1) * (2 * covar + c2)) / ((df1_mean ** 2 + df2_mean ** 2 + c1) * (df1_var + df2_var + c2))
    print('Updated minimum local SpSSIM = {}'.format(spssim))

    return c1, c2


In [3]:
# calc inflow spssim
df_in = calc_local_spssim(csv1, csv2, spssim_inflows_csv)

In [4]:
# calc outflow spssim
df_out = calc_local_spssim(csv1, csv2, spssim_outflows_csv, inflow=False)

In [5]:
# calc constants
const1, const2 = calc_constants(['./sample_data/spssim_results/local_spssim_test/'])


# rerun spssim with new constants
df_in2 = calc_local_spssim(csv1, csv2, spssim_inflows_csv, c1=const1, c2=const2)
df_in2.head()

Minimum local SpSSIM = -4.339228784077092e-18
Updated minimum local SpSSIM = 0.0


cbg_orig,n,mean1,var1,mean2,var2,cov,c1,c2,local,global
60730001001,1794.0,0.000557,2.9e-05,0.000557,5.2e-05,2.535471e-05,0.0,6.217664e-07,0.632611,0.47461
60730001002,1794.0,0.000557,1.3e-05,0.000557,1e-05,8.791829e-06,0.0,6.217664e-07,0.775977,0.47461
60730002011,1794.0,0.000557,3e-06,0.000557,2e-06,1.774884e-06,0.0,6.217664e-07,0.72271,0.47461
60730002021,1794.0,0.000557,3e-06,0.000557,2e-06,1.010381e-06,0.0,6.217664e-07,0.487073,0.47461
60730002022,1794.0,0.000557,4e-06,0.000557,6e-06,5.786295e-07,0.0,6.217664e-07,0.160484,0.47461


In [6]:
# rerun spssim with new constants
df_out2 = calc_local_spssim(csv1, csv2, spssim_outflows_csv, c1=const1, c2=const2, inflow=False)
df_out2.head()

Unnamed: 0_level_0,n,mean1,var1,mean2,var2,cov,c1,c2,local,global
cbg_orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60730001001,1794.0,0.000557,9e-06,0.000557,9e-06,8e-06,0.0,6.217664e-07,0.873479,0.873446
60730001002,1794.0,0.000557,1e-05,0.000557,1e-05,9e-06,0.0,6.217664e-07,0.901959,0.873446
60730002011,1794.0,0.000557,1e-05,0.000557,1.1e-05,9e-06,0.0,6.217664e-07,0.900158,0.873446
60730002021,1794.0,0.000557,1e-05,0.000557,1.2e-05,1e-05,0.0,6.217664e-07,0.912518,0.873446
60730002022,1794.0,0.000557,8e-06,0.000557,9e-06,7e-06,0.0,6.217664e-07,0.826683,0.873446


In [7]:
print(df_in2['local'].min(), df_out2['local'].min())


0.0 0.13796241889307356
