In [1]:
import metrics
import pandas as pd
import numpy as np

In [2]:
# Two-letter state postal abbreviation -> two-digit FIPS code (kept as a
# zero-padded string). Insertion order is the order states are processed in.
FIPS = {
    'AL': '01',
    'AK': '02',
    'AZ': '04',
    'AR': '05',
    'CA': '06',
    'CO': '08',
    'CT': '09',
    'DE': '10',
    'FL': '12',
    'GA': '13',
    'HI': '15',
    'ID': '16',
    'IL': '17',
    'IN': '18',
    'IA': '19',
    'KS': '20',
    'KY': '21',
    'LA': '22',
    'ME': '23',
    'MD': '24',
    'MA': '25',
    'MI': '26',
    'MN': '27',
    'MS': '28',
    'MO': '29',
    'MT': '30',
    'NE': '31',
    'NV': '32',
    'NH': '33',
    'NJ': '34',
    'NM': '35',
    'NY': '36',
    'NC': '37',
    'ND': '38',
    'OH': '39',
    'OK': '40',
    'OR': '41',
    'PA': '42',
    'RI': '44',
    'SC': '45',
    'SD': '46',
    'TN': '47',
    'TX': '48',
    'UT': '49',
    'VT': '50',
    'VA': '51',
    'WA': '53',
    'WV': '54',
    'WI': '55',
    'WY': '56',
}

In [3]:
# Build `df_metrics`: one row of metrics per (state, redistricting plan).
#
# Results are gathered in a list and combined once after the loop.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied the whole frame on every iteration.
plan_results = []

# Iterate over each state (only the abbreviations are needed here)
for state in FIPS:
    print(state)

    # Get relevant path
    direc = 'clean_data/' + state + '/'
    class_path = direc + state + '_classifications.csv'

    # Load classifications and get counties from geoids.
    # GEOID10 is left-padded to 15 characters; characters 2-4 are used as the
    # county identifier.
    df = pd.read_csv(class_path, dtype=str)
    df['GEOID10'] = df['GEOID10'].str.zfill(15)
    df['county'] = df['GEOID10'].str[2:5]
    df['pop'] = df['pop'].astype(int)

    # Iterate through redistricting plans. Redistricting plans have
    # underscore in name due to our naming convention
    plans = [x for x in df.columns if '_' in x]
    for plan in plans:
        plan_results.append(
            metrics.calculate_all_metrics(df, plan, state=state, lclty_col='county')
        )

# Combine with the same semantics as the old `append(m, ignore_index=True)`:
# a mapping/Series becomes one row, a DataFrame contributes its rows as-is.
# NOTE(review): the return type of calculate_all_metrics is not visible
# here; confirm it is a dict/Series (or DataFrame) of metric values.
df_metrics = pd.concat(
    [m if isinstance(m, pd.DataFrame) else pd.DataFrame([m]) for m in plan_results],
    ignore_index=True,
)

# Sort
df_metrics = df_metrics.sort_values(by=['state', 'plan'])

# Reorder columns: identifiers first, then the metric columns alphabetically
cols = ['state', 'plan'] + sorted([i for i in df_metrics.columns
                                   if i not in ['state', 'plan']])
df_metrics = df_metrics[cols]

# Plan names look like '<plan_type>_<year>' (e.g. 'cd_2018'); split them once
# and derive both helper columns from the pieces.
plan_parts = df_metrics['plan'].str.split('_')
df_metrics['year'] = plan_parts.str[1].astype(int)
df_metrics['plan_type'] = plan_parts.str[0]

AL
AK
AZ
AR
CA
CO
CT
DE
FL
GA
HI
ID
IL
IN
IA
KS
KY
LA
ME
MD
MA
MI
MN
MS
MO
MT
NE
NV
NH
NJ
NM
NY
NC
ND
OH
OK
OR
PA
RI
SC
SD
TN
TX
UT
VT
VA
WA
WV
WI
WY


In [10]:
# Preview the first rows of the per-plan metrics table.
df_metrics.head()

Unnamed: 0,state,plan,conditional_entropy,conditional_entropy_sym,effective_splits,effective_splits_sym,intersections_all,intersections_pop,split_pairs,split_pairs_sym,splits_all,splits_pop,sqrt_entropy,sqrt_entropy_sym,year,plan_type
18,AK,cd_2003,0.0,1.55508,0.0,1.83562,29.0,29.0,0.0,0.392963,0.0,0.0,1.0,2.456735,2003,cd
12,AK,sldl_2000,2.432386,1.352988,27.451338,13.859359,76.0,68.0,0.676428,0.393096,19.0,15.0,2.66907,1.906251,2000,sldl
20,AK,sldl_2010,2.510229,1.412679,31.093974,15.723881,78.0,74.0,0.691109,0.406228,18.0,17.0,2.730597,1.951656,2010,sldl
15,AK,sldl_2013,2.58926,1.483611,35.321061,17.843844,76.0,72.0,0.713915,0.429815,17.0,16.0,2.770711,1.98758,2013,sldl
21,AK,sldl_2014,2.575801,1.469949,34.152138,17.259614,71.0,69.0,0.702555,0.421117,12.0,12.0,2.768464,1.982478,2014,sldl


In [5]:
# Show the metrics row for Pennsylvania's 'cd_2018' plan.
is_pa = df_metrics['state'] == 'PA'
is_cd_2018 = df_metrics['plan'] == 'cd_2018'
df_metrics.loc[is_pa & is_cd_2018].iloc[0]

state                            PA
plan                        cd_2018
conditional_entropy        0.472565
conditional_entropy_sym    0.962235
effective_splits            10.1605
effective_splits_sym         6.3403
intersections_all                84
intersections_pop                84
split_pairs                0.210904
split_pairs_sym            0.346633
splits_all                       13
splits_pop                       13
sqrt_entropy                1.22573
sqrt_entropy_sym            1.55037
year                           2018
plan_type                        cd
Name: 345, dtype: object

In [6]:
# Build `df`: for every (state, plan_type), keep the latest plan dated before
# 2012 and the earliest plan dated 2013 or later, giving at most two rows per
# group for the before/after comparison.
#
# keep='last' / keep='first' pick rows by position, so the frame is sorted by
# (state, plan_type, year) here instead of relying on whatever order
# `df_metrics` was left in by another cell.
group_keys = ['state', 'plan_type']
ordered_metrics = df_metrics.sort_values(group_keys + ['year'])

pre_2012 = ordered_metrics[ordered_metrics['year'] < 2012]
last_pre_2012 = pre_2012.drop_duplicates(group_keys, keep='last')

post_2013 = ordered_metrics[ordered_metrics['year'] >= 2013]
first_post_2013 = post_2013.drop_duplicates(group_keys, keep='first')

df = pd.concat([last_pre_2012, first_post_2013])
df = df.sort_values(['state', 'plan_type', 'year'])

In [11]:
# Preview the before/after comparison table (`df`) assembled from df_metrics.
df.head()

Unnamed: 0,state,plan,conditional_entropy,conditional_entropy_sym,effective_splits,effective_splits_sym,intersections_all,intersections_pop,split_pairs,split_pairs_sym,splits_all,splits_pop,sqrt_entropy,sqrt_entropy_sym,year,plan_type
18,AK,cd_2003,0.0,1.55508,0.0,1.83562,29.0,29.0,0.0,0.392963,0.0,0.0,1.0,2.456735,2003,cd
20,AK,sldl_2010,2.510229,1.412679,31.093974,15.723881,78.0,74.0,0.691109,0.406228,18.0,17.0,2.730597,1.951656,2010,sldl
15,AK,sldl_2013,2.58926,1.483611,35.321061,17.843844,76.0,72.0,0.713915,0.429815,17.0,16.0,2.770711,1.98758,2013,sldl
23,AK,sldu_2010,1.765111,1.166175,15.422576,8.055906,58.0,56.0,0.574858,0.3875,17.0,15.0,2.044111,1.690583,2010,sldu
22,AK,sldu_2013,1.904483,1.298798,19.628811,10.265474,57.0,57.0,0.610794,0.425855,14.0,14.0,2.127034,1.768826,2013,sldu


In [8]:
plan_types = ['sldl', 'sldu', 'cd']

# States that are skipped for the congressional ('cd') comparison below.
# NOTE(review): presumably the states whose number of congressional seats
# changed after the 2010 apportionment; UT also gained a seat then but is not
# listed -- confirm whether that omission is intentional.
changed = ['WA', 'NV', 'AZ', 'TX', 'IA', 'MO', 'IL', 'LA', 'MI', 'OH', 'PA',
           'NY', 'FL', 'GA', 'SC', 'MA', 'NJ']


def paired_deltas(frame, metric_cols, plan_types, skip_cd_states):
    """Return per-group metric differences between the two plans of a group.

    For every (state, plan_type) combination that has exactly two rows in
    ``frame`` (first row minus second row), one output row holds the
    difference for each column in ``metric_cols``. 'cd' groups of states in
    ``skip_cd_states`` are left out. Returns an empty frame when no group
    qualifies.
    """
    rows = []
    for state in frame['state'].unique():
        for plan_type in plan_types:
            if plan_type == 'cd' and state in skip_cd_states:
                continue
            pair = frame[(frame['state'] == state) & (frame['plan_type'] == plan_type)]
            if len(pair) != 2:
                continue
            values = pair[metric_cols]
            rows.append(values.iloc[0] - values.iloc[1])
    return pd.DataFrame(rows, columns=metric_cols)


def direction_agreement(deltas, col_a, col_b):
    """Return ``(rate, n)`` for how often two metrics moved the same way.

    A group counts as a match (1.0) when the two deltas have the same sign and
    as a mismatch (0.0) when the signs differ; groups where either delta is
    zero are ignored. ``n`` is the number of groups counted and ``rate`` the
    share of matches among them (NaN when ``n`` is 0, instead of raising
    ZeroDivisionError).
    """
    products = (deltas[col_a] * deltas[col_b]).tolist()
    matches = [0.5 + 0.5 * p / abs(p) for p in products if p != 0]
    rate = sum(matches) / len(matches) if matches else float('nan')
    return rate, len(matches)


# Part 1: each metric against its '_sym' counterpart.
cols = ['conditional_entropy', 'effective_splits', 'sqrt_entropy', 'split_pairs']
deltas = paired_deltas(df, cols + [c + '_sym' for c in cols], plan_types, changed)
for col in cols:
    rate, n_pairs = direction_agreement(deltas, col, col + '_sym')
    print(col)
    print(rate)
    print(n_pairs)


# Part 2: pairwise agreement table between the metrics, rounded to 2 decimals.
out_df = pd.DataFrame()

cols = ['splits_pop', 'intersections_pop', 'effective_splits', 'conditional_entropy', 'sqrt_entropy', 'split_pairs']
deltas = paired_deltas(df, cols, plan_types, changed)
out_df[''] = cols
for col in cols:
    record = []
    for col2 in cols:
        rate, n_pairs = direction_agreement(deltas, col, col2)
        print(f'{col} vs. {col2}')
        print(rate)
        print(n_pairs)
        record.append(np.round(rate, 2))
    out_df[col] = record

conditional_entropy
0.967479674796748
123
effective_splits
0.975609756097561
123
sqrt_entropy
0.9349593495934959
123
split_pairs
0.9105691056910569
123
splits_pop vs. splits_pop
1.0
92
splits_pop vs. intersections_pop
0.8705882352941177
85
splits_pop vs. effective_splits
0.782608695652174
92
splits_pop vs. conditional_entropy
0.6195652173913043
92
splits_pop vs. sqrt_entropy
0.5978260869565217
92
splits_pop vs. split_pairs
0.6847826086956522
92
intersections_pop vs. splits_pop
0.8705882352941177
85
intersections_pop vs. intersections_pop
1.0
108
intersections_pop vs. effective_splits
0.7870370370370371
108
intersections_pop vs. conditional_entropy
0.7592592592592593
108
intersections_pop vs. sqrt_entropy
0.7685185185185185
108
intersections_pop vs. split_pairs
0.75
108
effective_splits vs. splits_pop
0.782608695652174
92
effective_splits vs. intersections_pop
0.7870370370370371
108
effective_splits vs. effective_splits
1.0
123
effective_splits vs. conditional_entropy
0.8455284552845529

In [9]:
# Display the pairwise agreement table: one row and one column per metric.
out_df

Unnamed: 0,Unnamed: 1,splits_pop,intersections_pop,effective_splits,conditional_entropy,sqrt_entropy,split_pairs
0,splits_pop,1.0,0.87,0.78,0.62,0.6,0.68
1,intersections_pop,0.87,1.0,0.79,0.76,0.77,0.75
2,effective_splits,0.78,0.79,1.0,0.85,0.76,0.85
3,conditional_entropy,0.62,0.76,0.85,1.0,0.89,0.9
4,sqrt_entropy,0.6,0.77,0.76,0.89,1.0,0.79
5,split_pairs,0.68,0.75,0.85,0.9,0.79,1.0
