# Splitting metric comparisons #

This notebook contains the code used to compare splitting metrics in the paper:

**Split Decisions: Guidance for Measuring Locality Preservation in District Maps**

## Imports ##

In [1]:
import metrics
import pandas as pd
import numpy as np

In [2]:
FIPS = {'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 
        'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', 
        'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', 
        'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', 
        'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', 
        'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 
        'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', 
        'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', 
        'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 
        'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56'}

## Calculate all metrics for available plans ##

Below we calculate the county splitting metrics for all available plans. This takes a couple minutes to run.

In [3]:
# Initialize splits dataframe
df_metrics = pd.DataFrame()

# iterate over each state
for state, fips_code in FIPS.items():

    # get relevant path
    direc = 'clean_data/' + state + '/'
    class_path = direc + state + '_classifications.csv'

    # Load classifications and get counties from geoids
    df = pd.read_csv(class_path, dtype=str)
    df['GEOID10'] = df['GEOID10'].str.zfill(15)
    df['county'] = df['GEOID10'].str[2:5]
    df['pop'] = df['pop'].astype(int)

    # iterate through redistricting plans (redistricting plans have
    # underscore in name due to our naming convention)
    plans = [x for x in df.columns if '_' in x]
    for plan in plans:
        m = metrics.calculate_all_metrics(df, plan, state=state, lclty_col='county')
        df_metrics = df_metrics.append(m, ignore_index=True)

# sort by state and plan
df_metrics = df_metrics.sort_values(by=['state', 'plan'])

# reorder columns and and year and plan_type
cols = ['state', 'plan'] + sorted([i for i in df_metrics.columns 
                                   if i not in ['state', 'plan']])
df_metrics = df_metrics[cols]
df_metrics['year'] = df_metrics['plan'].apply(lambda x: int(x.split('_')[1]))
df_metrics['plan_type'] = df_metrics['plan'].apply(lambda x: x.split('_')[0])

In [4]:
df_metrics.head()

Unnamed: 0,state,plan,conditional_entropy,conditional_entropy_sym,effective_splits,effective_splits_sym,intersections_all,intersections_pop,intersections_pop_sym,split_pairs,split_pairs_sym,splits_all,splits_pop,splits_pop_sym,sqrt_entropy,sqrt_entropy_sym,year,plan_type
18,AK,cd_2003,0.0,1.55508,0.0,1.83562,29.0,29.0,29.0,0.0,0.392963,0.0,0.0,0.5,1.0,2.456735,2003,cd
12,AK,sldl_2000,2.432386,1.352988,27.451338,13.859359,76.0,68.0,68.0,0.676428,0.393096,19.0,15.0,13.5,2.66907,1.906251,2000,sldl
20,AK,sldl_2010,2.510229,1.412679,31.093974,15.723881,78.0,74.0,74.0,0.691109,0.406228,18.0,17.0,14.5,2.730597,1.951656,2010,sldl
15,AK,sldl_2013,2.58926,1.483611,35.321061,17.843844,76.0,72.0,72.0,0.713915,0.429815,17.0,16.0,13.5,2.770711,1.98758,2013,sldl
21,AK,sldl_2014,2.575801,1.469949,34.152138,17.259614,71.0,69.0,69.0,0.702555,0.421117,12.0,12.0,11.5,2.768464,1.982478,2014,sldl


## Analysis ##

We will look at pairs of maps for a single state+plan and study how the metrics changed relative to each other after 2011 redistricting.

### Get pairs of maps pre/post 2011 redistricting ###

In [5]:
# get the plan used before 2011 redistricting (before 2012 elections)
pre_2012 = df_metrics[(df_metrics['year'] < 2012)]
last_pre_2012 = pre_2012.drop_duplicates(['state', 'plan_type'], keep='last')

# get the plan used right after 2011 redistricting (reported to U.S. Census in 2013)
post_2013 = df_metrics[(df_metrics['year'] >= 2013)]
first_post_2013 = post_2013.drop_duplicates(['state', 'plan_type'], keep='first')

# concat these and sort by state and plan
df = pd.concat([last_pre_2012, first_post_2013])
df = df.sort_values(['state', 'plan_type', 'year'])

# get rid of the 8 records where no new plan was found (1 CD states)
df = df[df.duplicated(['state', 'plan_type'], keep=False)]

# toss out states where number of congressional districts changed after 2010 census
# for true apples-to-apples comparison
changed = ['WA', 'NV', 'AZ', 'TX', 'IA', 'MO', 'IL', 'LA', 'MI', 'OH', 'PA',
           'NY', 'FL', 'GA', 'SC', 'MA', 'NJ']
df = df[(df['plan_type'] != 'cd') | (~df['state'].isin(changed))]

In [6]:
df.head(10)

Unnamed: 0,state,plan,conditional_entropy,conditional_entropy_sym,effective_splits,effective_splits_sym,intersections_all,intersections_pop,intersections_pop_sym,split_pairs,split_pairs_sym,splits_all,splits_pop,splits_pop_sym,sqrt_entropy,sqrt_entropy_sym,year,plan_type
20,AK,sldl_2010,2.510229,1.412679,31.093974,15.723881,78.0,74.0,74.0,0.691109,0.406228,18.0,17.0,14.5,2.730597,1.951656,2010,sldl
15,AK,sldl_2013,2.58926,1.483611,35.321061,17.843844,76.0,72.0,72.0,0.713915,0.429815,17.0,16.0,13.5,2.770711,1.98758,2013,sldl
23,AK,sldu_2010,1.765111,1.166175,15.422576,8.055906,58.0,56.0,56.0,0.574858,0.3875,17.0,15.0,12.0,2.044111,1.690583,2010,sldu
22,AK,sldu_2013,1.904483,1.298798,19.628811,10.265474,57.0,57.0,57.0,0.610794,0.425855,14.0,14.0,11.0,2.127034,1.768826,2013,sldu
7,AL,cd_2003,0.260899,1.455778,6.334495,5.49667,75.0,75.0,75.0,0.128078,0.444753,8.0,8.0,7.5,1.110824,1.978699,2003,cd
1,AL,cd_2013,0.245719,1.439114,3.877009,4.283152,76.0,75.0,75.0,0.114644,0.435923,8.0,7.0,7.0,1.110028,1.97426,2013,cd
4,AL,sldl_2011,1.953535,1.202159,80.270788,40.344893,176.0,175.0,175.0,0.614035,0.405483,39.0,39.0,44.5,2.211034,1.712648,2011,sldl
3,AL,sldl_2013,2.095181,1.335157,96.48204,48.517705,207.0,207.0,207.0,0.654196,0.445264,50.0,50.0,53.5,2.304264,1.797365,2013,sldl
10,AL,sldu_2010,1.075023,1.113324,30.823123,16.096746,113.0,113.0,113.0,0.397265,0.417031,30.0,30.0,28.0,1.579365,1.596983,2010,sldu
9,AL,sldu_2013,1.1288,1.161254,35.04372,18.20622,122.0,122.0,122.0,0.425914,0.428967,33.0,33.0,29.5,1.609059,1.637159,2013,sldu


Main helper function for the metric comparisons across all available pairs of plans

In [7]:
def metric_match_proportion(df, col1, col2):
    
    # inititalize arroy of matches, which will have 0's for non-matches and 1 for matches
    matches = []
    
    # find all (state, plan_type) pairs in the data set
    data_points = df[['state', 'plan_type']].drop_duplicates()
    
    # for each (state, plan_type) pair
    for _, row in data_points.iterrows():
        
        # slice to the pre-redistricting and post-redistricting scores
        sliced_df = df[(df['state'] == row['state']) & (df['plan_type'] == row['plan_type'])]
        assert (len(sliced_df) == 2)
        
        # find the differences reported by each metric
        metric_1_diff = sliced_df.iloc[0][col] - sliced_df.iloc[1][col1] 
        metric_2_diff = sliced_df.iloc[0][col2] - sliced_df.iloc[1][col2]
        
        # get a match number (positive = match, negative = mismatch, zero = some metric unchanged)
        match = metric_1_diff * metric_2_diff
        
        # tossing out the cases when some metric was unchanged
        if match != 0:
            
            # append 0 if mismatch, 1 if match
            matches.append(0.5 + 0.5*match/abs(match))
            
    return matches

Compare each metric to its symmetric version

In [8]:
# initialize columns of interest
cols = ['splits_pop', 'intersections_pop', 'effective_splits', 
        'conditional_entropy', 'sqrt_entropy', 'split_pairs']

# intitalize DataFrame to store stats on symmetric metric matches
sym_matches_df = pd.DataFrame()
sym_matches_df[''] = cols
sym_matches_df = sym_matches_df.set_index('')

# calculate match proportions and sample sizes
for col in cols:
    matches = metric_match_proportion(df, col, col + '_sym')
    sym_matches_df.loc[col, 'match_proportion'] = np.round(sum(matches) / len(matches), 2)
    sym_matches_df.loc[col, 'sample_size'] = len(matches)
    
sym_matches_df

Unnamed: 0,match_proportion,sample_size
,,
splits_pop,0.92,87.0
intersections_pop,1.0,108.0
effective_splits,0.98,123.0
conditional_entropy,0.97,123.0
sqrt_entropy,0.93,123.0
split_pairs,0.91,123.0


Compare all pairs of metrics (non-symmetric versions)

In [9]:
# intitalize DataFrame to store pairwise proportions of metric matches
pairs_df = pd.DataFrame()
pairs_df[''] = cols
pairs_df = pairs_df.set_index('')

# intitalize DataFrame to store pairwise sample sizes
sample_sizes_df = pd.DataFrame()
sample_sizes_df[''] = cols
sample_sizes_df = sample_sizes_df.set_index('')

# calculate match proportions and sample sizes
for col in cols:
    record = []
    for col2 in cols:
        matches = metric_match_proportion(df, col, col2)
        pairs_df.loc[col, col2] = np.round(sum(matches) / len(matches), 2)
        sample_sizes_df.loc[col, col2] = len(matches)
        
display(pairs_df)
display(sample_sizes_df)

Unnamed: 0,splits_pop,intersections_pop,effective_splits,conditional_entropy,sqrt_entropy,split_pairs
,,,,,,
splits_pop,1.0,0.87,0.78,0.62,0.6,0.68
intersections_pop,0.87,1.0,0.79,0.76,0.77,0.75
effective_splits,0.78,0.79,1.0,0.85,0.76,0.85
conditional_entropy,0.62,0.76,0.85,1.0,0.89,0.9
sqrt_entropy,0.6,0.77,0.76,0.89,1.0,0.79
split_pairs,0.68,0.75,0.85,0.9,0.79,1.0


Unnamed: 0,splits_pop,intersections_pop,effective_splits,conditional_entropy,sqrt_entropy,split_pairs
,,,,,,
splits_pop,92.0,85.0,92.0,92.0,92.0,92.0
intersections_pop,85.0,108.0,108.0,108.0,108.0,108.0
effective_splits,92.0,108.0,123.0,123.0,123.0,123.0
conditional_entropy,92.0,108.0,123.0,123.0,123.0,123.0
sqrt_entropy,92.0,108.0,123.0,123.0,123.0,123.0
split_pairs,92.0,108.0,123.0,123.0,123.0,123.0
