## Statistical Distirbutions: GerryMandering Case study

In [17]:
import pandas as pd
import scipy.stats as stats

In [18]:
#read in the DSE501A3 data and set the state abbreviations as the index column
df = pd.read_csv('gerrymandering.csv', index_col =0) 
df.head()

Unnamed: 0,AVGRepC,SDRepC,Observed,AVGIBC,SDIBC,AVGIB,SDIB,IB
AL,5.984,0.126,6,1662.906,120.154,1496.664,81.743,2129.918
AZ,5.821,0.516,6,1825.825,150.233,1741.417,136.338,2300.829
CA,12.945,1.096,11,10064.62,1296.0,7275.616,369.595,7038.968
CO,3.54,0.539,3,2035.459,160.413,1445.067,131.946,1631.681
CT,0.011,0.104,0,310.439,16.557,286.935,10.019,399.025


In [19]:
# Calculate z-scores to prep for upcoming questions
df['Z-Score'] = (df['Observed'] - df['AVGRepC']) / df['SDRepC']

### Question 1: Which state delegation has the maximum z-score? & Question 2

In [20]:
max_zscore_state = df['Z-Score'].idxmax()
z_score_for_NC = df.loc['NC', 'Z-Score']
print(f"Max Z-Score State: {max_zscore_state}")
print(f'Z-Score for NC: {round(z_score_for_NC, 3)}')

Max Z-Score State: NC
Z-Score for NC: 6.142


### Question 3: Which state delegation has the minimum z-score? & Question 4

In [21]:
min_zscore_state = df['Z-Score'].idxmin()
z_score_for_MD = df.loc['MD', 'Z-Score']
print(f"Min Z-Score State: {min_zscore_state}")
print(f'Z-Score for MD: {round(z_score_for_MD, 3)}')

Min Z-Score State: MD
Z-Score for MD: -4.072


### Question 5: Which state delegation(s) has the closest z-score to 0?

In [22]:
# Calculate the absolute differences from 0
df['Z-Score_Diff'] = abs(df['Z-Score'] - 0)

# Find states with the smallest differences (closest to 0)
min_diff = df['Z-Score_Diff'].min()
close_to_zero_states = df[df['Z-Score_Diff'] == min_diff].index.tolist()

# Print the states
print(f"States with Z-Scores Closest to 0: {', '.join(close_to_zero_states)}")


States with Z-Scores Closest to 0: IA, NV


### Question 6: Which state delegations (as drawn) are significantly different than their delegations as predicted by the gpMETIS algorithm?

In [23]:
significant_difference_threshold = 2
significant_difference_states = df[abs(df['Z-Score']) > significant_difference_threshold].index.tolist()
print(f"States with Significant Differences: {significant_difference_states}")

States with Significant Differences: ['MD', 'MI', 'NC', 'NH', 'NJ', 'OH', 'TN', 'VA', 'WI']


### Question 7: Which states favor the Democratic party (namely the Republican delegation is significantly smaller than expected)?

In [24]:
significant_difference_threshold = 0.05  # Set the significance threshold for a one-tail test (left tail)

# Find states favoring Democrats (negative z-scores for a one-tail test)
democratic_favor_states = df[df['Z-Score'] < -stats.norm.ppf(1 - significant_difference_threshold)].index.tolist()

# Print the states
print(f"States Favoring Democrats: {', '.join(democratic_favor_states)}")


States Favoring Democrats: CA, MD, NH, TN


### Question 8: Which states favor the Republican party (namely the Republican delegation is significantly larger than expected)?


In [25]:
significant_difference_threshold = 0.05  # Set the significance threshold for a one-tail test (right tail)

# Find states favoring Republicans (positive z-scores for a one-tail test)
republican_favor_states = df[df['Z-Score'] > stats.norm.ppf(1 - significant_difference_threshold)].index.tolist()

# Print the states
print(f"States Favoring Republicans: {', '.join(republican_favor_states)}")


States Favoring Republicans: FL, MI, NC, NJ, OH, PA, VA, WI


### Question 9: Given the state boundary calculations from the algorithm in both cases, which of the states has the smallest effect size to differentiate?

In [26]:
# Calculate the effect size for each state and find the smallest one
df['EffectSize'] = abs(df['AVGIBC'] - df['AVGIB'])
smallest_effect_size_states = df[df['EffectSize'] == df['EffectSize'].min()].index.tolist()
print(f"States with Smallest Effect Size: {smallest_effect_size_states}")

States with Smallest Effect Size: ['MD']


### Question 10: Given the Lehr Formula for quick sample size estimates, which states could have their simulated district internal borders without county prioritization differentiated from their county prioritized borders in fewer than 1 sample?

In [27]:
# Calculate the estimated sample sizes for each state using the Lehr Formula
alpha = 0.05  # Set the significance level
sample_size_threshold = 1.0  # Set the sample size threshold

# Create a list to store states that meet the criteria
states_with_fundamentally_separated_distributions = []

for index, row in df.iterrows():
    # Calculate the sample size using the Lehr Formula
    sample_size = 21 / ((row['EffectSize'] / row['SDIB']) ** 2)

    # Check if the sample size is less than the threshold
    if sample_size < sample_size_threshold:
        states_with_fundamentally_separated_distributions.append(index)

# Print the states
print("States with Fundamentally Separated District Internal Borders =", states_with_fundamentally_separated_distributions)


States with Fundamentally Separated District Internal Borders = ['CA', 'IA', 'MN', 'NH', 'NY', 'PA', 'SC', 'TX', 'VA']


### Question 11: Is the power calculation symmetric for the types of district drawing algorithm priorities?

In [28]:
power_threshold = 0.9

z_alpha = stats.norm.ppf(1 - alpha/2)
power_with_prioritization = 1 - stats.norm.cdf(-z_alpha - (abs(df['AVGIBC'] - df['AVGIB']) / df['SDIBC']))
power_without_prioritization = 1 - stats.norm.cdf(-z_alpha - (abs(df['AVGIBC'] - df['AVGIB']) / df['SDIB']))

mean_power_with_prioritization = power_with_prioritization.mean()
mean_power_without_prioritization = power_without_prioritization.mean()
symmetry_threshold = 0.05

power_calculation_asymmetric = abs(mean_power_with_prioritization - mean_power_without_prioritization) >= symmetry_threshold
print(f"Power Calculation: {power_calculation_asymmetric}")

Power Calculation: False


### Question 12: If the answer to the previous question is no, find the state that has the biggest difference.

In [29]:
# Calculate the critical value for the county prioritized distribution
county_prioritized_mean = df['AVGIBC']
county_prioritized_std = df['SDIBC']
county_prioritized_critical_value = stats.norm.ppf(1 - alpha)

# Calculate the critical value for the non-county prioritized distribution
non_county_prioritized_mean = df['AVGIB']
non_county_prioritized_std = df['SDIB']
non_county_prioritized_critical_value = stats.norm.ppf(1 - alpha)

# Calculate the power for both scenarios
power_with_county_prioritization = 1 - stats.norm.cdf(county_prioritized_critical_value, loc=non_county_prioritized_mean, scale=non_county_prioritized_std)
power_without_county_prioritization = 1 - stats.norm.cdf(non_county_prioritized_critical_value, loc=county_prioritized_mean, scale=county_prioritized_std)

# Calculate the power difference
power_difference = abs(power_with_county_prioritization - power_without_county_prioritization)

# Find the state with the biggest power difference
max_power_difference_state = df.index[power_difference.argmax()]

# Print the state with the biggest power difference
print(f"State with the biggest power difference: {max_power_difference_state}")


State with the biggest power difference: NV


### Question 13: Which state's boundary measures are most differentiable, namely they have the highest power in both power calculations?

In [30]:
# Find states with the highest power in both calculations
max_power_with_prioritization = power_with_prioritization.max()
max_power_without_prioritization = power_without_prioritization.max()
states_with_highest_power_both = df[
    (power_with_prioritization == max_power_with_prioritization) &
    (power_without_prioritization == max_power_without_prioritization)
]

if not states_with_highest_power_both.empty:
    states_with_highest_power_both_names = states_with_highest_power_both.index.tolist()
    print(f"States with the highest power in both calculations: {states_with_highest_power_both_names}")
else:
    print("No states with the highest power in both calculations found.")

States with the highest power in both calculations: ['NH']


### Question 14: Which state has the highest potential for Type II error between both boundary drawing procedures?

In [31]:
# Calculate Type II error for both procedures
type_2_error_with_prioritization = 1 - power_with_prioritization
type_2_error_without_prioritization = 1 - power_without_prioritization

# Calculate the combined Type II error for each state
combined_type_2_error = type_2_error_with_prioritization + type_2_error_without_prioritization

# Find the state with the highest combined Type II error
max_combined_type_2_error_state = df.index[combined_type_2_error.argmax()]

# Print the state with the highest potential for Type II error
print(f"State with the highest potential for Type II error: {max_combined_type_2_error_state}")


State with the highest potential for Type II error: TN


In [1]:
# Method 1: Using string slicing
original_string = "Hello, World!"
reversed_string = original_string[::-1]
print(reversed_string)

!dlroW ,olleH
