In [51]:
import pandas as pd
import statsmodels.api as sm
import os
import statsmodels.formula.api as smf
from statsmodels.tsa.arima.model import ARIMA

In [52]:
target_data = pd.read_csv('targetdata.csv')
policy_data = pd.read_csv('repro-policies-cleaned.csv')

In [53]:
target_data.columns = ['location',
                'year',
                'num_id', # double check that this is what it is
                'rate_for_women',
                'ratio_for_women',
                'num_providers']

In [54]:
policy_data.columns = ['location',
                'year',
                'full_policy',
                'expand',
                'restrict',
                'neutral']

In [55]:
# creating an id to keep track of policies
policy_data['policy_id'] = policy_data.index

There are missing values, so, for simplicity, they were imputed with mean.

In [56]:
# input with mean
target_data['num_providers'].fillna(target_data['num_providers'].mean(), inplace=True)
target_data['ratio_for_women'].fillna(target_data['ratio_for_women'].mean(), inplace=True)

In [57]:
policy_data['year'] = policy_data['year'].astype(int)
target_data['year'] = target_data['year'].astype(int)

# perform an outer merge to include all rows from both DataFrames
combined_data = pd.merge(policy_data, target_data, on=['location', 'year'], how='outer')

In [58]:
combined_data.to_csv('combined_data.csv')

In [59]:
data_up_to_2000 = combined_data[combined_data['year'] <= 2000]
data_from_2001 = combined_data[combined_data['year'] >= 2001]

In [78]:
data_up_to_2000.to_csv('data_up_to_2000.csv')
data_from_2001.to_csv('data_from_2001.csv')

In [60]:
data_up_to_2000.isnull().sum()

location            0
year                0
full_policy        34
expand             34
restrict           34
neutral            34
policy_id          34
num_id             14
rate_for_women     14
ratio_for_women    14
num_providers      14
dtype: int64

In [61]:
data_up_to_2000

Unnamed: 0,location,year,full_policy,expand,restrict,neutral,policy_id,num_id,rate_for_women,ratio_for_women,num_providers
0,New York,1970,"New York repealed its 1830 law, and Washington...",1.0,0.0,1.0,0.0,,,,
1,New York,1970,"On April 10, the New York Senate passes a law ...",1.0,0.0,0.0,1.0,,,,
2,Federal,1973,Roe v. Wade is a landmark decision by the Supr...,1.0,0.0,0.0,2.0,744600.0,16.3,19.3,492.8
3,Kentucky,1974,Kentucky adopts a law preventing public hospit...,0.0,0.0,1.0,3.0,,,,
4,Federal,1976,Planned Parenthood v. Danforth is a Supreme Co...,1.0,0.0,1.0,4.0,1179300.0,24.2,26.5,492.8
5,Ohio,1978,"Akron, Ohio, passes a city ordinance that rest...",0.0,1.0,0.0,5.0,,,,
6,Pennsylvania,1979,Colautti v. Franklin is a Supreme Court aborti...,0.0,0.0,0.0,6.0,,,,
7,Kentucky,1980,Kentucky adopts a law preventing public hospit...,0.0,0.0,1.0,7.0,,,,
8,Pennsylvania,1982,Pennsylvania: The Abortion Control Act is pass...,0.0,0.0,0.0,8.0,,,,
9,Kentucky,1983,Kentucky: The 1981 unlawful abortion convictio...,0.0,0.0,0.0,9.0,,,,


Given the missing indicators for some policies, the best option, under short notice, is to imput the data based on the previous and future values.

In [62]:
data_up_to_2000 = data_up_to_2000.sort_values(by=['location', 'year'])

# apply forward fill followed by backward fill within each location
data_up_to_2000['num_providers'] = data_up_to_2000.groupby('location')['num_providers'].ffill().bfill()

In [63]:
data_up_to_2000['rate_for_women'] = data_up_to_2000.groupby('location')['rate_for_women'].ffill().bfill()
data_up_to_2000['ratio_for_women'] = data_up_to_2000.groupby('location')['ratio_for_women'].ffill().bfill()

Function to create subsets of policies to ease the process of DiD analayis

In [64]:
def prepare_and_save_policy_subsets(data, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # iterate over each unique policy_id in the dataset
    for policy_id in data['policy_id'].dropna().unique():
        # filter for the specific policy
        policy_row = data[data['policy_id'] == policy_id].iloc[0]
        policy_year = policy_row['year']
        policy_state = policy_row['location']

        # filter data for the specific state and set PostPolicy
        state_data = data[data['location'] == policy_state].copy()
        state_data['PostPolicy'] = 0
        state_data.loc[state_data['year'] >= policy_year, 'PostPolicy'] = 1

        filename = f"policy_{int(policy_id)}_data.csv"
        filepath = os.path.join(output_dir, filename)
        state_data.to_csv(filepath, index=False)
        print(f"Saved: {filepath}")

In [65]:
prepare_and_save_policy_subsets(data_up_to_2000, '/content/policy_subsets/')

Saved: /content/policy_subsets/policy_2_data.csv
Saved: /content/policy_subsets/policy_4_data.csv
Saved: /content/policy_subsets/policy_14_data.csv
Saved: /content/policy_subsets/policy_16_data.csv
Saved: /content/policy_subsets/policy_17_data.csv
Saved: /content/policy_subsets/policy_20_data.csv
Saved: /content/policy_subsets/policy_3_data.csv
Saved: /content/policy_subsets/policy_7_data.csv
Saved: /content/policy_subsets/policy_9_data.csv
Saved: /content/policy_subsets/policy_12_data.csv
Saved: /content/policy_subsets/policy_19_data.csv
Saved: /content/policy_subsets/policy_0_data.csv
Saved: /content/policy_subsets/policy_1_data.csv
Saved: /content/policy_subsets/policy_11_data.csv
Saved: /content/policy_subsets/policy_18_data.csv
Saved: /content/policy_subsets/policy_5_data.csv
Saved: /content/policy_subsets/policy_10_data.csv
Saved: /content/policy_subsets/policy_6_data.csv
Saved: /content/policy_subsets/policy_8_data.csv
Saved: /content/policy_subsets/policy_13_data.csv
Saved: /co

In [66]:
policy = pd.read_csv('/content/policy_subsets/policy_11_data.csv')
policy

Unnamed: 0,location,year,full_policy,expand,restrict,neutral,policy_id,num_id,rate_for_women,ratio_for_women,num_providers,PostPolicy
0,New York,1970,"New York repealed its 1830 law, and Washington...",1.0,0.0,1.0,0.0,,45.7,27.239286,289.0,0
1,New York,1970,"On April 10, the New York Senate passes a law ...",1.0,0.0,0.0,1.0,,45.7,27.239286,289.0,0
2,New York,1984,"New York: In People v. Liberta, judge Sol Wach...",0.0,0.0,0.0,11.0,,45.7,27.239286,289.0,1
3,New York,1992,,,,,,195390.0,45.7,27.239286,289.0,1
4,New York,1996,,,,,,167600.0,39.7,27.239286,266.0,1
5,New York,1997,Schenck v. Pro-Choice Network of Western New Y...,0.0,0.0,0.0,18.0,,39.7,27.239286,266.0,1
6,New York,2000,,,,,,164630.0,39.1,27.239286,234.0,1


Function for DiD analysis

In [67]:
def run_pre_post_analysis(data, policy_id, policy_year, policy_state, outcome_var):

    X = sm.add_constant(data[['PostPolicy']])
    y = data[outcome_var]
    model = sm.OLS(y, X, missing='drop').fit()

    return model.params.get('PostPolicy', None)


In [74]:
did_results = pd.DataFrame(columns=['policy_id', 'DiD_rate_for_women', 'DiD_num_providers'])

In [69]:
base_directory = '/content/policy_subsets/'

Iterates through each policy subset of data, then runs the DiD analysis and saves the results to a new dataframe.

In [None]:
did_results = pd.DataFrame(columns=['policy_id', 'DiD_rate_for_women', 'DiD_num_providers'])

file_paths = [f'policy_{i}_data.csv' for i in range(0, 21)]
for file_path in file_paths:
    policy_id = int(file_path.split('_')[1])
    full_path = base_directory + file_path
    policy_data = pd.read_csv(full_path)
    policy_year = policy_data['year'].min()
    policy_state = policy_data['location'].iloc[0]

    did_rate_coeff = run_pre_post_analysis(policy_data, policy_id, policy_year, policy_state, 'rate_for_women')
    did_providers_coeff = run_pre_post_analysis(policy_data, policy_id, policy_year, policy_state, 'num_providers')

    did_results = did_results.append({'policy_id': policy_id, 'DiD_rate_for_women': did_rate_coeff, 'DiD_num_providers': did_providers_coeff}, ignore_index=True)

In [76]:
did_results

Unnamed: 0,policy_id,DiD_rate_for_women,DiD_num_providers
0,0.0,43.042857,274.571429
1,1.0,43.042857,274.571429
2,2.0,25.089655,657.372414
3,3.0,10.1625,8.0
4,4.0,6.680769,183.561538
5,5.0,18.42,41.4
6,6.0,17.283333,76.333333
7,7.0,-1.414286,-1.142857
8,8.0,-1.58,-5.6
9,9.0,-1.65,-1.333333


In [77]:
did_results.to_csv('did_results.csv')

## Considerations

1. Varied Post-Policy Durations: Each policy in the data set has a different post-policy observation period. Some policies might have effects measured over a few years, while others could span decades. This variation complicates making uniform time-bound predictions for new policies.
2. Non-Uniform Policy Effects: The impact of policies on your outcome variable (e.g., rate of women accessing abortion services) is not uniform. Some policies might lead to immediate changes, while others have gradual impacts that become apparent over time.
3. Relative Impact Interpretation: The DiD analysis provides the **relative impact of policies** compared to a counterfactual scenario where the policy was not implemented. This relative impact reflects changes attributed directly to the policy within the observed data but doesn't inherently specify a timeframe for predicted future impacts

**Example Interpretation:** Based on our analysis of historical policies, we estimate that implementing this policy led to a decrease of approximately 3.0% in the rate of women accessing abortion services. This estimate reflects the immediate and overall impact observed in past cases, considering the diverse conditions and durations of those policies.

While our model provides a directional estimate of the policy's effect, the exact timeframe over which this change unfolds may vary depending on specific policy characteristics and contextual factors not captured in our historical dataset.

## Current Limitations:
To understand the impact of policies on the rate of women accessing abortion services, our analysis draws on historical data to estimate the relative change resulting from policy implementations. While we can identify whether policies are likely to lead to increases or decreases, the precise timing—when these changes will fully materialize—and their duration remain uncertain. This uncertainty stems from the diverse nature of policies in our dataset, each with unique characteristics and varying periods of observation. Consequently, our current models provide directional guidance on policy effects rather than specific time-bound predictions. Moving forward, incorporating more detailed data and advanced modeling techniques could enhance our ability to make time-specific forecasts.