In [None]:
#Python Code for Master Thesis of Kiara Schröder
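#This notebook contains the code I used to run my difference-in-differences (DiD) regression analysis. I ran the analysis step by step to
#see how adding further fixed effects and controls affects the fit of my model. The models have the following specifications:
#Model 1: Basic DiD without controls
#Model 2: DiD with subreddit fixed effects
#Model 3: DiD with daily fixed effects
#Model 4: DiD with log_avg_account_age as a control
#Model 5: DiD with log_post_score and log_avg_account_age as controls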



#Imports
import pandas as pd
import statsmodels.formula.api as smf
import os
import numpy as np

In [None]:
# Model 1: Basic DiD without controls
#
# Fits one OLS regression per dependent variable (HC3 robust standard errors)
# and writes the estimate of the DiD interaction term to a CSV summary table.
#
# NOTE(review): the formula below includes time fixed effects (`time_fe`) and is
# therefore the same specification as the Model 3 cell — confirm that this is
# the intended "basic" model.


# === Step 1: Configuration ===
run_polarization = False  # Set to True for polarization analysis

if run_polarization:
    input_filepath = "PATH/TO/your_polarization_data.csv"
    output_dir = "did_results_model1_polarization"
    dependent_vars = [
        'avg_compound_score', 'sentiment_variance',
        'log_avg_us_count', 'log_avg_them_count'
    ]
else:
    input_filepath = "PATH/TO/your_engagement_data.csv"
    output_dir = "did_results_model1_engagement"
    dependent_vars = [
        'log_actual_comment_count', 'log_avg_childcom_depth',
        'log_unique_authors', 'log_avg_word_count'
    ]

# Name of the coefficient that carries the DiD estimate in the fitted models.
did_term = 'treatment:Period'

os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load Data ===
df = pd.read_csv(input_filepath)

# === Step 3: Time Fixed Effects ===
# Use one dummy per calendar day when a timestamp column is available;
# otherwise fall back to one dummy per value of `Period`.
if 'created_utc' in df.columns:
    # NOTE(review): if `created_utc` holds numeric Unix epoch seconds, this call
    # needs `unit='s'` to yield real calendar dates — confirm the column format.
    df['datetime'] = pd.to_datetime(df['created_utc'], errors='coerce')
    # errors='coerce' turns unparseable values into NaT; report how many rows
    # are affected instead of letting them pass unnoticed.
    n_unparsed = int(df['datetime'].isna().sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} rows have a missing or unparseable 'created_utc'")
    df['date'] = df['datetime'].dt.date
    time_fe = "C(date)"
else:
    time_fe = "C(Period)"

# === Step 4: Validate Variables ===
# Only outcomes that exist in the data are estimated; the rest are reported.
existing = [v for v in dependent_vars if v in df.columns]
missing = [v for v in dependent_vars if v not in df.columns]
if missing:
    print(f"Warning: These variables are missing and will be skipped: {missing}")

# === Step 5: Run Model 1 ===
model_results = {}
for var in existing:
    formula = f"{var} ~ treatment:Period + C(treatment) + {time_fe}"
    model = smf.ols(formula, data=df).fit(cov_type='HC3')
    model_results[var] = model


# === Step 6: Save Results ===
def format_p_value(p):
    """Format a p-value with four decimals and significance stars.

    Stars: *** for p < 0.01, ** for p < 0.05, * for p < 0.1, none otherwise.
    """
    if p < 0.01:
        marker = "***"
    elif p < 0.05:
        marker = "**"
    elif p < 0.1:
        marker = "*"
    else:
        marker = ""
    return f"{p:.4f}{marker}"


if model_results:
    fitted_models = list(model_results.values())
    summary = pd.DataFrame({
        'Dependent Variable': list(model_results.keys()),
        'Treatment Effect': [m.params[did_term] for m in fitted_models],
        'Standard Error': [m.bse[did_term] for m in fitted_models],
        'p-value': [m.pvalues[did_term] for m in fitted_models],
        'R-squared': [m.rsquared for m in fitted_models]
    })
    summary['p-value'] = summary['p-value'].apply(format_p_value)
    summary.to_csv(os.path.join(output_dir, "did_results_model1.csv"), index=False)
else:
    # Nothing was estimated, so do not write a header-only results file.
    print("No models successfully estimated.")

In [None]:
# Model 2: DiD with Subreddit Fixed Effects
#
# Fits one OLS regression per dependent variable with HC3 robust standard
# errors, prints the DiD estimate, and exports a summary table plus the full
# statsmodels summary of every fitted model.
#
# NOTE(review): the title mentions subreddit fixed effects, but the formula
# used below contains no subreddit term — confirm the intended specification.


# === Step 1: Configuration ===

# True -> polarization outcomes, False -> user engagement outcomes
run_polarization = False

if not run_polarization:
    input_filepath = "PATH/TO/your_engagement_data.csv"
    output_dir = "did_results_model2_engagement"
    dependent_vars = [
        'log_actual_comment_count',
        'log_avg_childcom_depth',
        'log_unique_authors',
        'log_avg_word_count',
    ]
else:
    input_filepath = "PATH/TO/your_polarization_data.csv"
    output_dir = "did_results_model2_polarization"
    dependent_vars = [
        'avg_compound_score',
        'sentiment_variance',
        'log_avg_us_count',
        'log_avg_them_count',
    ]

# Create the output folder up front; an existing folder is reused.
os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load Data ===
print(f"Loading data from {input_filepath}...")
df = pd.read_csv(input_filepath)
print(f"Loaded {len(df)} observations")

# === Step 3: Validate Variables ===
# Keep only the outcomes that are present as columns and report the others.
existing = [name for name in dependent_vars if name in df.columns]
if len(dependent_vars) > len(existing):
    missing = list(set(dependent_vars) - set(existing))
    print(f"Warning: These variables are missing and will be skipped: {missing}")
    dependent_vars = existing

# === Step 4: Run Regressions ===
# Coefficient name of the DiD interaction in the fitted models.
did_term = 'treatment:Period'
model_results = {}

for var in dependent_vars:
    formula = f"{var} ~ Period + treatment:Period + C(treatment)"
    print(f"\nRunning Model 2 for {var}:\nFormula: {formula}")

    try:
        fitted = smf.ols(formula, data=df).fit(cov_type="HC3")
        # Store the fit before reporting, so it is kept even if reporting fails.
        model_results[var] = fitted

        print(f"  Treatment effect: {fitted.params[did_term]:.4f}")
        print(f"  p-value: {fitted.pvalues[did_term]:.4f}")
        print(f"  R²: {fitted.rsquared:.4f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next outcome.
        print(f"  Failed on {var}: {e}")

# === Step 5: Summarize Results ===
if not model_results:
    print("No models successfully estimated.")
else:
    fits = list(model_results.values())
    summary_df = pd.DataFrame({
        'Dependent Variable': list(model_results.keys()),
        'Treatment Effect': [fit.params[did_term] for fit in fits],
        'Standard Error': [fit.bse[did_term] for fit in fits],
        't-statistic': [fit.tvalues[did_term] for fit in fits],
        'p-value': [fit.pvalues[did_term] for fit in fits],
        'R-squared': [fit.rsquared for fit in fits],
        'Adjusted R-squared': [fit.rsquared_adj for fit in fits],
        'N': [fit.nobs for fit in fits],
    })

    def add_stars(p):
        """Return p with four decimals followed by significance stars.

        *** for p < 0.01, ** for p < 0.05, * for p < 0.1, no star otherwise.
        """
        if p < 0.01:
            marker = "***"
        elif p < 0.05:
            marker = "**"
        elif p < 0.1:
            marker = "*"
        else:
            marker = ""
        return f"{p:.4f}{marker}"

    summary_df['p-value'] = summary_df['p-value'].apply(add_stars)

    output_csv = os.path.join(output_dir, "did_results_model2.csv")
    summary_df.to_csv(output_csv, index=False)
    print(f"\nSaved summary table to: {output_csv}")

    # One text file per outcome with the complete regression output
    for var, model in model_results.items():
        path = os.path.join(output_dir, f"model2_summary_{var}.txt")
        with open(path, "w") as handle:
            handle.write(model.summary().as_text())
        print(f"Saved full summary for {var}")

print("\nModel 2 analysis complete.")

In [None]:
# Model 3: DiD with Daily Fixed Effects for Separate Input Files
#
# Fits one OLS regression per dependent variable with HC3 robust standard
# errors and time fixed effects, prints the DiD estimate, and exports a summary
# table plus the full statsmodels summary of every fitted model.


# === Step 1: Configuration ===

# True -> polarization outcomes, False -> user engagement outcomes
run_polarization = False

# Input file, output folder and outcome list for the selected analysis
if not run_polarization:
    input_filepath = "PATH/TO/your_engagement_data.csv"
    output_dir = "did_results_model3_engagement"
    dependent_vars = [
        'log_actual_comment_count',
        'log_avg_childcom_depth',
        'log_unique_authors',
        'log_avg_word_count',
    ]
else:
    input_filepath = "PATH/TO/your_polarization_data.csv"
    output_dir = "did_results_model3_polarization"
    dependent_vars = [
        'avg_compound_score',
        'sentiment_variance',
        'log_avg_us_count',
        'log_avg_them_count',
    ]

# Create the output folder up front; an existing folder is reused.
os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load Data ===
print(f"Loading data from {input_filepath}...")
df = pd.read_csv(input_filepath)
print(f"Loaded {len(df)} observations")

# === Step 3: Create Date-Based Fixed Effects ===
# Without a timestamp column, fall back to one dummy per value of `Period`.
if 'created_utc' not in df.columns:
    time_fe = "C(Period)"
    print("Falling back to period-based fixed effects")
else:
    # Unparseable timestamps become NaT because of errors='coerce'.
    # NOTE(review): if `created_utc` holds numeric Unix epoch seconds, this call
    # needs `unit='s'` to yield real calendar dates — confirm the column format.
    df['datetime'] = pd.to_datetime(df['created_utc'], errors='coerce')
    df['date'] = df['datetime'].dt.date
    time_fe = "C(date)"
    print(f"Using daily fixed effects with {df['date'].nunique()} unique days")

# === Step 4: Validate Variables ===
# Keep only the outcomes that are present as columns and report the others.
existing = [name for name in dependent_vars if name in df.columns]
if len(dependent_vars) > len(existing):
    missing = list(set(dependent_vars) - set(existing))
    print(f"Warning: These variables are missing and will be skipped: {missing}")
    dependent_vars = existing

# === Step 5: Run Regressions ===
# Coefficient name of the DiD interaction in the fitted models.
did_term = 'treatment:Period'
model_results = {}

for var in dependent_vars:
    formula = f"{var} ~ treatment:Period + C(treatment) + {time_fe}"
    print(f"\nRunning Model 3 for {var}:\nFormula: {formula}")

    try:
        fitted = smf.ols(formula, data=df).fit(cov_type="HC3")
        # Store the fit before reporting, so it is kept even if reporting fails.
        model_results[var] = fitted

        print(f"  Treatment effect: {fitted.params[did_term]:.4f}")
        print(f"  p-value: {fitted.pvalues[did_term]:.4f}")
        print(f"  R²: {fitted.rsquared:.4f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next outcome.
        print(f"  Failed on {var}: {e}")

# === Step 6: Export Results ===
if not model_results:
    print("No models successfully estimated")
else:
    fits = list(model_results.values())
    summary_df = pd.DataFrame({
        'Dependent Variable': list(model_results.keys()),
        'Treatment Effect': [fit.params[did_term] for fit in fits],
        'Standard Error': [fit.bse[did_term] for fit in fits],
        't-statistic': [fit.tvalues[did_term] for fit in fits],
        'p-value': [fit.pvalues[did_term] for fit in fits],
        'R-squared': [fit.rsquared for fit in fits],
        'Adjusted R-squared': [fit.rsquared_adj for fit in fits],
        'N': [fit.nobs for fit in fits],
    })

    def add_stars(p):
        """Return p with four decimals followed by significance stars.

        *** for p < 0.01, ** for p < 0.05, * for p < 0.1, no star otherwise.
        """
        if p < 0.01:
            marker = "***"
        elif p < 0.05:
            marker = "**"
        elif p < 0.1:
            marker = "*"
        else:
            marker = ""
        return f"{p:.4f}{marker}"

    summary_df['p-value'] = summary_df['p-value'].apply(add_stars)

    output_csv = os.path.join(output_dir, "did_results_model3.csv")
    summary_df.to_csv(output_csv, index=False)
    print(f"\nSaved summary table to: {output_csv}")

    # One text file per outcome with the complete regression output
    for var, model in model_results.items():
        path = os.path.join(output_dir, f"model3_summary_{var}.txt")
        with open(path, "w") as handle:
            handle.write(model.summary().as_text())
        print(f"Saved full summary for {var}")

print("\nModel 3 analysis complete.")

In [None]:
# Model 4: DiD with log_avg_account_age
#
# Fits one OLS regression per dependent variable with HC3 robust standard
# errors, time fixed effects and `log_avg_account_age` as a control, then
# exports a summary table plus the full statsmodels summary of every fit.


# === Step 1: Configuration ===
run_polarization = False  # Set to True for polarization analysis

if run_polarization:
    input_filepath = "PATH/TO/your_polarization_data.csv"
    output_dir = "did_results_model4_polarization"
    dependent_vars = [
        'avg_compound_score',
        'sentiment_variance',
        'log_avg_us_count',
        'log_avg_them_count'
    ]
else:
    input_filepath = "PATH/TO/your_engagement_data.csv"
    output_dir = "did_results_model4_engagement"
    dependent_vars = [
        'log_actual_comment_count',
        'log_avg_childcom_depth',
        'log_unique_authors',
        'log_avg_word_count'
    ]

# Name of the coefficient that carries the DiD estimate in the fitted models.
did_term = 'treatment:Period'

os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load Data ===
print(f"Loading data from {input_filepath}...")
df = pd.read_csv(input_filepath)
print(f"Loaded {len(df)} observations")

# === Step 3: Time Fixed Effects ===
# Use one dummy per calendar day when a timestamp column is available;
# otherwise fall back to one dummy per value of `Period`.
if 'created_utc' in df.columns:
    # NOTE(review): if `created_utc` holds numeric Unix epoch seconds, this call
    # needs `unit='s'` to yield real calendar dates — confirm the column format.
    df['datetime'] = pd.to_datetime(df['created_utc'], errors='coerce')
    df['date'] = df['datetime'].dt.date
    time_fe = "C(date)"
else:
    time_fe = "C(Period)"

# === Step 4: Controls and Check Variables ===
control_vars = ['log_avg_account_age']
controls = " + ".join(control_vars)

# A missing control makes every regression below fail, so flag it up front.
missing_controls = [c for c in control_vars if c not in df.columns]
if missing_controls:
    print(f"Warning: control variable(s) not found in the data: {missing_controls}")

# Only outcomes that exist in the data are estimated; the rest are reported
# instead of being dropped silently.
existing_vars = [v for v in dependent_vars if v in df.columns]
missing_vars = [v for v in dependent_vars if v not in df.columns]
if missing_vars:
    print(f"Warning: These variables are missing and will be skipped: {missing_vars}")
dependent_vars = existing_vars

# === Step 5: Run Model 4 ===
model_results = {}

for var in dependent_vars:
    formula = f"{var} ~ treatment:Period + C(treatment) + {time_fe} + {controls}"
    print(f"\nRunning Model 4 for {var}:\nFormula: {formula}")

    try:
        model = smf.ols(formula, data=df).fit(cov_type='HC3')
        model_results[var] = model

        print(f"  Treatment effect: {model.params[did_term]:.4f}")
        print(f"  p-value: {model.pvalues[did_term]:.4f}")
        print(f"  R²: {model.rsquared:.4f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next outcome.
        print(f"  Error modeling {var}: {e}")


# === Step 6: Save Results ===
def stars(p):
    """Format a p-value with four decimals and significance stars.

    Stars: *** for p < 0.01, ** for p < 0.05, * for p < 0.1, none otherwise.
    """
    if p < 0.01:
        marker = "***"
    elif p < 0.05:
        marker = "**"
    elif p < 0.1:
        marker = "*"
    else:
        marker = ""
    return f"{p:.4f}{marker}"


if model_results:
    fitted_models = list(model_results.values())
    summary = pd.DataFrame({
        'Dependent Variable': list(model_results.keys()),
        'Treatment Effect': [m.params[did_term] for m in fitted_models],
        'Standard Error': [m.bse[did_term] for m in fitted_models],
        'p-value': [m.pvalues[did_term] for m in fitted_models],
        'R-squared': [m.rsquared for m in fitted_models]
    })
    summary['p-value'] = summary['p-value'].apply(stars)

    output_csv = os.path.join(output_dir, "did_results_model4.csv")
    summary.to_csv(output_csv, index=False)
    print(f"\nSaved summary table to: {output_csv}")

    # One text file per outcome with the complete regression output
    for var, model in model_results.items():
        with open(os.path.join(output_dir, f"model4_summary_{var}.txt"), "w") as f:
            f.write(model.summary().as_text())
else:
    # Make it visible that nothing was written to the output folder.
    print("No models successfully estimated.")

print("\nModel 4 complete.")

In [None]:
# Model 5: DiD with log_post_score and log_avg_account_age
#
# Fits one OLS regression per dependent variable with HC3 robust standard
# errors, time fixed effects and two controls (`log_post_score`,
# `log_avg_account_age`), then exports a summary table plus the full
# statsmodels summary of every fit.


# === Step 1: Configuration ===
run_polarization = False  # Set to True for polarization analysis

if run_polarization:
    input_filepath = "PATH/TO/your_polarization_data.csv"
    output_dir = "did_results_model5_polarization"
    dependent_vars = [
        'avg_compound_score',
        'sentiment_variance',
        'log_avg_us_count',
        'log_avg_them_count'
    ]
else:
    input_filepath = "PATH/TO/your_engagement_data.csv"
    output_dir = "did_results_model5_engagement"
    dependent_vars = [
        'log_actual_comment_count',
        'log_avg_childcom_depth',
        'log_unique_authors',
        'log_avg_word_count'
    ]

# Name of the coefficient that carries the DiD estimate in the fitted models.
did_term = 'treatment:Period'

os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load Data ===
print(f"Loading data from {input_filepath}...")
df = pd.read_csv(input_filepath)
print(f"Loaded {len(df)} observations")

# === Step 3: Time Fixed Effects ===
# Use one dummy per calendar day when a timestamp column is available;
# otherwise fall back to one dummy per value of `Period`.
if 'created_utc' in df.columns:
    # NOTE(review): if `created_utc` holds numeric Unix epoch seconds, this call
    # needs `unit='s'` to yield real calendar dates — confirm the column format.
    df['datetime'] = pd.to_datetime(df['created_utc'], errors='coerce')
    df['date'] = df['datetime'].dt.date
    time_fe = "C(date)"
else:
    time_fe = "C(Period)"

# === Step 4: Controls and Dependent Vars ===
# Build the control part of the formula from a list so that the individual
# column names can be checked against the data.
control_vars = ['log_post_score', 'log_avg_account_age']
controls = " + ".join(control_vars)

# A missing control makes every regression below fail, so flag it up front.
missing_controls = [c for c in control_vars if c not in df.columns]
if missing_controls:
    print(f"Warning: control variable(s) not found in the data: {missing_controls}")

# Only outcomes that exist in the data are estimated; the rest are reported
# instead of being dropped silently.
existing_vars = [v for v in dependent_vars if v in df.columns]
missing_vars = [v for v in dependent_vars if v not in df.columns]
if missing_vars:
    print(f"Warning: These variables are missing and will be skipped: {missing_vars}")
dependent_vars = existing_vars

# === Step 5: Run Model 5 ===
model_results = {}

for var in dependent_vars:
    formula = f"{var} ~ treatment:Period + C(treatment) + {time_fe} + {controls}"
    print(f"\nRunning Model 5 for {var}:\nFormula: {formula}")

    try:
        model = smf.ols(formula, data=df).fit(cov_type='HC3')
        model_results[var] = model

        print(f"  Treatment effect: {model.params[did_term]:.4f}")
        print(f"  p-value: {model.pvalues[did_term]:.4f}")
        print(f"  R²: {model.rsquared:.4f}")

    except Exception as e:
        # Best effort: report the failure and continue with the next outcome.
        print(f"  Error modeling {var}: {e}")


# === Step 6: Save Results ===
def stars(p):
    """Format a p-value with four decimals and significance stars.

    Stars: *** for p < 0.01, ** for p < 0.05, * for p < 0.1, none otherwise.
    """
    if p < 0.01:
        marker = "***"
    elif p < 0.05:
        marker = "**"
    elif p < 0.1:
        marker = "*"
    else:
        marker = ""
    return f"{p:.4f}{marker}"


if model_results:
    fitted_models = list(model_results.values())
    summary = pd.DataFrame({
        'Dependent Variable': list(model_results.keys()),
        'Treatment Effect': [m.params[did_term] for m in fitted_models],
        'Standard Error': [m.bse[did_term] for m in fitted_models],
        'p-value': [m.pvalues[did_term] for m in fitted_models],
        'R-squared': [m.rsquared for m in fitted_models]
    })
    summary['p-value'] = summary['p-value'].apply(stars)

    output_csv = os.path.join(output_dir, "did_results_model5.csv")
    summary.to_csv(output_csv, index=False)
    print(f"\nSaved summary table to: {output_csv}")

    # One text file per outcome with the complete regression output
    for var, model in model_results.items():
        with open(os.path.join(output_dir, f"model5_summary_{var}.txt"), "w") as f:
            f.write(model.summary().as_text())
else:
    # Make it visible that nothing was written to the output folder.
    print("No models successfully estimated.")

print("\nModel 5 complete.")