In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Create a hypothetical dataset
years = np.arange(2013, 2024)

# Generate sales data for Store A
store_A_sales_1 = np.random.randint(70, 75, size=5)  # First 5 years random between 70 and 80
store_A_sales_2 = np.random.randint(95, 100, size=6)  # Next 6 years random range between 95 and 100
store_B_sales = np.random.randint(60, 65, size=11)  # Store B's sales range between 70 and 80 for all years

data = {
    'store': ['A']*11 + ['B']*11,
    'year': np.concatenate([years]*2),
    'sales': np.concatenate([store_A_sales_1, store_A_sales_2, store_B_sales])
}
df = pd.DataFrame(data)
df

Unnamed: 0,store,year,sales
0,A,2013,70
1,A,2014,74
2,A,2015,72
3,A,2016,73
4,A,2017,70
5,A,2018,96
6,A,2019,95
7,A,2020,98
8,A,2021,97
9,A,2022,97


In [3]:
df['year'] = df['year'].astype(int)

# Create a treatment indicator variable
df['treatment'] = (df['store'] == 'A').astype(int)

# Create a post-treatment indicator variable
df['post'] = (df['year'] >= 2015).astype(int)

# Create the interaction term
df['treatment_post'] = df['treatment'] * df['post']

df

Unnamed: 0,store,year,sales,treatment,post,treatment_post
0,A,2013,70,1,0,0
1,A,2014,74,1,0,0
2,A,2015,72,1,1,1
3,A,2016,73,1,1,1
4,A,2017,70,1,1,1
5,A,2018,96,1,1,1
6,A,2019,95,1,1,1
7,A,2020,98,1,1,1
8,A,2021,97,1,1,1
9,A,2022,97,1,1,1


In [4]:
# Perform Difference-in-Differences analysis using OLS regression
X = df[['treatment', 'post', 'treatment_post']]
X = sm.add_constant(X)  # Add constant term for intercept
y = df['sales']

In [5]:
# Fit OLS model
model = sm.OLS(y, X).fit()

# Print regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.724
Model:                            OLS   Adj. R-squared:                  0.678
Method:                 Least Squares   F-statistic:                     15.75
Date:                Fri, 21 Jun 2024   Prob (F-statistic):           2.82e-05
Time:                        00:16:41   Log-Likelihood:                -75.862
No. Observations:                  22   AIC:                             159.7
Df Residuals:                      18   BIC:                             164.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             61.0000      5.948     10.