In [11]:
import pyblp
import numpy as np
import pandas as pd
import statsmodels.api as sm

pyblp.options.digits = 2
pyblp.options.verbose = False
pyblp.__version__

# Load the data 
sales_data = pd.read_csv("OTC_Sales.csv", header=0, sep='\t')
print(sales_data.head())

demo_data = pd.read_csv("OTC_Demographics.csv")

   store  week  brand  sales_  count  price_  prom_  cost_
0      2     1      1      16  14181    3.29    0.0   2.06
1      2     2      1      12  13965    3.27    0.0   2.04
2      2     3      1       6  13538    3.37    0.0   2.15
3      2     4      1      12  13735    3.30    0.0   2.07
4      2     5      1      10  13735    3.34    0.0   2.12


In [12]:
#QUESTION 1
#Now, well start the calculations for the Sum Stats Table. 
#Market Shares 
#Note- for this table I want the market shares to be meaningful in terms of one another (headache medicine). 
# total sales per store-week 
sales_data["total_sales"] = sales_data.groupby(["store", "week"])["sales_"].transform("sum")

# market share: sales of product i / total sales in the store-week
sales_data["market_share"] = sales_data["sales_"] / sales_data["total_sales"]

# Calculate market share for each product (brand)
market_share = sales_data.groupby("brand")["market_share"].mean()

# Print the market share for each product
for brand, share in market_share.items():
    print(f"Market Share of Product_{brand}: {share:.2%}")


Market Share of Product_1: 14.88%
Market Share of Product_2: 17.76%
Market Share of Product_3: 11.65%
Market Share of Product_4: 12.01%
Market Share of Product_5: 7.71%
Market Share of Product_6: 3.60%
Market Share of Product_7: 4.33%
Market Share of Product_8: 3.47%
Market Share of Product_9: 8.16%
Market Share of Product_10: 9.30%
Market Share of Product_11: 7.13%


In [13]:
#Unit Price (price per 1 bottle of a product)
unit_price = sales_data.groupby("brand")["price_"].mean()

# average unit price for each product 
for product, price in unit_price.items():
    print(f"Average Unit Price of Product_{product}: {price:.2f}")


Average Unit Price of Product_1: 3.42
Average Unit Price of Product_2: 4.94
Average Unit Price of Product_3: 7.02
Average Unit Price of Product_4: 2.96
Average Unit Price of Product_5: 5.15
Average Unit Price of Product_6: 8.16
Average Unit Price of Product_7: 2.67
Average Unit Price of Product_8: 3.61
Average Unit Price of Product_9: 3.97
Average Unit Price of Product_10: 1.93
Average Unit Price of Product_11: 4.45


In [14]:
#price per 100 tablets 
price_per_100 = {}

# Tablets per bottle for each product 
tablets_per_bottle = {
    1: 25,  # Tylenol 25
    2: 50,  # Tylenol 50
    3: 100,  # Tylenol 100
    4: 25,  # Advil 25
    5: 50,  # Advil 50
    6: 100,  # Advil 100
    7: 25,  # Bayer 25
    8: 50,  # Bayer 50
    9: 100,  # Bayer 100
    10: 50,  # Store Brand 50
    11: 100  # Store Brand 100
}

# Calculate price per 100 tablets based on the number of tablets per bottle
for product, price in unit_price.items():
    if tablets_per_bottle[product] == 25:
        price_per_100[product] = price * 4  
    elif tablets_per_bottle[product] == 50:
        price_per_100[product] = price * 2  
    elif tablets_per_bottle[product] == 100:
        price_per_100[product] = price  

for product, price in price_per_100.items():
    print(f"Price per 100 tablets of Product_{product}: {price:.2f}")


Price per 100 tablets of Product_1: 13.68
Price per 100 tablets of Product_2: 9.88
Price per 100 tablets of Product_3: 7.02
Price per 100 tablets of Product_4: 11.85
Price per 100 tablets of Product_5: 10.29
Price per 100 tablets of Product_6: 8.16
Price per 100 tablets of Product_7: 10.69
Price per 100 tablets of Product_8: 7.21
Price per 100 tablets of Product_9: 3.97
Price per 100 tablets of Product_10: 3.86
Price per 100 tablets of Product_11: 4.45


In [15]:
#unit wholesale price (this will be the avg cost of each product)
unit_wholesale_price = sales_data.groupby("brand")["cost_"].mean()

for product, cost in unit_wholesale_price.items():
    print(f"Average Wholesale Price of Product_{product}: {cost:.2f}")


Average Wholesale Price of Product_1: 2.18
Average Wholesale Price of Product_2: 3.67
Average Wholesale Price of Product_3: 5.75
Average Wholesale Price of Product_4: 2.03
Average Wholesale Price of Product_5: 3.62
Average Wholesale Price of Product_6: 6.09
Average Wholesale Price of Product_7: 1.85
Average Wholesale Price of Product_8: 2.42
Average Wholesale Price of Product_9: 3.71
Average Wholesale Price of Product_10: 0.91
Average Wholesale Price of Product_11: 1.92


In [16]:
# QUESTION 2
# part a
# To calculate market share Sij, im going to calculate the total market size. 
# Sij will be the number who buy product i in market j /market size (count)
sales_data["market_share"] = sales_data["sales_"] / sales_data["count"]

# outside option share S0j = (any buyer - buyers of headache medicine) / market size
sales_data["outside_share"] = 1 - sales_data.groupby(["store", "week"])["market_share"].transform("sum")

# log market shares (log(s_ij) - log(s_0j))
sales_data["logit_share"] = np.log(sales_data["market_share"]) - np.log(sales_data["outside_share"])

print(sales_data[['store', 'week', 'brand', 'market_share', 'outside_share', 'logit_share']].head())

# dependent variable (logit-transformed market share)
y = sales_data["logit_share"]

# prodcut characteristics price and promotion
X = sales_data[["price_", "prom_"]]  
X = sm.add_constant(X)  

# OLS regression model
ols_model = sm.OLS(y, X).fit()

# Step 4: Print the summary of the OLS regression results
print(ols_model.summary())


   store  week  brand  market_share  outside_share  logit_share
0      2     1      1      0.001128       0.993724    -6.780774
1      2     2      1      0.000859       0.993913    -7.053298
2      2     3      1      0.000443       0.995199    -7.716683
3      2     4      1      0.000874       0.993957    -7.036735
4      2     5      1      0.000728       0.995268    -7.220374
                            OLS Regression Results                            
Dep. Variable:            logit_share   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     315.2
Date:                Mon, 10 Feb 2025   Prob (F-statistic):          1.72e-136
Time:                        21:41:51   Log-Likelihood:                -50711.
No. Observations:               38544   AIC:                         1.014e+05
Df Residuals:                   38541   BIC:                   

In [17]:
# Convert 'product' to numeric (if it's meant to be 1-11)
sales_data["brand"] = pd.to_numeric(sales_data["brand"], errors="coerce")

# Check if the conversion worked
print(sales_data["brand"].dtype)  # Should now be int64 or float64

print(sales_data.dtypes)
print(product_dummies.dtypes)


int64
store              int64
week               int64
brand              int64
sales_             int64
count              int64
price_           float64
prom_            float64
cost_            float64
total_sales        int64
market_share     float64
outside_share    float64
logit_share      float64
dtype: object
brand_2     int64
brand_3     int64
brand_4     int64
brand_5     int64
brand_6     int64
brand_7     int64
brand_8     int64
brand_9     int64
brand_10    int64
brand_11    int64
dtype: object


In [18]:
# part b
# Step 1: Create product fixed effects using brand column
product_dummies = pd.get_dummies(sales_data["brand"], prefix="brand", drop_first=True)

# Convert boolean dummies to integers
product_dummies = product_dummies.astype(int)

# Step 2: Define the dependent variable (logit-transformed market share)
y = sales_data["logit_share"]

# Step 3: Define the independent variables (price, promotion, and product fixed effects)
X = sales_data[["price_", "prom_"]]  # Include price and promotion
X = pd.concat([X, product_dummies], axis=1)  # Add product fixed effects (brand dummies)
X = sm.add_constant(X)  # Add intercept

# Step 4: Fit the OLS regression model
ols_model = sm.OLS(y, X).fit()

# Step 5: Print the regression results
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:            logit_share   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.460
Method:                 Least Squares   F-statistic:                     2733.
Date:                Mon, 10 Feb 2025   Prob (F-statistic):               0.00
Time:                        21:41:58   Log-Likelihood:                -39156.
No. Observations:               38544   AIC:                         7.834e+04
Df Residuals:                   38531   BIC:                         7.845e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.0709      0.036   -166.886      0.0

In [19]:
# Step 1: Convert product and store to numeric
sales_data["brand"] = pd.to_numeric(sales_data["brand"], errors="coerce")
sales_data["store"] = pd.to_numeric(sales_data["store"], errors="coerce")

# Step 2: Create product fixed effects using product column
product_dummies = pd.get_dummies(sales_data["brand"], prefix="brand", drop_first=True)

# Convert boolean dummies to integer (0/1)
product_dummies = product_dummies.astype(int)

# Step 3: Create product-store interaction variable
sales_data["product_store"] = sales_data["brand"].astype(str) + "_" + sales_data["store"].astype(str)

# Step 4: Create product-store fixed effects using the new interaction variable
product_store_dummies = pd.get_dummies(sales_data["product_store"], prefix="prod_store", drop_first=True)

# Convert boolean dummies to integer (0/1)
product_store_dummies = product_store_dummies.astype(int)

# Step 5: Define the dependent variable (logit-transformed market share)
y = sales_data["logit_share"]

# Step 6: Define the independent variables (price, promotion, product fixed effects, and product-store fixed effects)
X = sales_data[["price_", "prom_"]]  # Include price and promotion
X = pd.concat([X, product_dummies, product_store_dummies], axis=1)  # Add product and product-store fixed effects
X = sm.add_constant(X)  # Add intercept

# Step 7: Fit the OLS regression model
ols_model = sm.OLS(y, X).fit()

# Step 8: Print the regression results
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:            logit_share   R-squared:                       0.566
Model:                            OLS   Adj. R-squared:                  0.556
Method:                 Least Squares   F-statistic:                     61.10
Date:                Mon, 10 Feb 2025   Prob (F-statistic):               0.00
Time:                        21:42:09   Log-Likelihood:                -34957.
No. Observations:               38544   AIC:                         7.152e+04
Df Residuals:                   37739   BIC:                         7.841e+04
Df Model:                         804                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -6.0934      0.03

In [22]:
#PART D
# estimating a, b, c but with wholesale cost as an instrument 
from statsmodels.sandbox.regression.gmm import IV2SLS  # Import IV regression

# Ensure all necessary variables are numeric
for col in ["price_", "prom_", "logit_share", "cost_"]:
    sales_data[col] = pd.to_numeric(sales_data[col], errors="coerce")

# Drop NaNs in key variables
sales_data = sales_data.dropna(subset=["price_", "prom_", "logit_share", "cost_"])

# -------------------- PART A: IV with Product Characteristics -------------------- #

# Define dependent variable (logit-transformed market share)
y = sales_data["logit_share"]

# Define endogenous regressor (price) and exogenous variable (promotion)
X = sales_data[["price_", "prom_"]]
X = sm.add_constant(X)  # Add intercept

# Define instrument (wholesale cost)
Z_a = sales_data[["cost_", "prom_"]]
Z_a = sm.add_constant(Z_a)  # Add intercept

# Fit IV regression (2SLS)
iv_wcost_a = IV2SLS(y, X, instrument=Z_a).fit()

print("### PART D (a): IV (wholesale cost) Regression Results ###")
print(iv_wcost_a.summary())


# -------------------- PART B: IV with Product Fixed Effects -------------------- #

# Create product fixed effects (brand dummies)
product_dummies = pd.get_dummies(sales_data["brand"], prefix="brand", drop_first=True).astype(int)

# Add product fixed effects to the exogenous and instrument matrices
X_b = pd.concat([X, product_dummies], axis=1)
Z_b = pd.concat([Z_a, product_dummies], axis=1)

# Fit IV regression (2SLS)
iv_wcost_b = IV2SLS(y, X_b, instrument=Z_b).fit()

print("\n### PART D (b): IV (wholesale cost) Regression with Product Fixed Effects ###")
print(iv_wcost_b.summary())


# -------------------- PART C: IV with Product-Store Fixed Effects -------------------- #

# Create product-store fixed effects
sales_data["product_store"] = sales_data["brand"].astype(str) + "_" + sales_data["store"].astype(str)
product_store_dummies = pd.get_dummies(sales_data["product_store"], prefix="prod_store", drop_first=True).astype(int)

# Add product-store fixed effects to the exogenous and instrument matrices
X_c = pd.concat([X_b, product_store_dummies], axis=1)
Z_c = pd.concat([Z_b, product_store_dummies], axis=1)

# Fit IV regression (2SLS)
iv_wcost_c = IV2SLS(y, X_c, instrument=Z_c).fit()

print("\n### PART E(c): IV (wholesale cost) Regression with Product-Store Fixed Effects ###")
print(iv_wcost_c.summary())


### PART D (a): IV (wholesale cost) Regression Results ###
                          IV2SLS Regression Results                           
Dep. Variable:            logit_share   R-squared:                       0.010
Model:                         IV2SLS   Adj. R-squared:                  0.009
Method:                     Two Stage   F-statistic:                     118.1
                        Least Squares   Prob (F-statistic):           7.23e-52
Date:                Mon, 10 Feb 2025                                         
Time:                        21:42:41                                         
No. Observations:               38544                                         
Df Residuals:                   38541                                         
Df Model:                           2                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

  condno = np.sqrt(eigvals[-1]/eigvals[0])
  return np.sqrt(np.diag(self.cov_params()))


                          IV2SLS Regression Results                           
Dep. Variable:            logit_share   R-squared:                       0.555
Model:                         IV2SLS   Adj. R-squared:                  0.545
Method:                     Two Stage   F-statistic:                -7.802e+05
                        Least Squares   Prob (F-statistic):               1.00
Date:                Mon, 10 Feb 2025                                         
Time:                        21:42:56                                         
No. Observations:               38544                                         
Df Residuals:                   37729                                         
Df Model:                         814                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               502.4533   1.73e+0

In [27]:
#PART E 
# estimate a, b, c with Hausman instrument (average price in other markets)
#---------------------------------------------------------------------------#
# part a with hausman instrument 

# Define dependent variable (log market share)
y = sales_data["logit_share"]

# Define product characteristics, price and promo
X = sales_data[["price_", "prom_"]]
X = sm.add_constant(X)  

# Define Hausman instrument (average price in other markets)
Za_hi = sales_data.groupby(["week", "store"])["price_"].transform("mean")
Za_hi = sm.add_constant(Za_hi)

# Fit IV regression (2SLS)
iv_hi_a = IV2SLS(y, X, instrument=Za_hi).fit()

print("### PART E (a): Hausman IV Regression Results ###")
print(iv_hi_a.summary())

### PART E (a): Hausman IV Regression Results ###
                          IV2SLS Regression Results                           
Dep. Variable:            logit_share   R-squared:                  -18324.592
Model:                         IV2SLS   Adj. R-squared:             -18325.543
Method:                     Two Stage   F-statistic:                -9.994e-13
                        Least Squares   Prob (F-statistic):               1.00
Date:                Mon, 10 Feb 2025                                         
Time:                        21:45:46                                         
No. Observations:               38544                                         
Df Residuals:                   38541                                         
Df Model:                           2                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

  return np.sqrt(np.diag(self.cov_params()))


In [28]:
#PART E 
# estimate a, b, c with Hausman instrument (average price in other markets)
#---------------------------------------------------------------------------#
# part b with hausman instrument

# Create product fixed effects (brand dummies)
product_dummies = pd.get_dummies(sales_data["brand"], prefix="brand", drop_first=True).astype(int)

# Add product fixed effects to the exogenous and instrument matrices
X_b = pd.concat([X, product_dummies], axis=1)
Zb_hi = pd.concat([Za_hi, product_dummies], axis=1)

# Fit IV regression (2SLS)
iv_hi_b = IV2SLS(y, X_b, instrument=Zb_hi).fit()

print("\n### PART E (b): Hausman IV Regression with Product Fixed Effects ###")
print(iv_hi_b.summary())


### PART E (b): Hausman IV Regression with Product Fixed Effects ###
                          IV2SLS Regression Results                           
Dep. Variable:            logit_share   R-squared:                   -5711.116
Model:                         IV2SLS   Adj. R-squared:              -5712.895
Method:                     Two Stage   F-statistic:                    0.8832
                        Least Squares   Prob (F-statistic):              0.564
Date:                Mon, 10 Feb 2025                                         
Time:                        21:45:49                                         
No. Observations:               38544                                         
Df Residuals:                   38531                                         
Df Model:                          12                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------



In [29]:
# #PART E 
# estimate a, b, c with Hausman instrument (average price in other markets)
#---------------------------------------------------------------------------#
# part c with hausman instrument

# Create product-store fixed effects
sales_data["product_store"] = sales_data["brand"].astype(str) + "_" + sales_data["store"].astype(str)
product_store_dummies = pd.get_dummies(sales_data["product_store"], prefix="prod_store", drop_first=True).astype(int)

# Add product-store fixed effects to the exogenous and instrument matrices
X_c = pd.concat([X_b, product_store_dummies], axis=1)
Zc_hi = pd.concat([Zb_hi, product_store_dummies], axis=1)

# Fit IV regression (2SLS)
iv_hi_c = IV2SLS(y, X_c, instrument=Zc_hi).fit()

print("\n### PART E(c): IV (wholesale cost) Regression with Product-Store Fixed Effects ###")
print(iv_hi_c.summary())

: 