In [69]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels.panel.data import PanelData
from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects, compare
import matplotlib.pyplot as plt

In [70]:
# Load the source table; the file is Shift-JIS encoded and its first column
# is used as the row index.
df = pd.read_csv(
    "../data/merged/len4_ndb_agg_DropNever.csv",
    encoding="shift-jis",
    index_col=0,
)

In [71]:
## processing
import re

# Column labels of the loaded frame.
df_columns = df.columns

# Regex that keeps any column whose name contains "id", plus columns named
# exactly elasped_m0..elasped_m5 or elasped_0..elasped_20.
pattern = r'id|^elasped_(m[0-5]|[0-9]|1\d|20)$'

# Collect the matching column names, preserving their order in the frame.
column_matcher = re.compile(pattern)
column_names = list(filter(column_matcher.search, df_columns))

# Show the matched column names.
print(column_names)

# Prepend the extra columns that the pattern does not select.
rest_columns = ["year", "generic_per", "後発品区分"]
column_names = [*rest_columns, *column_names]
column_names

['id', 'id_l4', 'elasped_m5', 'elasped_m4', 'elasped_m3', 'elasped_m2', 'elasped_m1', 'elasped_0', 'elasped_1', 'elasped_2', 'elasped_3', 'elasped_4', 'elasped_5', 'elasped_6', 'elasped_7', 'elasped_8', 'elasped_9', 'elasped_10', 'elasped_11', 'elasped_12', 'elasped_13', 'elasped_14', 'elasped_15', 'elasped_16', 'elasped_17', 'elasped_18', 'elasped_19', 'elasped_20', 'id_1145.0', 'id_1147.0', 'id_1149.0', 'id_1219.0', 'id_1231.0', 'id_1242.0', 'id_1249.0', 'id_1319.0', 'id_1324.0', 'id_1329.0', 'id_2119.0', 'id_2189.0', 'id_2229.0', 'id_2234.0', 'id_2239.0', 'id_2249.0', 'id_2259.0', 'id_2290.0', 'id_2319.0', 'id_2325.0', 'id_2329.0', 'id_2359.0', 'id_2399.0', 'id_2454.0', 'id_2459.0', 'id_2529.0', 'id_2590.0', 'id_2646.0', 'id_2649.0', 'id_2655.0', 'id_2659.0', 'id_3136.0', 'id_3214.0', 'id_3327.0', 'id_3399.0', 'id_3919.0', 'id_3992.0', 'id_3999.0', 'id_4413.0', 'id_4490.0', 'id_6250.0', 'id_6290.0', 'id_7990.0']


['year',
 'generic_per',
 '後発品区分',
 'id',
 'id_l4',
 'elasped_m5',
 'elasped_m4',
 'elasped_m3',
 'elasped_m2',
 'elasped_m1',
 'elasped_0',
 'elasped_1',
 'elasped_2',
 'elasped_3',
 'elasped_4',
 'elasped_5',
 'elasped_6',
 'elasped_7',
 'elasped_8',
 'elasped_9',
 'elasped_10',
 'elasped_11',
 'elasped_12',
 'elasped_13',
 'elasped_14',
 'elasped_15',
 'elasped_16',
 'elasped_17',
 'elasped_18',
 'elasped_19',
 'elasped_20',
 'id_1145.0',
 'id_1147.0',
 'id_1149.0',
 'id_1219.0',
 'id_1231.0',
 'id_1242.0',
 'id_1249.0',
 'id_1319.0',
 'id_1324.0',
 'id_1329.0',
 'id_2119.0',
 'id_2189.0',
 'id_2229.0',
 'id_2234.0',
 'id_2239.0',
 'id_2249.0',
 'id_2259.0',
 'id_2290.0',
 'id_2319.0',
 'id_2325.0',
 'id_2329.0',
 'id_2359.0',
 'id_2399.0',
 'id_2454.0',
 'id_2459.0',
 'id_2529.0',
 'id_2590.0',
 'id_2646.0',
 'id_2649.0',
 'id_2655.0',
 'id_2659.0',
 'id_3136.0',
 'id_3214.0',
 'id_3327.0',
 'id_3399.0',
 'id_3919.0',
 'id_3992.0',
 'id_3999.0',
 'id_4413.0',
 'id_4490.0',
 'id_625

In [72]:
# Add log-transformed outcome columns to the full frame first, so that the
# sub-samples derived below carry them as well.
df["log_quantity"] = np.log(df["総計"])
df["log_revenue"] = np.log(df["revenue"])

# (entity, time) keys used as the panel index.
panel_keys = ["id", "year"]

# Split on the 後発品区分 flag: 0 -> branded sample, 1 -> generic sample.
# Boolean .loc selection returns new frames, so `df` itself is not filtered.
branded_rows = df.loc[df["後発品区分"] == 0]
generic_rows = df.loc[df["後発品区分"] == 1]

# Wrap each sample as linearmodels PanelData indexed by (id, year).
# NOTE: `df` is rebound to a PanelData object here, so this cell cannot be
# re-run without first reloading `df` from the CSV.
df_branded = PanelData(branded_rows.set_index(panel_keys), convert_dummies=False)
df_generic = PanelData(generic_rows.set_index(panel_keys), convert_dummies=False)
df = PanelData(df.set_index(panel_keys), convert_dummies=False)

In [81]:
# all data
# Regressors: the 後発品区分 flag, generic_per, the elasped_m5..elasped_m1
# columns, and elasped_0..elasped_15, plus time and entity effects.
# (presumably the "m" prefix marks negative elapsed periods — TODO confirm)
elapsed_m_terms = [f"elasped_m{k}" for k in range(5, 0, -1)]
elapsed_n_terms = [f"elasped_{k}" for k in range(16)]
rhs_terms = [
    "後発品区分",
    "generic_per",
    *elapsed_m_terms,
    *elapsed_n_terms,
    "TimeEffects",
    "EntityEffects",
]
formula = "log_quantity~ " + "+".join(rhs_terms)

# Fit on the full panel; rank checking is disabled and regressors absorbed
# by the effects are dropped rather than raising.
mod_fe = PanelOLS.from_formula(
    formula,
    data=df,
    check_rank=False,
    drop_absorbed=True,
)
# Standard errors clustered by entity.
res_fe = mod_fe.fit(cov_type="clustered", cluster_entity=True)

# Print only the parameter-estimates table of the summary.
print(res_fe.summary.tables[1])

                              Parameter Estimates                              
             Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------
後発品区分          -0.7953     0.2850    -2.7909     0.0054     -1.3549     -0.2356
generic_per     0.2916     0.0024     121.35     0.0000      0.2869      0.2963
elasped_m5     -0.0791     0.0976    -0.8102     0.4181     -0.2707      0.1126
elasped_m4     -0.0046     0.0689    -0.0666     0.9469     -0.1399      0.1307
elasped_m3     -0.0466     0.0795    -0.5857     0.5583     -0.2027      0.1096
elasped_m2      0.0323     0.0836     0.3865     0.6992     -0.1319      0.1966
elasped_m1      0.0974     0.1042     0.9343     0.3505     -0.1073      0.3020
elasped_0       0.0079     0.1562     0.0508     0.9595     -0.2988      0.3147
elasped_1       0.0335     0.1120     0.2996     0.7646     -0.1864      0.2535
elasped_2       0.0179     0.1151     0.

In [89]:
# generic data
# Regressors: an explicit intercept, generic_per, the elasped_m5..elasped_m1
# columns, and elasped_0..elasped_10, plus time and entity effects.
# NOTE(review): the recorded output of this cell warns that generic_per is
# fully absorbed by the effects in this sample and is dropped from the
# regression — consider whether it belongs in this specification.
elapsed_m_terms = [f"elasped_m{k}" for k in range(5, 0, -1)]
elapsed_n_terms = [f"elasped_{k}" for k in range(11)]
rhs_terms = [
    "1",
    "generic_per",
    *elapsed_m_terms,
    *elapsed_n_terms,
    "TimeEffects",
    "EntityEffects",
]
formula = "log_quantity~" + "+".join(rhs_terms)

# Fit on the generic-only panel; absorbed regressors are dropped, not fatal.
mod_fe = PanelOLS.from_formula(
    formula,
    data=df_generic,
    check_rank=False,
    drop_absorbed=True,
)
# Standard errors clustered by entity.
res_fe = mod_fe.fit(cov_type="clustered", cluster_entity=True)

# Print only the parameter-estimates table of the summary.
print(res_fe.summary.tables[1])

                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      19.053     0.0615     309.57     0.0000      18.932      19.174
elasped_m5    -0.2578     0.1736    -1.4855     0.1386     -0.5996      0.0840
elasped_m4    -0.0479     0.1208    -0.3963     0.6922     -0.2858      0.1900
elasped_m3    -0.1449     0.1422    -1.0189     0.3092     -0.4249      0.1351
elasped_m2    -0.0232     0.1617    -0.1432     0.8863     -0.3417      0.2953
elasped_m1     0.0511     0.1819     0.2811     0.7788     -0.3071      0.4094
elasped_0      0.0163     0.2321     0.0703     0.9440     -0.4408      0.4734
elasped_1      0.0821     0.1847     0.4444     0.6571     -0.2816      0.4457
elasped_2      0.0345     0.1839     0.1875     0.8514     -0.3277      0.3967
elasped_3     -0.0658     0.1537    -0.4280     0.66

Variables have been fully absorbed and have removed from the regression:

generic_per

  res_fe=mod_fe.fit(cov_type='clustered', cluster_entity=True)


In [103]:
# branded data
# Regressors: the elasped_m5..elasped_m1 columns and elasped_0..elasped_10,
# plus time and entity effects (no intercept term, no generic_per).
elapsed_m_terms = [f"elasped_m{k}" for k in range(5, 0, -1)]
elapsed_n_terms = [f"elasped_{k}" for k in range(11)]
rhs_terms = [
    *elapsed_m_terms,
    *elapsed_n_terms,
    "TimeEffects",
    "EntityEffects",
]
formula = "log_quantity~ " + "+".join(rhs_terms)

# Fit on the branded-only panel; absorbed regressors are dropped, not fatal.
mod_fe = PanelOLS.from_formula(
    formula,
    data=df_branded,
    check_rank=False,
    drop_absorbed=True,
)
# Standard errors clustered by entity.
res_fe = mod_fe.fit(cov_type="clustered", cluster_entity=True)

# Print the full estimation summary (fit statistics and parameter table).
print(res_fe)

                          PanelOLS Estimation Summary                           
Dep. Variable:           log_quantity   R-squared:                        0.0437
Estimator:                   PanelOLS   R-squared (Between):             -0.0117
No. Observations:                 343   R-squared (Within):              -0.0733
Date:                Thu, Aug 17 2023   R-squared (Overall):             -0.0127
Time:                        13:18:56   Log-likelihood                    62.265
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      0.7919
Entities:                          43   P-value                           0.6946
Avg Obs:                       7.9767   Distribution:                  F(16,277)
Min Obs:                       7.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             2.3990
                            