In [1]:
#extension_visualization
import pandas as pd
import numpy as np
import matplotlib as plt  
from linearmodels import PanelOLS
import statsmodels.api as sm

from econtools import read, outreg, table_statrow, write_notes
import econtools as econ
import econtools.metrics as mt

#!pip install pylatex
#!pip install pylatex[matrices]
from econtools import read, outreg, table_statrow, write_notes
from pylatex import Document, Section, Subsection, Tabular, Math, TikZ, Axis, \
    Plot, Figure, Matrix, Alignat
from pylatex.utils import italic
import os

df = pd.read_stata('data/Authority.dta')

#construct work category dummy
df['OG03_dummy'] = 0
df.loc[(df['work_category']=='OG03')&(df['work_category']!=''),'OG03_dummy'] = 1

df['OG01_dummy'] = 0
df.loc[(df['work_category']=='OG01')&(df['work_category']!=''),'OG01_dummy'] = 1

df['OG_rest_dummy'] = 0
df.loc[(df['OG01_dummy']!=1)&(df['OG03_dummy']!=1)&(df['work_category']!=''),'OG_rest_dummy'] = 1

df['OG_dummy'] = 0
df.loc[df['work_category'].str[0:2] == 'OG','OG_dummy'] = 1

df['OS_dummy'] = 0
df.loc[df['work_category'].str[0:2] == 'OS','OS_dummy'] = 1

#treated vs controls
df['trend'] = df['year'] - 1999

df['trend_treat'] = df['trend']
df.loc[(df['authority_code']!=3090272)&(df['authority_code']!=3070001),'trend_treat'] = 0
#15225 real change made / no zeros in df['trend']
#print(df['trend_treat'].value_counts()) chekced

df['trend_control'] = df['trend']
df.loc[(df['authority_code']==3090272)|(df['authority_code']==3070001),'trend_control'] = 0
#902 real change made
#print(df['trend_control'].value_counts()) checked

#PA specifics
df = df.sort_values(by='authority_code',ascending=True)
#auth = econ.group_id(df, cols = 'authority_code')
#print(auth) #dataframe

auth_list = df['authority_code'].values.tolist()
auth_list = list(set(auth_list))
#겹치는 부분 = authority_code야
#id_auth = group_id + 1 if df의 auth code == auth의 code

df['id_auth'] = 0
for i in range(len(df)):
    for j in range(len(auth_list)):
        if df.loc[i,'authority_code'] == auth_list[j]:
            df.loc[i,'id_auth'] = j+1

work_dum = pd.get_dummies(df['work_category'])
year_dum = pd.get_dummies(df['year'])
work_list = list(work_dum.columns)
year_list = list(year_dum.columns)

df_dum = pd.concat([year_dum, work_dum],axis = 1)
df = pd.concat([df, df_dum],axis = 1)

In [2]:
#table2 col 2 for all outcomes
#f.write r2 and N
#talbe 2 col2 fianl version
#to make iterable
#스엉공..
df_reg_co = df[(df['turin_co_sample']==1)&(df['ctrl_exp_turin_co_sample']==1)&(df['post_experience']>= 5) & (df['pre_experience']>=5) &(df['post_experience'].isnull() == False ) & (df['pre_experience'].isnull()==False)&(df['missing']==0)]
outcome = ['discount', 'delay_ratio', 'overrun_ratio', 'days_to_award']

# mutlicollinearity function
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

#iteration
for o in outcome:
    #idx = df_reg_co[df_reg_co[o].isnull()==True].index
    #df_name = df_reg_co.drop(idx)
    
    #vif cal
    #first, make a column list
    reg_col = []
    for i in work_list:
        reg_col.append(i)
    for j in year_list:
        reg_col.append(j)
    exog_var = ['fpsb_auction','id_auth','reserve_price','municipality']
    exog = exog_var + reg_col 


    #check multicollinearity
    X = df_reg_co.loc[:,exog]
    vif = calc_vif(X)
    #print(vif)


    #delete from col list
    for i in range(len(vif)):
        if np.isnan(vif.loc[i, 'VIF']) == True:
            reg_col.remove(vif.loc[i, 'variables'])
        elif vif.loc[i,'VIF'] > 10:
            for j in exog_var:
                if str(vif.loc[i,'variables']) is j and vif.loc[i,'variables'] is not 'fpsb_auction' and vif.loc[i,'variables'] is not 'id_auth':
                    exog_var.remove(vif.loc[i,'variables'])
                
    exog = exog_var + reg_col
    exog.remove('id_auth')
    exog.remove(2000)
    exog.remove('OG01')
        #exog.remove('municipality')

    if o == 'discount':
        fe_reg_discount = mt.reg(df_reg_co, o, exog, fe_name = 'authority_code', cluster = 'auth_anno')
    elif o == 'delay_ratio':
        fe_reg_delay = mt.reg(df_reg_co, o, exog, fe_name = 'authority_code', cluster = 'auth_anno')
    elif o == 'overrun_ratio':
        exog.remove('OS07')
        fe_reg_overrun = mt.reg(df_reg_co, o, exog, fe_name = 'authority_code', cluster = 'auth_anno',check_colinear = True)
    else :
        exog.remove('OG04')
        exog.remove('OS05')
        exog.remove('OS07')
        exog.remove('OS11')
        exog.remove('OS26')
        fe_reg_award = mt.reg(df_reg_co, o, exog, fe_name = 'authority_code', cluster = 'auth_anno',check_colinear = True)

result_path = 'my_result.tex'
with open(result_path,'w') as f:
    reg = (fe_reg_discount, fe_reg_delay, fe_reg_overrun, fe_reg_award )
    table_string = outreg(reg, ['fpsb_auction'], ['First Price Auction'], digits = 3)
    #table_string += '\\\ \n'
    table_string += table_statrow("R$^2$", [x.r2 for x in reg], digits =3)
    table_string += table_statrow("Number of Observation", [x.N for x in reg])
        #table_string += table_statrwo("Stdard Error",[iterable])
    f.write(table_string + '\n')

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


In [8]:
table_list = table_string.split('&')
#type = list
#table_list = [i.lsplit(' ', -1)[0] for i in table_list]

AttributeError: 'str' object has no attribute 'lsplit'

In [11]:
table_list2 = [i.split('\\\\ \n',1)[0] for i in table_list]
#print(table_list2)
#문제는 R제곱같은 게 같이 없어짐
list_fpa =[]
list_r2 =[]
list_N = []
for i in range(len(table_list2)):
    if i<5:
        list_fpa.append(table_list2[i])
    elif i>=5 and i<9:
        list_r2.append(table_list2[i])
    else:
        list_N.append(table_list2[i])
table_series = pd.Series(list_fpa, list_r2, list_N)

TypeError: data type not understood

In [None]:
table_series =pd.Series(table_string)
#tb_2 = table_series.str.split('\\\ \n')
#tb_2 = table_series.str.split('\\\\')
tb_2 = table_series.str.split('&')
#tb_2.to_dict()

In [None]:
print(tb_2.values)

In [None]:
df_table = pd.DataFrame({'table2':[tb_2.keys], 'table2_val':[tb_2.values]})
df_table.head()

In [None]:
# df
df_string = pd.DataFrame({'table2':[table_string]})
#df['strip'] = df['table2'].str.strip()
#df['lstrip'] = df['table2'].str.lstrip()
#df['rstrip'] = df['table2'].str.rstrip()
df[[df[1]] = df['table2'].str.split('&', n=3, expand=True)
# = df['table2'].str.partition(sep='&')
df
