# Make Data for Market Level analysis 

## Input: Import and Process Text Dispersion Bert data

In [2]:
* Load the BERT text-dispersion data from the pipeline folder.
* The clear option discards any dataset already in memory so this cell can be
* re-run (same convention as the later use command in this notebook); the stray
* trailing blank inside the quoted file name was also removed.
use "../2_pipeline/BERTtext_ent_90_100_2twosteps.dta", clear

In [3]:
* Descriptive statistics for every variable in the file just loaded.
summarize


    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
      market |     10,368    17.80469    13.55121         -1         36
year_month~t |     10,368    45.79948    14.39884          1         64
installer_id |     10,368    18044.93    6707.335        108      23027
text_d_mk~an |      9,057    .1375434    .0235513   .0545641   .3658786
text_d_mkt~d |      8,679    .0617684    .0166544   .0055242   .1519678
-------------+---------------------------------------------------------
text_d_mk~in |      9,057     .035209    .0365696  -1.19e-07   .3658786
text_d_mk~25 |      9,057    .0966523    .0236698   .0545641   .3658786
text_d_mk~50 |      9,057    .1248378    .0254379   .0545641   .3658786
text_d_mk~75 |      9,057    .1652722    .0316213   .0545641   .3694176
text_d_mkt~x |      9,057      .41044    .1597509   .0545641   .7564847
-------------+-------------------------------------------------

In [4]:
* Retain only the market id, the month counter and the text_d_mkt* measures;
* everything else (installer_id in the summary above) is dropped.
keep text_d_mkt* market year_month_count

In [5]:
* Re-check the descriptive statistics after restricting the variable list.
summarize


    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
      market |     10,368    17.80469    13.55121         -1         36
year_month~t |     10,368    45.79948    14.39884          1         64
text_d_mk~an |      9,057    .1375434    .0235513   .0545641   .3658786
text_d_mkt~d |      8,679    .0617684    .0166544   .0055242   .1519678
text_d_mk~in |      9,057     .035209    .0365696  -1.19e-07   .3658786
-------------+---------------------------------------------------------
text_d_mk~25 |      9,057    .0966523    .0236698   .0545641   .3658786
text_d_mk~50 |      9,057    .1248378    .0254379   .0545641   .3658786
text_d_mk~75 |      9,057    .1652722    .0316213   .0545641   .3694176
text_d_mkt~x |      9,057      .41044    .1597509   .0545641   .7564847


In [6]:
* Drop rows that are identical on all remaining variables.
* NOTE(review): this presumably leaves one row per market and year_month_count,
* but nothing here verifies it - consider checking with isid before saving.
duplicates drop


Duplicates in terms of all variables

(8,776 observations deleted)


In [7]:
* Descriptive statistics of the de-duplicated data.
summarize


    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
      market |      1,592    17.85741    11.82617         -1         36
year_month~t |      1,592    38.59799    16.60936          1         64
text_d_mk~an |      1,037    .1392879    .0368229   .0545641   .3658786
text_d_mkt~d |        932    .0551869    .0218395   .0055242   .1519678
text_d_mk~in |      1,037    .0584735    .0438322  -1.19e-07   .3658786
-------------+---------------------------------------------------------
text_d_mk~25 |      1,037    .1049511    .0365321   .0545641   .3658786
text_d_mk~50 |      1,037    .1303665    .0397347   .0545641   .3658786
text_d_mk~75 |      1,037    .1662856    .0477026   .0545641   .3694176
text_d_mkt~x |      1,037    .3050979    .1434346   .0545641   .7564847


## Output: market level text dispersion 

In [7]:
* Write the de-duplicated text-dispersion data to the pipeline folder.
* The replace option allows the notebook to be re-run; without it save stops
* with error r(602) as soon as the file exists from an earlier run.
save "../2_pipeline/BERTtext_ent_90_100_2twosteps_mktlevel.dta", replace

file BERTtext_ent_90_100_2twosteps_mktlevel.dta saved


## Input: Build on the individual-level regression data from step 11

In [15]:
* Replace the data in memory with the individual-level regression file.
use "../3_output/regression_analysis_ind_with_ent_sent_june2020.dta", clear


In [16]:
* Inspect the state variable (type, number of unique values, missing count)
* before state dummies are built from it.
codebook state


---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
state                                                                                                                                                                                                                                               (unlabeled)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                  type:  string (str2)

         unique values:  33                       missing "":  0/8,113

              examples:  "CA"
                         "MA"
                         "NH"
                         "OR

## Data Cleaning: fill in missing winning_quote

In [19]:
* Descriptive statistics for every variable whose name starts with winning.
summarize winning*


    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
winning_qu~t |      8,113    .4794774    1.471995          0         43


In [20]:
* Recode system-missing winning quotes to zero.
* NOTE(review): the summary above lists the variable as winning_qu~t, so its
* full name is longer than what is typed here; this line relies on Stata
* variable-name abbreviation - confirm the full name and spell it out.
replace winning_quote = 0 if winning_quote == .

(0 real changes made)


## Data Cleaning: create state dummies before collapsing the data to the market level (they are weighted after the collapse)

#### weighted state dummy 

In [21]:
* Tabulate state and create one indicator variable per state, named
* dummy_state1, dummy_state2, ... in the order of the table.
tabulate state, generate(dummy_state)


      state |      Freq.     Percent        Cum.
------------+-----------------------------------
         AZ |        170        2.10        2.10
         CA |      1,729       21.31       23.41
         CO |        345        4.25       27.66
         CT |        247        3.04       30.70
         DC |         35        0.43       31.14
         DE |          8        0.10       31.23
         FL |        284        3.50       34.73
         GA |         24        0.30       35.03
         IL |        109        1.34       36.37
         IN |         11        0.14       36.51
         LA |         33        0.41       36.92
         MA |        897       11.06       47.97
         MD |        315        3.88       51.86
         MI |        140        1.73       53.58
         MN |         93        1.15       54.73
         MO |         32        0.39       55.12
         NC |        387        4.77       59.89
         NH |         19        0.23       60.13
         NJ |      

In [22]:
* Number of observations in each market-month cell, stored on every row of the
* cell; used after the collapse to turn summed state dummies into shares.
by market year_month_count, sort: generate count_mkt_year_month = _N

### market level variables to generate 
1. sum of winning quotes 
2. sum of reviews accumulated individually 
3. sum of given quotes 

In [23]:
* Market-month totals, repeated on every row of the cell.
* NOTE(review): winning_quotes is typed with a trailing s here but without it in
* the cleaning step above; both appear to work only through variable-name
* abbreviation - confirm the full variable name.
local mkt_month year_month_count market
egen sum_quotes      = total(winning_quotes), by(`mkt_month')
egen sum_reviews     = total(reviews_ct),     by(`mkt_month')
egen sum_givenquotes = total(quotes_count),   by(`mkt_month')

4. Average of the experience variable (taken as the mean in the collapse step below)

### Collapse into market level 

In [25]:
* Collapse to one row per market-month cell.
*   first : variables taken from the first row of the cell
*           (NOTE(review): assumes they are constant within a cell - confirm)
*   mean  : experience and the temp_own_price* variables
*   sum   : the dummy* indicators, rescaled into shares in the next step
local first_vars sum_quotes sum_reviews sum_givenquotes ent_mkt text_d_mkt_p50 text_d_mkt_mean log_zip_rev avgmean_mkt avg_sent_score_mkt month year count_mkt_year_month
local mean_vars experience temp_own_price*
collapse (first) `first_vars' (mean) `mean_vars' (sum) dummy*, by(year_month_count market)

# After collapsing the data

## Generate weighted state dummy

In [26]:
* Turn each summed state dummy into the share of the cell's observations in
* that state by dividing by the cell size computed before the collapse.
foreach state_share of varlist dummy_state* {
    replace `state_share' = `state_share' / count_mkt_year_month
}


(65 real changes made)
(158 real changes made)
(91 real changes made)
(104 real changes made)
(17 real changes made)
(8 real changes made)
(72 real changes made)
(1 real change made)
(46 real changes made)
(11 real changes made)
(4 real changes made)
(122 real changes made)
(108 real changes made)
(39 real changes made)
(37 real changes made)
(19 real changes made)
(114 real changes made)
(15 real changes made)
(81 real changes made)
(14 real changes made)
(21 real changes made)
(178 real changes made)
(88 real changes made)
(59 real changes made)
(84 real changes made)
(45 real changes made)
(75 real changes made)
(174 real changes made)
(18 real changes made)
(61 real changes made)
(24 real changes made)
(101 real changes made)
(0 real changes made)


## Log-transform the variables that need to be logged

In [27]:
* Remove the rows whose market code is -1.
* NOTE(review): presumably -1 marks observations without an assigned market - confirm.
drop if market == -1

* log(1 + x) so that market-months with a zero total remain defined.
generate log_sum_quotes      = ln(1 + sum_quotes)
generate log_sum_reviews     = ln(1 + sum_reviews)
generate log_sum_givenquotes = ln(1 + sum_givenquotes)


(64 observations deleted)





In [28]:
* Declare the data a panel: market is the panel id and year_month_count the
* time index.
xtset market year_month_count

       panel variable:  market (unbalanced)
        time variable:  year_month_~t, 1 to 64, but with gaps
                delta:  1 unit


In [29]:
* Descriptive statistics of the market-month panel.
summarize


    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
      market |      1,528    18.64725    11.41016          0         36
year_month~t |      1,528     38.8534    16.47747          1         64
  sum_quotes |      1,528    2.388089    6.625488          0         91
 sum_reviews |      1,528    16.71204    37.07434          0        314
sum_givenq~s |      1,528     141.498    410.1673          0       4280
-------------+---------------------------------------------------------
     ent_mkt |        804    .1851868    .2336162          0    1.05492
text_d_mk~50 |        998    .1306127    .0404729   .0545641   .3658786
text_d_mkt~n |        998     .139373    .0374821   .0545641   .3658786
 log_zip_rev |      1,528    8.201697    7.954562          0   22.30267
 avgmean_mkt |      1,027    4.887787    .2379249          3          5
-------------+-------------------------------------------------

* Writes a copy of the market-level data to the current working directory,
* overwriting any existing file of that name.
* NOTE(review): this runs before the variable labels below are applied and uses a
* different file name and folder than the final save - confirm it is still needed.
save regression_analysis_mkt_with_ent_sent_jan18.dta, replace

##  label variables  

In [30]:
* Attach display labels to the market-level variables.
* NOTE(review): text_d_mkt_mean and text_d_mkt_p50 share one label, as do
* log_sum_givenquotes and log_sum_quotes - presumably intentional so that
* alternative measures print identically in tables; confirm.

* Ratings and sentiment
label var ent_mkt            "Rating_Entropy_Mkt"
label var avgmean_mkt        "Average_Rating_Mkt"
label var avg_sent_score_mkt "Average_Sentiment_Mkt"

* Text dispersion
label var text_d_mkt_mean "Text_Dispersion_Mkt"
label var text_d_mkt_p50  "Text_Dispersion_Mkt"

* Market size and activity
label var log_zip_rev         "Market_LogRevenue"
label var log_sum_givenquotes "Market Transaction"
label var log_sum_quotes      "Market Transaction"

* Installer characteristics averaged within the market-month
label var experience                "Experience"
label var temp_own_price_diff_other "Price_Difference"

## Output: market level data for regression 

In [31]:
* Write the labelled market-level regression data to the output folder.
* The replace option allows the notebook to be re-run; without it save stops
* with error r(602) as soon as the file exists from an earlier run.
save "../3_output/regression_analysis_mkt_with_ent_sent_june9.dta", replace

file ../3_output/regression_analysis_mkt_with_ent_sent_june9.dta saved
