# Solution for computing market level entropy

## Part 1 - making the ''inventory'' of reviews (per market, month)
### inputs:
- es_market_id ( month- market-installerid)
- reviews ( installerid-reviewid-timestamp) 
### process
- Combine es_market_id and reviews to find, for each market and each month, the *set* of reviews present on the market, and record the list of their review ids.
### output :
- inventory file 
- obs is on installer-mkt-month level. 
- every obs
    - identifiers(installer id, market, year_month_count)
    - list of their own reviews id at that time 
    - list of the other reviews id at that time 
    - list of all reviews ids ( own and others) 


In [1]:
pwd

'C:\\Users\\Herbie Huang\\Google Drive\\Current_SolarResearch\\working_june2020\\solar_project\\1_code'

## Input: market levels from step 3 
 

In [3]:
import pandas as pd 
# Load the market-level panel from the pipeline folder; the cells below use its
# installer_id, market and year_month_count columns.
temp_es_market=pd.read_csv('../2_pipeline/es_marketlevel90_100_2two_step.csv')

In [4]:
temp_es_market['year_month_count'].describe()

count    8113.000000
mean       43.258228
std        14.543701
min         1.000000
25%        34.000000
50%        46.000000
75%        55.000000
max        64.000000
Name: year_month_count, dtype: float64

In [5]:
# Keep one row per distinct (installer_id, market, year_month_count) combination.
es_market_ids=temp_es_market[['installer_id','market','year_month_count']].drop_duplicates()

In [6]:
es_market_ids.describe()

Unnamed: 0,installer_id,market,year_month_count
count,8113.0,8113.0,8113.0
mean,17971.731912,17.784667,43.258228
std,6707.006717,13.486967,14.543701
min,108.0,-1.0,1.0
25%,20131.0,4.0,34.0
50%,20627.0,17.0,46.0
75%,21350.0,33.0,55.0
max,23027.0,36.0,64.0



import pandas as pd 
# NOTE(review): this cell has no execution counter and rebinds es_market_ids
# from a CSV in the working directory, replacing the frame derived from
# temp_es_market earlier in the notebook -- presumably a cached copy of it;
# confirm before running.
es_market_ids=pd.read_csv('es_market_ids.csv')

## input: raw reviews data : installer_review_data_20180410.csv

In [7]:
# Raw review export: comma separated, with a backslash as the escape character.
reviews=pd.read_csv('../0_data/Lock_ES_RawData/installer_review_data_20180410.csv',sep=',',escapechar='\\')


In [8]:
# Keep only the identifiers and the creation timestamp. The explicit copy makes
# `reviews` an independent frame, so the column assignments below never write
# into a slice of the raw table (avoids pandas' SettingWithCopyWarning).
reviews=reviews[['id','installer_id','date_created']].copy()
reviews['date_created']=pd.to_datetime(reviews['date_created'])
# Vectorised datetime accessors instead of a Python-level lambda per row.
reviews['year']=reviews['date_created'].dt.year
reviews['month']=reviews['date_created'].dt.month
# Running month counter: January 2013 -> 1, January 2014 -> 13, and so on.
reviews['year_month_count']=12*(reviews['year']-2013)+reviews['month']

In [9]:
es_market_ids.head()

Unnamed: 0,installer_id,market,year_month_count
0,21283,0.0,35.0
1,21283,0.0,36.0
2,21283,0.0,37.0
3,21283,0.0,38.0
4,21283,0.0,39.0


In [10]:
es_market_ids[es_market_ids['installer_id']==20292]

Unnamed: 0,installer_id,market,year_month_count


In [11]:
# Distinct installer-market pairs, ignoring the month dimension.
es_market_ids_no_time=es_market_ids[['installer_id','market']].drop_duplicates()

We need to assign review ids to markets. Right now they are assigned to installers. 

In [12]:
es_market_ids_no_time.describe()


Unnamed: 0,installer_id,market
count,385.0,385.0
mean,19452.335065,16.976623
std,5830.528403,13.278938
min,108.0,-1.0
25%,20552.0,4.0
50%,21347.0,17.0
75%,21838.0,31.0
max,23027.0,36.0


In [12]:
# Attach a market to every review through its installer. The left join keeps
# reviews whose installer has no market; dropna() then removes those rows
# (along with any other row that holds a missing value). If an installer is
# listed in several markets, each of its reviews appears once per market.
reviews=pd.merge(left=reviews,right=es_market_ids_no_time,on=['installer_id'],how='left').dropna()

In [13]:
reviews.head()

Unnamed: 0,id,installer_id,date_created,year,month,year_month_count,market
1,2,556,2013-02-18 20:03:23,2013,2,2,36.0
2,3,857,2013-03-19 00:51:14,2013,3,3,34.0
4,5,857,2013-04-26 22:52:20,2013,4,4,34.0
5,6,20193,2013-06-07 15:33:22,2013,6,6,20.0
8,9,6507,2013-07-04 01:36:06,2013,7,7,36.0


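Loop: 
- loop over markets, slicing es_market_ids down to the current market
- for that slice, loop over its months
    - in month K, find all reviews posted in the market up to and including month K -> list_reviews_mkt
    - for each installer seen in the market up to month K, find its own reviews up to month K -> list_reviews_self
    - the remaining reviews in the market (those of the other installers) -> list_reviews_others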

In [14]:
# Build one inventory record per (market, month, installer).
# For each market and each month m observed in it, every installer seen in that
# market in or before m gets a record with three Series of review ids, all
# restricted to reviews created in or before month m:
#   - every review in the market,
#   - the installer's own reviews,
#   - the reviews of all other installers.
results_collection=[]
for market in set(es_market_ids.market):
    print('market:'+str(market))
    temp1=es_market_ids[es_market_ids['market']==market]
    # Filter the reviews of this market once instead of once per installer.
    reviews_in_market=reviews[reviews['market']==market]
    for m in set(temp1.year_month_count):
        print('month: '+str(m))
        # Installers are cumulative: anyone present in the market in or before m.
        installers=set(temp1.loc[temp1['year_month_count']<=m,'installer_id'])
        # The market-wide list depends only on (market, m), so it is computed
        # here and shared by every installer record of this month. The records
        # only read it (it is converted with .tolist() afterwards).
        reviews_so_far=reviews_in_market[reviews_in_market['year_month_count']<=m]
        list_reviews_mkt=reviews_so_far.id
        for i in installers:
            is_own=reviews_so_far['installer_id']==i
            list_reviews_self=reviews_so_far.loc[is_own,'id']
            list_reviews_others=reviews_so_far.loc[~is_own,'id']
            temp_to_collect=[market,m,i,list_reviews_mkt,list_reviews_self,list_reviews_others]
            results_collection.append(temp_to_collect)
        

market:0.0
month: 35.0
month: 36.0
month: 37.0
month: 38.0
month: 39.0
month: 40.0
month: 41.0
month: 42.0
month: 43.0
month: 44.0
month: 45.0
month: 46.0
month: 47.0
month: 48.0
month: 49.0
month: 50.0
month: 51.0
month: 52.0
month: 53.0
month: 54.0
month: 55.0
month: 56.0
month: 57.0
month: 58.0
month: 59.0
month: 60.0
month: 61.0
month: 62.0
month: 63.0
month: 64.0
market:1.0
month: 7.0
month: 8.0
month: 9.0
month: 10.0
month: 11.0
month: 12.0
month: 13.0
month: 14.0
month: 15.0
month: 16.0
month: 17.0
month: 18.0
month: 19.0
month: 20.0
month: 21.0
month: 22.0
month: 23.0
month: 24.0
month: 25.0
month: 26.0
month: 27.0
month: 28.0
month: 29.0
month: 30.0
month: 31.0
month: 32.0
month: 33.0
month: 34.0
month: 35.0
month: 36.0
month: 37.0
month: 38.0
month: 39.0
month: 40.0
month: 41.0
month: 42.0
month: 43.0
month: 44.0
month: 45.0
month: 46.0
month: 47.0
month: 48.0
month: 49.0
month: 50.0
month: 51.0
month: 52.0
month: 53.0
month: 54.0
month: 55.0
month: 56.0
month: 57.0
month: 58

In [15]:
results_collection[8886][5].tolist() #results_collection collect the right results. 

[2,
 9,
 11,
 12,
 15,
 19,
 20,
 95,
 116,
 127,
 143,
 154,
 157,
 219,
 227,
 228,
 235,
 249,
 263,
 266,
 322,
 331,
 332,
 341,
 370,
 373,
 383,
 402,
 438,
 440,
 446,
 447,
 448,
 454,
 461,
 483]

In [16]:
# Turn the three pandas Series of every record into plain Python lists,
# leaving market, month and installer id untouched.
results_collection2=[
    [market,month,installer,mkt_ids.tolist(),self_ids.tolist(),others_ids.tolist()]
    for market,month,installer,mkt_ids,self_ids,others_ids in results_collection
]

In [17]:
# Assemble the records into the inventory table: one row per
# market-month-installer with its three review-id lists.
inventory_columns=['market','year_month_count','installer_id','reviewidlist_mkt','reviewidlist_self','reviewidlist_others']
reviewsid_mkt_month_individual_inventory=pd.DataFrame(results_collection2,columns=inventory_columns)

In [18]:
reviewsid_mkt_month_individual_inventory.tail()

Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others
10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."


## Output: review_inventory : 2_pipeline/reviewsid_mkt_month_individual_inventory_jan17_2020.csv 

In [19]:
# Persist the inventory. The row index is written as well, which is why the
# file shows an extra 'Unnamed: 0' column when it is read back in Part 2; the
# id lists are stored as text.
reviewsid_mkt_month_individual_inventory.to_csv('../2_pipeline/reviewsid_mkt_month_individual_inventory_jan17_2020.csv')

# Part 2 Find the cosine distance based entropy 

Use the inventory dataframe produced in part 1 

In [13]:
import pandas as pd
# Reload the Part 1 inventory from disk; the id-list columns come back as strings.
inventory=pd.read_csv('../2_pipeline/reviewsid_mkt_month_individual_inventory_jan17_2020.csv')

## Input: read the distance file from folder

In [14]:

# Pairwise distance table: each row holds two review ids (vec_review_id_1,
# vec_review_id_2) and their cosine_distance.
distances_pairs=pd.read_csv('../3_output/ALL_BERT_distances_pairwise_dec30.csv')
distances_pairs.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vec_review_id_1,vec_review_id_2,cosine_distance
count,5700000.0,5700000.0,5700000.0,5700000.0,5700000.0
mean,49999.5,49999.5,1121.237,2259.326,0.1532048
std,28867.52,28867.52,785.6149,798.4089,0.0765878
min,0.0,0.0,1.0,2.0,-1.192093e-07
25%,24999.75,24999.75,454.0,1694.0,0.09990977
50%,49999.5,49999.5,989.0,2396.0,0.1349529
75%,74999.25,74999.25,1683.0,2934.0,0.187068
max,99999.0,99999.0,3134.0,3396.0,0.7962324


### Define a helper function get_distances that returns the distances between all pairs of reviews in a list

In [22]:
def get_distances(temp_inv, pairs=None):
    """Return the cosine distances between the reviews listed in *temp_inv*.

    A row of the pairwise table is selected when both of its review ids are in
    *temp_inv*, so the result covers every stored pair inside that set.

    Args:
        temp_inv: iterable of review ids.
        pairs: DataFrame with columns ``vec_review_id_1``, ``vec_review_id_2``
            and ``cosine_distance``. Defaults to the notebook-level
            ``distances_pairs`` table.

    Returns:
        The ``cosine_distance`` column (a Series) of the selected rows; empty
        when no stored pair lies inside *temp_inv*.
    """
    if pairs is None:
        pairs = distances_pairs
    # Materialise the ids once so any iterable works and both membership tests
    # reuse the same collection.
    ids = list(temp_inv)
    both_in = pairs.vec_review_id_1.isin(ids) & pairs.vec_review_id_2.isin(ids)
    return pairs.loc[both_in, 'cosine_distance']
    

## Input: Read the inventory data from Part 1 

In [15]:
# NOTE(review): this reads the same file as the cell at the top of Part 2 and
# rebinds `inventory` to the freshly loaded frame.
inventory=pd.read_csv('../2_pipeline/reviewsid_mkt_month_individual_inventory_jan17_2020.csv')
inventory.head()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others
0,0,0.0,35.0,21283,[],[],[]
1,1,0.0,36.0,21283,[],[],[]
2,2,0.0,37.0,21283,[],[],[]
3,3,0.0,38.0,21283,[],[],[]
4,4,0.0,39.0,21283,[],[],[]


Look at the inventory. For every row, take reviewidlist_mkt and, if it is not empty, compute the median (and the other summary statistics) of the pairwise distances among its reviews.

inventory[['text_d_mkt_min','text_d_mkt_p25','text_d_mkt_p50','text_d_mkt_p75','text_d_mkt_max']]=0


In [24]:
inventory.tail()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others
10363,10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10364,10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10365,10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10366,10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10367,10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."


for i, row in df.iterrows():
      ifor_val = something  
      if <condition>:  
        ifor_val = something_else  
      df.at[i,'ifor'] = ifor_val  

## Make the reduced version so we don't have to encounter too many distances lookups

### Reduced mkt level list 

In [25]:
# Distinct market-level id lists, renumbered from 0, so that each list is
# summarised only once.
inventory_reduced_mkt=inventory[['reviewidlist_mkt']].drop_duplicates().reset_index(drop=True)

In [26]:
inventory_reduced_mkt.head()

Unnamed: 0,reviewidlist_mkt
0,[]
1,"[503, 510]"
2,"[503, 510, 853, 854, 855, 856, 863, 866]"
3,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ..."
4,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ..."


### Reduced self level list

In [27]:
# Distinct own-review id lists, renumbered from 0.
inventory_reduced_self = inventory[['reviewidlist_self']].drop_duplicates().reset_index(drop=True)
inventory_reduced_self.tail()

Unnamed: 0,reviewidlist_self
1234,"[2854, 2910, 2914, 2954, 3072]"
1235,[3064]
1236,"[778, 2837, 3119, 3201]"
1237,"[151, 166, 182, 1111, 1122, 1150, 1157, 1158, ..."
1238,"[2854, 2910, 2914, 2954, 3072, 3266, 3298]"


### Reduced others level list 

In [28]:
# NOTE(review): unlike the mkt and self versions, no drop_duplicates() is
# applied here, so this frame keeps one row per inventory row and the summary
# loop later recomputes statistics for repeated lists. Duplicates are only
# removed after the statistics have been merged back. Deduplicating here would
# also shift the positional .iloc[10334] lookups used in later cells.
inventory_reduced_others=inventory[['reviewidlist_others']].reset_index(drop=True) 
inventory_reduced_others.tail()

Unnamed: 0,reviewidlist_others
10363,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10364,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10365,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10366,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10367,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."


In [29]:
type(inventory_reduced_others.iloc[10334].reviewidlist_others)

str

## Calculate and store the summary stats for every reduced-level id list

### Calculate and store the summary stats for the reduced market lists 

In [42]:
# Summary statistics of the pairwise distances inside every distinct market list.
# Each collected row is [index, mean, std, min, p25, p50, p75, max].
stat_labels=['mean','std','min','25%','50%','75%','max']
inventory_reduced_mkt_list=[]
for index, row in inventory_reduced_mkt.iterrows():
    # The lists are stored as text; '[]' (two characters) is an empty list.
    if len(row.reviewidlist_mkt)<=2:
        continue
    temp=[int(i) for i in row.reviewidlist_mkt.strip('[').strip(']').split(',')]
    # A single review has no pair to measure.
    if len(temp)<=1:
        continue
    temp_distances=get_distances(temp)
    stats=temp_distances.describe(percentiles=[0.25,0.5,0.75])
    # Select by label: using integer keys as positions on a label-indexed
    # Series is deprecated in pandas.
    inventory_reduced_mkt_list.append([index]+stats[stat_labels].tolist())
    if index%20==0:
        print('we processed {} rows for mkt level'.format(index))
 



we processed 20 rows for mkt level
we processed 40 rows for mkt level
we processed 60 rows for mkt level
we processed 80 rows for mkt level
we processed 100 rows for mkt level
we processed 120 rows for mkt level
we processed 140 rows for mkt level
we processed 160 rows for mkt level
we processed 180 rows for mkt level
we processed 200 rows for mkt level
we processed 220 rows for mkt level
we processed 240 rows for mkt level
we processed 260 rows for mkt level
we processed 280 rows for mkt level
we processed 300 rows for mkt level
we processed 320 rows for mkt level
we processed 340 rows for mkt level
we processed 360 rows for mkt level
we processed 380 rows for mkt level
we processed 400 rows for mkt level
we processed 420 rows for mkt level
we processed 440 rows for mkt level
we processed 460 rows for mkt level
we processed 500 rows for mkt level
we processed 520 rows for mkt level
we processed 540 rows for mkt level


In [43]:
inventory_reduced_mkt_list[1]

[2,
 0.13148378900119237,
 0.05357383618628571,
 0.06895774602890015,
 0.09243164956569673,
 0.11075007915496826,
 0.14962449669837952,
 0.24646145105361936]

In [44]:
stats

count    11600.000000
mean         0.157125
std          0.085060
min          0.000000
25%          0.100182
50%          0.131768
75%          0.185321
max          0.596568
Name: cosine_distance, dtype: float64

## Output: text dispersion for market level 

In [45]:
# Column layout mirrors the collected rows: the position of the list in
# inventory_reduced_mkt followed by the seven summary statistics.
col_name=[
    'index',
    'text_d_mkt_mean','text_d_mkt_std','text_d_mkt_min',
    'text_d_mkt_p25','text_d_mkt_p50','text_d_mkt_p75','text_d_mkt_max',
]
inventory_reduced_mkt_df=pd.DataFrame(inventory_reduced_mkt_list,columns=col_name)
inventory_reduced_mkt_df.to_csv('../2_pipeline/inventory_reduced_mkt_sumstats_jan18_2020.csv')

### Calculate and store the important stats for the reduced self lists

In [46]:

# Summary statistics of the pairwise distances inside every distinct own-review list.
# Each collected row is [index, mean, std, min, p25, p50, p75, max].
stat_labels=['mean','std','min','25%','50%','75%','max']
inventory_reduced_self_list=[]
for index, row in inventory_reduced_self.iterrows():
    # The lists are stored as text; '[]' (two characters) is an empty list.
    if len(row.reviewidlist_self)<=2:
        continue
    temp=[int(i) for i in row.reviewidlist_self.strip('[').strip(']').split(',')]
    # A single review has no pair to measure.
    if len(temp)<=1:
        continue
    temp_distances=get_distances(temp)
    stats=temp_distances.describe(percentiles=[0.25,0.5,0.75])
    # Select by label: using integer keys as positions on a label-indexed
    # Series is deprecated in pandas.
    inventory_reduced_self_list.append([index]+stats[stat_labels].tolist())
    if index%100==0:
        print('we processed {} rows for self level'.format(index))



we processed 100 rows for self level
we processed 200 rows for self level
we processed 300 rows for self level
we processed 400 rows for self level
we processed 500 rows for self level
we processed 600 rows for self level
we processed 900 rows for self level
we processed 1000 rows for self level
we processed 1100 rows for self level
we processed 1200 rows for self level


## Output: text dispersion for Installer level Own Dispersion 

In [None]:


# Column layout mirrors the collected rows: the position of the list in
# inventory_reduced_self followed by the seven summary statistics.
col_name=[
    'index',
    'text_d_self_mean','text_d_self_std','text_d_self_min',
    'text_d_self_p25','text_d_self_p50','text_d_self_p75','text_d_self_max',
]
inventory_reduced_self_df=pd.DataFrame(inventory_reduced_self_list,columns=col_name)
inventory_reduced_self_df.to_csv('../2_pipeline/inventory_reduced_self_sumstats_jan18_2020.csv')

In [48]:
inventory_reduced_self_df.head()

Unnamed: 0,index,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
0,1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
1,2,0.104398,0.024051,0.068958,0.088068,0.099304,0.120322,0.14949
2,3,0.12496,0.045977,0.055537,0.091352,0.111885,0.149483,0.211835
3,4,0.188121,0.056137,0.125027,0.165909,0.206791,0.219667,0.232544
4,5,0.163721,0.062319,0.09876,0.106348,0.165909,0.216008,0.232544


### Calculate and store the important stats for the reduced other lists

In [49]:
# Summary statistics of the pairwise distances inside every others-review list.
# Each collected row is [index, mean, std, min, p25, p50, p75, max].
stat_labels=['mean','std','min','25%','50%','75%','max']
inventory_reduced_others_list=[]
for index, row in inventory_reduced_others.iterrows():
    # The lists are stored as text; '[]' (two characters) is an empty list.
    if len(row.reviewidlist_others)<=2:
        continue
    temp=[int(i) for i in row.reviewidlist_others.strip('[').strip(']').split(',')]
    # A single review has no pair to measure.
    if len(temp)<=1:
        continue
    temp_distances=get_distances(temp)
    stats=temp_distances.describe(percentiles=[0.25,0.5,0.75])
    # Select by label: using integer keys as positions on a label-indexed
    # Series is deprecated in pandas.
    inventory_reduced_others_list.append([index]+stats[stat_labels].tolist())
    if index%100==0:
        print('we processed {} rows for other level'.format(index))


we processed 300 rows for other level
we processed 500 rows for other level
we processed 600 rows for other level
we processed 700 rows for other level
we processed 800 rows for other level
we processed 900 rows for other level
we processed 1000 rows for other level
we processed 1100 rows for other level
we processed 1200 rows for other level
we processed 1400 rows for other level
we processed 1500 rows for other level
we processed 1600 rows for other level
we processed 1700 rows for other level
we processed 1800 rows for other level
we processed 1900 rows for other level
we processed 2000 rows for other level
we processed 2100 rows for other level
we processed 2200 rows for other level
we processed 2300 rows for other level
we processed 2400 rows for other level
we processed 2500 rows for other level
we processed 2600 rows for other level
we processed 2700 rows for other level
we processed 2800 rows for other level
we processed 3100 rows for other level
we processed 3200 rows for othe

## Output: text dispersion for installer level others dispersion 

In [50]:


# Column layout mirrors the collected rows: the position of the list in
# inventory_reduced_others followed by the seven summary statistics.
col_name=[
    'index',
    'text_d_others_mean','text_d_others_std','text_d_others_min',
    'text_d_others_p25','text_d_others_p50','text_d_others_p75','text_d_others_max',
]
inventory_reduced_others_df=pd.DataFrame(inventory_reduced_others_list,columns=col_name)
inventory_reduced_others_df.to_csv('../2_pipeline/inventory_reduced_others_sumstats_jan18_2020.csv')

In [51]:
inventory_reduced_others_df.tail()

Unnamed: 0,index,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
8734,10363,0.162572,0.087613,0.0,0.102881,0.137298,0.195393,0.596568
8735,10364,0.159029,0.08613,0.0,0.100628,0.133164,0.189239,0.596568
8736,10365,0.157757,0.08547,0.0,0.100275,0.132494,0.18663,0.596568
8737,10366,0.156887,0.085166,0.0,0.10034,0.131543,0.183596,0.596568
8738,10367,0.155964,0.086166,0.0,0.098928,0.128723,0.184136,0.596568


# Part 3 merge the produced summary stats to inventory data 

## first make sure the reduced version has the summary stats by merging them back 

In [52]:
inventory.head()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others
0,0,0.0,35.0,21283,[],[],[]
1,1,0.0,36.0,21283,[],[],[]
2,2,0.0,37.0,21283,[],[],[]
3,3,0.0,38.0,21283,[],[],[]
4,4,0.0,39.0,21283,[],[],[]


### ent of others 

In [53]:
inventory_reduced_others.tail()

Unnamed: 0,reviewidlist_others
10363,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10364,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10365,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10366,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."
10367,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1..."


In [54]:
# Expose the row label as a column so the statistics can be joined on it.
inventory_reduced_others['index']=inventory_reduced_others.index

In [55]:
inventory_reduced_others_df.tail()

Unnamed: 0,index,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
8734,10363,0.162572,0.087613,0.0,0.102881,0.137298,0.195393,0.596568
8735,10364,0.159029,0.08613,0.0,0.100628,0.133164,0.189239,0.596568
8736,10365,0.157757,0.08547,0.0,0.100275,0.132494,0.18663,0.596568
8737,10366,0.156887,0.085166,0.0,0.10034,0.131543,0.183596,0.596568
8738,10367,0.155964,0.086166,0.0,0.098928,0.128723,0.184136,0.596568


In [56]:
# Left join on 'index': lists for which no statistics were computed keep their
# row, with missing values in the statistic columns.
inventory_reduced_others_merged=pd.merge(left=inventory_reduced_others,right=inventory_reduced_others_df,on='index',how='left')

In [57]:
inventory_reduced_others_merged.tail(5)

Unnamed: 0,reviewidlist_others,index,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
10363,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",10363,0.162572,0.087613,0.0,0.102881,0.137298,0.195393,0.596568
10364,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",10364,0.159029,0.08613,0.0,0.100628,0.133164,0.189239,0.596568
10365,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",10365,0.157757,0.08547,0.0,0.100275,0.132494,0.18663,0.596568
10366,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",10366,0.156887,0.085166,0.0,0.10034,0.131543,0.183596,0.596568
10367,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",10367,0.155964,0.086166,0.0,0.098928,0.128723,0.184136,0.596568


In [58]:
type(inventory_reduced_others_merged.iloc[10334].reviewidlist_others)

str

### ent of self


In [59]:
# Expose the row label as a column so the statistics can be joined on it.
inventory_reduced_self['index']=inventory_reduced_self.index

In [60]:
inventory_reduced_self.head(15)

Unnamed: 0,reviewidlist_self,index
0,[],0
1,"[503, 510]",1
2,"[853, 854, 855, 856, 863, 866]",2
3,"[853, 854, 855, 856, 863, 866, 955, 960]",3
4,"[503, 510, 1538]",4
5,"[503, 510, 1538, 2760]",5
6,[250],6
7,"[250, 277, 288]",7
8,"[250, 277, 288, 531, 535]",8
9,"[803, 806, 808]",9


In [61]:
inventory_reduced_self_df.head()

Unnamed: 0,index,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
0,1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
1,2,0.104398,0.024051,0.068958,0.088068,0.099304,0.120322,0.14949
2,3,0.12496,0.045977,0.055537,0.091352,0.111885,0.149483,0.211835
3,4,0.188121,0.056137,0.125027,0.165909,0.206791,0.219667,0.232544
4,5,0.163721,0.062319,0.09876,0.106348,0.165909,0.216008,0.232544


In [62]:
# Left join on 'index': lists for which no statistics were computed keep their
# row, with missing values in the statistic columns.
inventory_reduced_self_merged=pd.merge(left=inventory_reduced_self,right=inventory_reduced_self_df,on='index',how='left')

In [63]:
inventory_reduced_self_merged.head()

Unnamed: 0,reviewidlist_self,index,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
0,[],0,,,,,,,
1,"[503, 510]",1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
2,"[853, 854, 855, 856, 863, 866]",2,0.104398,0.024051,0.068958,0.088068,0.099304,0.120322,0.14949
3,"[853, 854, 855, 856, 863, 866, 955, 960]",3,0.12496,0.045977,0.055537,0.091352,0.111885,0.149483,0.211835
4,"[503, 510, 1538]",4,0.188121,0.056137,0.125027,0.165909,0.206791,0.219667,0.232544


### ent of market

In [64]:
# Expose the row label as a column so the statistics can be joined on it.
inventory_reduced_mkt['index']=inventory_reduced_mkt.index

In [65]:
inventory_reduced_mkt.head()

Unnamed: 0,reviewidlist_mkt,index
0,[],0
1,"[503, 510]",1
2,"[503, 510, 853, 854, 855, 856, 863, 866]",2
3,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",3
4,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",4


In [66]:
inventory_reduced_mkt_df.head()

Unnamed: 0,index,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max
0,1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
1,2,0.131484,0.053574,0.068958,0.092432,0.11075,0.149624,0.246461
2,3,0.143781,0.060078,0.055537,0.092818,0.129108,0.197542,0.28504
3,4,0.141036,0.057699,0.055537,0.093159,0.125027,0.195216,0.28504
4,5,0.135878,0.056901,0.055537,0.092755,0.116766,0.186694,0.28504


In [67]:
# Left join on 'index': lists for which no statistics were computed keep their
# row, with missing values in the statistic columns.
inventory_reduced_mkt_merged=pd.merge(left=inventory_reduced_mkt,right=inventory_reduced_mkt_df,on='index',how='left')

In [68]:
inventory_reduced_mkt_merged.head()

Unnamed: 0,reviewidlist_mkt,index,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max
0,[],0,,,,,,,
1,"[503, 510]",1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
2,"[503, 510, 853, 854, 855, 856, 863, 866]",2,0.131484,0.053574,0.068958,0.092432,0.11075,0.149624,0.246461
3,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",3,0.143781,0.060078,0.055537,0.092818,0.129108,0.197542,0.28504
4,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",4,0.141036,0.057699,0.055537,0.093159,0.125027,0.195216,0.28504


## final merging? 

In [69]:
inventory.head()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others
0,0,0.0,35.0,21283,[],[],[]
1,1,0.0,36.0,21283,[],[],[]
2,2,0.0,37.0,21283,[],[],[]
3,3,0.0,38.0,21283,[],[],[]
4,4,0.0,39.0,21283,[],[],[]


In [70]:
inventory.describe()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id
count,10368.0,10368.0,10368.0,10368.0
mean,5183.5,17.804688,45.799479,18044.925637
std,2993.12813,13.551211,14.398838,6707.334838
min,0.0,-1.0,1.0,108.0
25%,2591.75,4.0,37.0,20131.0
50%,5183.5,17.0,49.0,20663.0
75%,7775.25,33.0,57.0,21389.0
max,10367.0,36.0,64.0,23027.0


### 1. Merge on mkt level ents measures 

In [71]:
inventory_reduced_mkt_merged.head()

Unnamed: 0,reviewidlist_mkt,index,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max
0,[],0,,,,,,,
1,"[503, 510]",1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
2,"[503, 510, 853, 854, 855, 856, 863, 866]",2,0.131484,0.053574,0.068958,0.092432,0.11075,0.149624,0.246461
3,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",3,0.143781,0.060078,0.055537,0.092818,0.129108,0.197542,0.28504
4,"[503, 510, 853, 854, 855, 856, 863, 866, 955, ...",4,0.141036,0.057699,0.055537,0.093159,0.125027,0.195216,0.28504


In [72]:
# Drop the helper 'index' key; the id-list text itself is the join key for the
# merge back onto the inventory.
inventory_reduced_mkt_merged=inventory_reduced_mkt_merged.drop(['index'],axis=1).drop_duplicates()

In [73]:
# Attach the market-level statistics to every inventory row by matching on the
# market id-list text; rows without statistics get missing values.
final_text_ent=pd.merge(left=inventory,right=inventory_reduced_mkt_merged,on='reviewidlist_mkt',how='left')

In [74]:
final_text_ent.describe()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max
count,10368.0,10368.0,10368.0,10368.0,9057.0,8679.0,9057.0,9057.0,9057.0,9057.0,9057.0
mean,5183.5,17.804688,45.799479,18044.925637,0.137543,0.061768,0.03520904,0.096652,0.124838,0.165272,0.41044
std,2993.12813,13.551211,14.398838,6707.334838,0.023551,0.016654,0.03656955,0.02367,0.025438,0.031621,0.159751
min,0.0,-1.0,1.0,108.0,0.054564,0.005524,-1.192093e-07,0.054564,0.054564,0.054564,0.054564
25%,2591.75,4.0,37.0,20131.0,0.131075,0.055473,0.0,0.086404,0.115659,0.154873,0.29864
50%,5183.5,17.0,49.0,20663.0,0.135117,0.064879,0.03152883,0.092818,0.120884,0.162674,0.42046
75%,7775.25,33.0,57.0,21389.0,0.143472,0.071107,0.04966545,0.098796,0.127784,0.176718,0.52669
max,10367.0,36.0,64.0,23027.0,0.365879,0.151968,0.3658786,0.365879,0.365879,0.369418,0.756485


### 2. Merge on self level ents measure 


In [75]:
inventory_reduced_self_merged.head()

Unnamed: 0,reviewidlist_self,index,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
0,[],0,,,,,,,
1,"[503, 510]",1,0.232544,,0.232544,0.232544,0.232544,0.232544,0.232544
2,"[853, 854, 855, 856, 863, 866]",2,0.104398,0.024051,0.068958,0.088068,0.099304,0.120322,0.14949
3,"[853, 854, 855, 856, 863, 866, 955, 960]",3,0.12496,0.045977,0.055537,0.091352,0.111885,0.149483,0.211835
4,"[503, 510, 1538]",4,0.188121,0.056137,0.125027,0.165909,0.206791,0.219667,0.232544


In [76]:
# Drop the helper 'index' key; the id-list text itself is the join key for the
# merge back onto the inventory.
inventory_reduced_self_merged=inventory_reduced_self_merged.drop(['index'],axis=1).drop_duplicates()

In [77]:
# Attach the own-review statistics by matching on the own id-list text; rows
# without statistics get missing values.
final_text_ent=pd.merge(left=final_text_ent,right=inventory_reduced_self_merged,on='reviewidlist_self',how='left')

In [78]:
final_text_ent.tail()

Unnamed: 0.1,Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,...,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
10363,10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,...,0.131768,0.185321,0.596568,0.098136,0.022455,0.048601,0.083058,0.095586,0.113595,0.154108
10364,10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,...,0.131768,0.185321,0.596568,0.106721,0.020954,0.061483,0.097076,0.109613,0.120701,0.14292
10365,10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,...,0.131768,0.185321,0.596568,0.122692,,0.122692,0.122692,0.122692,0.122692,0.122692
10366,10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,...,0.131768,0.185321,0.596568,0.167972,0.074041,0.074705,0.114051,0.171353,0.221196,0.257659
10367,10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,...,0.131768,0.185321,0.596568,0.196974,0.028975,0.160524,0.173285,0.201712,0.216017,0.2333


In [79]:
# Remove the 'Unnamed: 0' column, which is the row index that was saved with
# the inventory CSV and read back as an ordinary column.
final_text_ent=final_text_ent.drop(columns=['Unnamed: 0'])

In [80]:
final_text_ent.tail()

Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.098136,0.022455,0.048601,0.083058,0.095586,0.113595,0.154108
10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.106721,0.020954,0.061483,0.097076,0.109613,0.120701,0.14292
10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.122692,,0.122692,0.122692,0.122692,0.122692,0.122692
10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.167972,0.074041,0.074705,0.114051,0.171353,0.221196,0.257659
10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.196974,0.028975,0.160524,0.173285,0.201712,0.216017,0.2333


### 3. Merge on others level ents measure

In [81]:
type(final_text_ent.iloc[10334].reviewidlist_others)

str

In [82]:
# Drop the helper 'index' key and collapse repeated rows: the others-level
# frame was not deduplicated when it was built, so repeated lists are only
# removed at this point.
inventory_reduced_others_merged=inventory_reduced_others_merged.drop(['index'],axis=1).drop_duplicates()

In [83]:
final_text_ent.describe()

Unnamed: 0,market,year_month_count,installer_id,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
count,10368.0,10368.0,10368.0,9057.0,8679.0,9057.0,9057.0,9057.0,9057.0,9057.0,5313.0,4096.0,5313.0,5313.0,5313.0,5313.0,5313.0
mean,17.804688,45.799479,18044.925637,0.137543,0.061768,0.03520904,0.096652,0.124838,0.165272,0.41044,0.136149,0.048521,0.08346311,0.111118,0.133084,0.160149,0.204667
std,13.551211,14.398838,6707.334838,0.023551,0.016654,0.03656955,0.02367,0.025438,0.031621,0.159751,0.049345,0.031815,0.05088104,0.045195,0.051664,0.066405,0.097369
min,-1.0,1.0,108.0,0.054564,0.005524,-1.192093e-07,0.054564,0.054564,0.054564,0.054564,0.0,0.002974,-1.192093e-07,0.0,0.0,0.0,0.0
25%,4.0,37.0,20131.0,0.131075,0.055473,0.0,0.086404,0.115659,0.154873,0.29864,0.10464,0.027264,0.05186743,0.081767,0.100148,0.117574,0.130816
50%,17.0,49.0,20663.0,0.135117,0.064879,0.03152883,0.092818,0.120884,0.162674,0.42046,0.125779,0.039913,0.07166195,0.103619,0.121747,0.144627,0.185836
75%,33.0,57.0,21389.0,0.143472,0.071107,0.04966545,0.098796,0.127784,0.176718,0.52669,0.15852,0.06103,0.1017198,0.127261,0.153204,0.189867,0.262202
max,36.0,64.0,23027.0,0.365879,0.151968,0.3658786,0.365879,0.365879,0.369418,0.756485,0.381214,0.204801,0.3812145,0.381214,0.381214,0.430115,0.604839


In [84]:
# Show the last rows of the table as it stands before the others-level merge.
final_text_ent.tail()

Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,text_d_self_mean,text_d_self_std,text_d_self_min,text_d_self_p25,text_d_self_p50,text_d_self_p75,text_d_self_max
10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.098136,0.022455,0.048601,0.083058,0.095586,0.113595,0.154108
10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.106721,0.020954,0.061483,0.097076,0.109613,0.120701,0.14292
10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.122692,,0.122692,0.122692,0.122692,0.122692,0.122692
10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.167972,0.074041,0.074705,0.114051,0.171353,0.221196,0.257659
10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,0.131768,0.185321,0.596568,0.196974,0.028975,0.160524,0.173285,0.201712,0.216017,0.2333


In [85]:
# Left-join the others-level statistics onto the main table. Rows are matched
# on 'reviewidlist_others'; every row of final_text_ent is kept, and rows
# without a match get NaN in the added columns.
final_text_ent = final_text_ent.merge(
    inventory_reduced_others_merged,
    how='left',
    on=['reviewidlist_others'],
)
final_text_ent.tail()

Unnamed: 0,market,year_month_count,installer_id,reviewidlist_mkt,reviewidlist_self,reviewidlist_others,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,...,text_d_self_p50,text_d_self_p75,text_d_self_max,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
10363,-1.0,64.0,21350,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[342, 343, 344, 345, 346, 347, 348, 349, 350, ...","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,...,0.095586,0.113595,0.154108,0.162572,0.087613,0.0,0.102881,0.137298,0.195393,0.596568
10364,-1.0,64.0,21862,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[1234, 1235, 1239, 1241, 1259, 2223]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,...,0.109613,0.120701,0.14292,0.159029,0.08613,0.0,0.100628,0.133164,0.189239,0.596568
10365,-1.0,64.0,22375,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2082, 2959]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,...,0.122692,0.122692,0.122692,0.157757,0.08547,0.0,0.100275,0.132494,0.18663,0.596568
10366,-1.0,64.0,22773,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[2915, 2921, 2926, 3001]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,...,0.171353,0.221196,0.257659,0.156887,0.085166,0.0,0.10034,0.131543,0.183596,0.596568
10367,-1.0,64.0,21118,"[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...","[200, 202, 210, 329]","[23, 55, 67, 69, 76, 80, 82, 88, 89, 90, 92, 1...",0.157125,0.08506,0.0,0.100182,...,0.201712,0.216017,0.2333,0.155964,0.086166,0.0,0.098928,0.128723,0.184136,0.596568


In [86]:
# Save the full merged table (review-id list columns still included) as a
# CSV; the bare filename means it lands in the current working directory.
final_text_ent.to_csv('final_text_ent_jan18_2020.csv')

In [87]:
# Summary statistics after the merge; the text_d_others_* columns are now
# part of the table.
final_text_ent.describe()

Unnamed: 0,market,year_month_count,installer_id,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,...,text_d_self_p50,text_d_self_p75,text_d_self_max,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
count,10368.0,10368.0,10368.0,9057.0,8679.0,9057.0,9057.0,9057.0,9057.0,9057.0,...,5313.0,5313.0,5313.0,8739.0,8375.0,8739.0,8739.0,8739.0,8739.0,8739.0
mean,17.804688,45.799479,18044.925637,0.137543,0.061768,0.03520904,0.096652,0.124838,0.165272,0.41044,...,0.133084,0.160149,0.204667,0.13736,0.061849,0.03537682,0.096515,0.124777,0.164885,0.409735
std,13.551211,14.398838,6707.334838,0.023551,0.016654,0.03656955,0.02367,0.025438,0.031621,0.159751,...,0.051664,0.066405,0.097369,0.023529,0.016769,0.03722176,0.023383,0.025637,0.031449,0.160333
min,-1.0,1.0,108.0,0.054564,0.005524,-1.192093e-07,0.054564,0.054564,0.054564,0.054564,...,0.0,0.0,0.0,0.054564,0.00326,-1.192093e-07,0.054031,0.054564,0.054564,0.054564
25%,4.0,37.0,20131.0,0.131075,0.055473,0.0,0.086404,0.115659,0.154873,0.29864,...,0.100148,0.117574,0.130816,0.130334,0.055468,0.0,0.086241,0.115349,0.154134,0.292546
50%,17.0,49.0,20663.0,0.135117,0.064879,0.03152883,0.092818,0.120884,0.162674,0.42046,...,0.121747,0.144627,0.185836,0.135117,0.064664,0.03070414,0.092501,0.120633,0.162342,0.419871
75%,33.0,57.0,21389.0,0.143472,0.071107,0.04966545,0.098796,0.127784,0.176718,0.52669,...,0.153204,0.189867,0.262202,0.143384,0.071254,0.04961014,0.098924,0.128522,0.175471,0.52669
max,36.0,64.0,23027.0,0.365879,0.151968,0.3658786,0.365879,0.365879,0.369418,0.756485,...,0.381214,0.430115,0.604839,0.372957,0.159244,0.3729565,0.372957,0.372957,0.372957,0.756485


In [88]:
# Build the condensed table: same rows, but without the three review-id
# list columns, leaving the identifiers and the text_d_* statistics.
final_text_ent_condensed = final_text_ent.drop(
    columns=['reviewidlist_mkt', 'reviewidlist_self', 'reviewidlist_others'],
)

In [89]:
# Summary statistics of the condensed table (review-id list columns removed).
final_text_ent_condensed.describe()

Unnamed: 0,market,year_month_count,installer_id,text_d_mkt_mean,text_d_mkt_std,text_d_mkt_min,text_d_mkt_p25,text_d_mkt_p50,text_d_mkt_p75,text_d_mkt_max,...,text_d_self_p50,text_d_self_p75,text_d_self_max,text_d_others_mean,text_d_others_std,text_d_others_min,text_d_others_p25,text_d_others_p50,text_d_others_p75,text_d_others_max
count,10368.0,10368.0,10368.0,9057.0,8679.0,9057.0,9057.0,9057.0,9057.0,9057.0,...,5313.0,5313.0,5313.0,8739.0,8375.0,8739.0,8739.0,8739.0,8739.0,8739.0
mean,17.804688,45.799479,18044.925637,0.137543,0.061768,0.03520904,0.096652,0.124838,0.165272,0.41044,...,0.133084,0.160149,0.204667,0.13736,0.061849,0.03537682,0.096515,0.124777,0.164885,0.409735
std,13.551211,14.398838,6707.334838,0.023551,0.016654,0.03656955,0.02367,0.025438,0.031621,0.159751,...,0.051664,0.066405,0.097369,0.023529,0.016769,0.03722176,0.023383,0.025637,0.031449,0.160333
min,-1.0,1.0,108.0,0.054564,0.005524,-1.192093e-07,0.054564,0.054564,0.054564,0.054564,...,0.0,0.0,0.0,0.054564,0.00326,-1.192093e-07,0.054031,0.054564,0.054564,0.054564
25%,4.0,37.0,20131.0,0.131075,0.055473,0.0,0.086404,0.115659,0.154873,0.29864,...,0.100148,0.117574,0.130816,0.130334,0.055468,0.0,0.086241,0.115349,0.154134,0.292546
50%,17.0,49.0,20663.0,0.135117,0.064879,0.03152883,0.092818,0.120884,0.162674,0.42046,...,0.121747,0.144627,0.185836,0.135117,0.064664,0.03070414,0.092501,0.120633,0.162342,0.419871
75%,33.0,57.0,21389.0,0.143472,0.071107,0.04966545,0.098796,0.127784,0.176718,0.52669,...,0.153204,0.189867,0.262202,0.143384,0.071254,0.04961014,0.098924,0.128522,0.175471,0.52669
max,36.0,64.0,23027.0,0.365879,0.151968,0.3658786,0.365879,0.365879,0.369418,0.756485,...,0.381214,0.430115,0.604839,0.372957,0.159244,0.3729565,0.372957,0.372957,0.372957,0.756485


## Output: the merged text dispersion data 

In [90]:
# Write the condensed text-dispersion table to the pipeline folder that sits
# next to the code directory ('../2_pipeline'), the same folder the market
# input at the top of the notebook is read from.
final_text_ent_condensed.to_csv('../2_pipeline/BERTtext_ent_90_100_2steps.csv')