## The realm of possibility on VarageSale is more like MineCraft than Airbnb.

#### TODO:

- adapt sql for other network types
- are inweight and outweight useful?

#### Edge Types

- Transaction
- Message
- Praise
- Comment
- Interest

In [180]:
# initializing

import pandas as pd
import pandas_gbq as pgbq
from multiprocessing import Pool
from scipy import stats
import os
import time
import numpy as np

# Folder holding the SQL modules for one network (edge) type. The folder's
# base name ("transaction") is reused by later cells as the table-name prefix.
# NOTE(review): absolute local path; consider making this configurable.
directory = '/Users/joshharris/community_health/sql/cumulative_networks/transaction'

# Module names are the query file names without their extension. Only ".txt"
# files are kept because bigquery_worker opens "<module>.txt"; this also skips
# stray entries such as ".DS_Store". Sorted so module indices are stable
# between runs (os.listdir returns entries in arbitrary order).
sql_modules = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(directory)
    if file_name.endswith('.txt')
)

# BigQuery project and dataset that the cached network tables live in.
google_project_id = 'solid-ridge-104914'
dataset = 'community_networks'

In [181]:
# worker function

def bigquery_worker(i):
    """Load one network module's table from BigQuery, building it if needed.

    The table is named ``<dataset>.<network type>_<module>``, where the network
    type is the base name of ``directory`` and the module is ``sql_modules[i]``.
    If reading that table fails, the module's query file
    (``<directory>/<module>.txt``) is executed and its result is written to the
    table, replacing any existing contents.

    Parameters
    ----------
    i : int
        Index into the module-level ``sql_modules`` list.

    Returns
    -------
    DataFrame
        The table contents, either as read or as freshly generated.
    """
    sql_module_type = os.path.basename(directory)
    sql_module = sql_modules[i]
    # Build the qualified name once so the read and the write target the same table.
    table_name = dataset+'.'+sql_module_type+'_'+sql_module
    print('Starting module ',i,': ',sql_module, sep='')
    try:
        result = pgbq.read_gbq('select * from '+table_name, google_project_id, dialect='standard', verbose=False)
        print('\n\n',sql_module_type,'_',sql_module,' table found.\n\n', sep='')
    except Exception:
        # Any failure to read is treated as "table missing". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and SystemExit
        # stop the worker instead of triggering a rebuild.
        print('\n\n',sql_module_type,'_',sql_module,' table does not exist, generating now\n\n', sep='')
        with open(directory+'/'+sql_module+'.txt') as query_file:
            query = query_file.read()
        result = pgbq.read_gbq(query, google_project_id, dialect='standard', verbose=False)
        print('\n\nCreating new ',sql_module_type,'_',sql_module,' table\n\n', sep='')
        pgbq.to_gbq(result, table_name, google_project_id, if_exists='replace', verbose=False)
        print('\n\nModule ',i,': ',sql_module,' query complete\n\n', sep='')
    return result

def add_network_stats(df):
    """Add derived network columns to ``df`` in place and return it.

    Reads the ``edges``, ``nodes``, ``avg_indegree`` and ``avg_outdegree``
    columns and writes three new columns:

    - ``network_density_X_100``: edges divided by ``nodes*(nodes-1)/2``, times 100
    - ``avg_degree``: ``2*edges/nodes``
    - ``indegree_skew``: ``(avg_indegree-avg_outdegree)/avg_outdegree``

    The same DataFrame object that was passed in is returned.
    """
    # Fetch every input column first so a missing column raises before any
    # new column has been written.
    edge_count, node_count, mean_in, mean_out = (
        df[column]
        for column in ('edges', 'nodes', 'avg_indegree', 'avg_outdegree')
    )

    possible_pairs = node_count * (node_count - 1) / 2
    df['network_density_X_100'] = (edge_count / possible_pairs) * 100
    df['avg_degree'] = 2 * edge_count / node_count
    df['indegree_skew'] = (mean_in - mean_out) / mean_out
    return df
        

In [274]:
#Run network sql modules and join with community facts

if __name__ == '__main__':
    # One worker process per SQL module; pool.map returns the module tables
    # in the same order as sql_modules.
    with Pool(processes=len(sql_modules)) as pool:
        result = pool.map(bigquery_worker, range(len(sql_modules)))
        pool.close()
        pool.join()


# Derive the network type from the module folder, the same way bigquery_worker
# does, so this cell does not rely on a variable left over in the kernel.
network_type = os.path.basename(directory)

joined = pgbq.read_gbq('select * from '+dataset+'.community_facts_main', google_project_id, dialect='standard', verbose=False).set_index('cid', drop=False)
print('\n\nJoining ',network_type,' modules to community facts\n\n', sep='')
# Left-join each module table onto the community facts by community id;
# overlapping column names on the left side get the '_left' suffix.
for module_table in result:
    joined = joined.join(module_table.set_index('cid'), how='left', lsuffix='_left')
transaction_main = add_network_stats(joined)
# replace() returns a new frame unless inplace=True; without it the infinities
# produced by divide-by-zero in the derived columns were never removed.
transaction_main.replace([np.inf,-np.inf], np.nan, inplace=True)
transaction_main.fillna(0, inplace=True)
# Keep only communities with at least 1000 MAU.
transaction_main.drop(transaction_main[transaction_main.MAU < 1000].index, inplace=True)
print('\n\nDone\n\n', sep='')


Starting module 0: edge_weight
Starting module 2: indegree
Starting module 1: edges
Starting module 3: outdegree


transaction_edge_weight table found.




transaction_edges table found.




transaction_indegree table found.




transaction_outdegree table found.




Joining transaction modules to community facts




Done




In [275]:
# Cross correlate community facts and output key correlations

# Every column after the first two is treated as a variable to correlate.
# The slice runs to the end of the column list; the previous upper bound,
# len(transaction_main), was the row count and only worked by accident.
# NOTE(review): skipping columns by position is fragile; confirm the first
# two columns are the identifiers meant to be excluded.
variables = list(transaction_main)[2:]

transaction_corr_matrix = pd.DataFrame(index=variables, columns=variables)
key_rows = []
# Holds the reversed "b,a" label of each reported pair so the mirror image
# is not reported a second time.
corr_checked = set()

for row_var in variables:
    for col_var in variables:
        pair = row_var+','+col_var
        rho, pval = stats.spearmanr(transaction_main[[row_var, col_var]])
        # .at replaces DataFrame.set_value, which has been removed from pandas.
        transaction_corr_matrix.at[row_var, col_var] = rho
        # Key correlations: |rho| > 0.5, excluding near-perfect positive ones
        # (which also drops each variable's correlation with itself).
        if pair not in corr_checked and rho < 0.99 and (rho > 0.5 or rho < -0.5):
            key_rows.append([pair, rho])
            corr_checked.add(col_var+','+row_var)

# Build the key-correlation table in one go instead of appending row by row.
transaction_key_correlations = pd.DataFrame(key_rows, columns=['pair','coefficient'])

transaction_corr_matrix.reset_index(inplace=True)
transaction_key_correlations.sort_values(by='coefficient', ascending=False, inplace=True)
transaction_key_correlations.reset_index(drop=True, inplace=True)

#Save to BigQuery
#pgbq.to_gbq(transaction_corr_matrix, 'community_networks.transaction_corr_matrix_over_1000', google_project_id, if_exists='replace')
#pgbq.to_gbq(transaction_key_correlations, 'community_networks.transaction_key_correlations_over_1000', google_project_id, if_exists='replace')
transaction_key_correlations

Unnamed: 0,pair,coefficient
0,"items_sold_june_2017,new_listings",0.930475
1,"m1_total_retention,m3_total_retention",0.928410
2,"mau_june_2017,edges",0.914929
3,"mau_change_2017,paid_signups_june",0.892353
4,"nodes,mau_june_2017",0.848194
5,"edges,avg_degree",0.828217
6,"MAU,new_listings",0.826805
7,"mau_change_2017,total_signups_june",0.797043
8,"paid_signups_june,total_signups_june",0.792780
9,"nodes,edges",0.771169


In [260]:
len(transaction_main)

690

In [276]:
transaction_corr_matrix

Unnamed: 0,index,age,m1_total_retention,m3_total_retention,MAU,nodes,items_sold_june_2017,items_sold_change_2017,inventory,mau_june_2017,...,total_signups_june,w4_paid_retention,w4_total_retention,avg_weight,edges,avg_indegree,avg_outdegree,network_density_X_100,avg_degree,indegree_skew
0,age,1.0,0.115816,0.211225,0.303041,0.258061,0.458624,-0.130657,0.25722,0.104252,...,-0.248252,-0.129096,0.211102,0.229637,0.180934,0.026712,0.427431,-0.195843,0.0150115,-0.469924
1,m1_total_retention,0.115816,1.0,0.92841,0.45853,0.0838592,0.467759,0.163338,0.356098,0.408377,...,0.299235,0.516789,0.705222,-0.493555,0.405825,0.0461248,0.292887,0.405928,0.530375,-0.23156
2,m3_total_retention,0.211225,0.92841,1.0,0.519792,0.0901896,0.575183,0.16332,0.386666,0.3973,...,0.2692,0.480939,0.70553,-0.428842,0.409808,0.0781143,0.356448,0.389312,0.529296,-0.285626
3,MAU,0.303041,0.45853,0.519792,1.0,0.629034,0.768421,0.17719,0.659617,0.657383,...,0.606032,0.339126,0.488035,-0.248344,0.525302,-0.0730708,0.387886,-0.256675,0.233986,-0.437458
4,nodes,0.258061,0.0838592,0.0901896,0.629034,1.0,0.420703,-0.0170718,0.703323,0.848194,...,0.420072,0.0753652,0.131591,0.0241346,0.771169,0.15453,0.340612,-0.505911,0.309412,-0.256115
5,items_sold_june_2017,0.458624,0.467759,0.575183,0.768421,0.420703,1.0,0.00622569,0.739142,0.386452,...,0.23521,0.116789,0.579942,0.0651842,0.428724,0.136544,0.7571,-0.0844056,0.271692,-0.681667
6,items_sold_change_2017,-0.130657,0.163338,0.16332,0.17719,-0.0170718,0.00622569,1.0,0.0313711,0.205782,...,0.5204,0.473423,0.174466,-0.308954,0.0861198,0.0308525,-0.175415,0.143311,0.144468,0.283192
7,inventory,0.25722,0.356098,0.386666,0.659617,0.703323,0.739142,0.0313711,1.0,0.680205,...,0.408451,0.142138,0.422,0.0684401,0.66963,0.230304,0.704773,-0.163005,0.416847,-0.542524
8,mau_june_2017,0.104252,0.408377,0.3973,0.657383,0.848194,0.386452,0.205782,0.680205,1.0,...,0.581974,0.399866,0.361852,-0.275712,0.914929,0.26627,0.287258,-0.10745,0.638967,-0.0926592
9,mau_change_2017,-0.299658,0.403188,0.372555,0.356791,0.0977959,0.00418103,0.676175,0.0917243,0.454764,...,0.797043,0.723927,0.26733,-0.545914,0.330878,0.0990357,-0.205694,0.314583,0.407824,0.390575


### Notes on Correlations among transaction network properties
 
OBSERVATIONS
 
 Inventory correlations:
 
 - Communities where the average seller has a high number of buyers are also likely to have more inventory (0.658)
 - density is inversely correlated with inventory (row 17, -0.596)