## The realm of possibility on VarageSale is more like MineCraft than Airbnb.

#### TODO:

- refactor indegree and outdegree *
- adapt sql for other network types
- move to active network instead of all time
- are inweight and outwieght useful?

#### Edge Types

- Transaction
- Message
- Praise
- Comment
- Interest

In [88]:
# initializing

import pandas as pd
import pandas_gbq as pgbq
from multiprocessing import Pool
from scipy import stats

network_type = 'transaction'
sql_modules = ['edges','edge_weight','indegree','outdegree']
google_project_id = 'solid-ridge-104914'

Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 2737 rows.

Total time taken 4.61 s.
Finished at 2017-07-10 18:13:56.


In [111]:
# worker function

def bigquery_worker(i):
    sql_module = sql_modules[i]
    print('Starting module ',i,': ',sql_modules[i], sep='')
    try:
        result = pgbq.read_gbq('select * from community_networks.'+network_type+'_'+sql_module, google_project_id, dialect='standard')
        print('\n\n',network_type,'_',sql_module,' table found.\n\n', sep='')
    except:
        print('\n\n',network_type,'_',sql_module,' table does not exist, generating now\n\n', sep='')
        with open('../sql/cumulative_networks/'+network_type+'/'+sql_module+'.txt') as query_file:
            query = query_file.read()
        result = pgbq.read_gbq(query, google_project_id, dialect='standard')
        print('\n\nCreating new ',network_type,'_',sql_module,' table\n\n', sep='')
        pgbq.to_gbq(result, 'community_networks.'+network_type+'_'+sql_module, google_project_id, if_exists='replace')
        print('\n\nModule ',i,': ',sql_modules[i],'query complete\n\n', sep='')
    return (result)

#if community facts are already saved, use those otherwise re-calculate and save

def community_facts():
    try:
        result = pgbq.read_gbq('select * from community_networks.community_facts', google_project_id, dialect='standard')
    except:
        print('Community facts table does not exist, generating now.')
        with open('../sql/cumulative_networks/community_facts.txt') as query_file:
            query = query_file.read()
        result = pgbq.read_gbq(query, google_project_id, dialect='standard')
        print('Saving result to new table')
        result.to_gbq(result, 'community_networks.community_facts', google_project_id, if_exists='replace')
    return (result) 
        
        

In [112]:
#make the worker work, join it all with generic community facts

if __name__ == '__main__':
    with Pool(processes=len(sql_modules)) as pool:
        result = pool.map(bigquery_worker, (range(len(sql_modules),)))

print('\n\nRetrieving generic community facts\n\n', sep='')
raw_network = community_facts()
print('\n\nJoining ',network_type,' stats to community facts\n\n', sep='')
for i in range(len(result)):
    raw_network = raw_network.join(result[i].set_index('cid'))
print('\n\nDone\n\n', sep='')


Starting module 0: edges
Starting module 1: edge_weight
Starting module 2: indegree
Starting module 3: outdegree
Requesting query... Requesting query... Requesting query... ok.
Query running...
ok.
Query running...
ok.
Query running...
Requesting query... ok.
Query running...
Query done.
Processed: 42.1 KB

Retrieving results...
Got 2692 rows.

Query done.
Processed: 0.0 B

Total time taken 3.09 s.
Finished at 2017-07-10 18:34:59.
Retrieving results...


Got 2692 rows.

transaction_edges table found.


Total time taken 3.22 s.
Finished at 2017-07-10 18:34:59.


transaction_edge_weight table found.


Query done.
Processed: 0.0 B

Retrieving results...
Got 2692 rows.

Total time taken 4.33 s.
Finished at 2017-07-10 18:35:00.


transaction_outdegree table found.


Query done.
Processed: 0.0 B

Retrieving results...
Got 2692 rows.

Total time taken 3.68 s.
Finished at 2017-07-10 18:35:01.


transaction_indegree table found.




Retrieving generic community facts


Requesting query... ok.
Q

In [113]:
raw_network

Unnamed: 0,cid,name,nodes,items_sold_may_2017,items_sold_change_2017,inventory,mau_may_2017,mau_change_2017,edges,avg_weight,avg_indegree,avg_outdegree
0,221,"Community Blessings of Prosper, TX",7827,1626,-0.091620,18369,1735,-0.061147,,,,
1,299,SARNIA - LAMBTON ONTARIO BUY AND SELL,28239,12745,0.272464,101885,11417,0.076872,424914.0,1.625703,36.374283,22.199569
2,314,Oyen AB and Area Buy & Sell,1029,0,-1.000000,1101,196,-0.024876,4225.0,1.570888,80.939024,1.794754
3,319,Northern Wisconsin,1973,5,4.000000,3706,236,-0.008403,507893.0,1.615386,30.181099,23.511090
4,338,"Fremont, California Mommies Buy/Sell/Trade",4498,39,-0.426471,8218,232,-0.254019,266808.0,1.631765,42.876502,15.330399
5,449,"Newmarket, Ontario Buy and Sell",31515,7584,0.105217,168283,16317,0.060510,219295.0,1.613689,46.926668,15.172748
6,491,"Athens Metro, GA Buy & Sell",3875,17,-0.370370,3451,381,-0.220859,593.0,1.655987,57.764706,1.731922
7,600,"Southwest Orlando, FL Buy and Sell",1923,19,0.583333,4769,267,-0.118812,11145.0,1.770480,68.041379,2.270656
8,611,"South Tampa, FL Buy and Sell",757,9,0.285714,1651,117,-0.193103,53108.0,1.878248,34.539474,6.891192
9,707,"Holladay, UT Buy and Sell",230,2,0.000000,469,30,-0.268293,178.0,1.724719,25.583333,1.774566


In [None]:
# Calculations

  'transaction' as edge_type,
  cf.*,
  avg_weight,
  if(nodes*(nodes-1)/2=0, 0, edges/(nodes*(nodes-1)/2))*100 as network_density_X_100,
  2*edges/nodes as avg_degree,
  avg_indegree,
  avg_outdegree,
  (avg_indegree - avg_outdegree) / avg_outdegree indegree_skew

In [10]:
# Cross correlate community facts and output key correlations

variables = list(transaction_net)[3:len(transaction_net)] #this is not a good way to create the list of variables

transaction_corr_matrix = pd.DataFrame(index=variables, columns=variables)
transaction_key_correlations = pd.DataFrame(columns=['pair','coefficient'])
corr_checked = list()

i = 0
j = 0

for i in range(len(variables)):
    for j in range(len(variables)):
        pair = variables[i]+','+variables[j]
        rho, pval = stats.spearmanr(transaction_net[[variables[i],variables[j]]])
        transaction_corr_matrix.set_value(variables[i], variables[j], rho)
        if pair not in corr_checked and rho < 0.99 and (rho > 0.5 or rho < -0.5):
            transaction_key_correlations.loc[len(transaction_key_correlations)] = [pair, rho]
            corr_checked.append(variables[j]+','+variables[i])

transaction_key_correlations.sort_values(by='coefficient', ascending=False, inplace=True)
transaction_key_correlations.reset_index(drop=True, inplace=True)

#Save to BigQuery
pgbq.to_gbq(transaction_corr_matrix, 'community_networks.transaction_corr_matrix', google_project_id, if_exists='replace')
pgbq.to_gbq(transaction_key_correlations, 'community_networks.transaction_key_correlations', google_project_id, if_exists='replace')
transaction_key_correlations

NameError: name 'transaction_net' is not defined

In [4]:
transaction_corr_matrix

Unnamed: 0,nodes,items_sold_may_2017,items_sold_change_2017,inventory,mau_may_2017,mau_change_2017,avg_weight,network_density_X_100,avg_degree,avg_indegree,avg_outdegree,indegree_skew
nodes,1.0,0.811128,0.151552,0.86714,0.898414,0.214265,0.222632,-0.687822,0.283159,0.264371,0.644004,-0.294765
items_sold_may_2017,0.811128,1.0,0.363262,0.775534,0.837331,0.313216,0.133187,-0.490727,0.34162,0.19147,0.590193,-0.343991
items_sold_change_2017,0.151552,0.363262,1.0,0.0996845,0.222924,0.248604,-0.0265483,-0.0296677,0.156609,0.0759387,0.109157,0.00269656
inventory,0.86714,0.775534,0.0996845,1.0,0.754781,0.042468,0.262172,-0.596689,0.256094,0.212205,0.660469,-0.388256
mau_may_2017,0.898414,0.837331,0.222924,0.754781,1.0,0.480426,0.165736,-0.416746,0.549715,0.351145,0.687001,-0.243797
mau_change_2017,0.214265,0.313216,0.248604,0.042468,0.480426,1.0,-0.117285,0.11229,0.42104,0.245559,0.149689,0.128616
avg_weight,0.222632,0.133187,-0.0265483,0.262172,0.165736,-0.117285,1.0,-0.0646782,0.166843,0.542312,0.66375,0.10278
network_density_X_100,-0.687822,-0.490727,-0.0296677,-0.596689,-0.416746,0.11229,-0.0646782,1.0,0.437724,0.308069,-0.189178,0.529621
avg_degree,0.283159,0.34162,0.156609,0.256094,0.549715,0.42104,0.166843,0.437724,1.0,0.71979,0.603985,0.203144
avg_indegree,0.264371,0.19147,0.0759387,0.212205,0.351145,0.245559,0.542312,0.308069,0.71979,1.0,0.585827,0.538854


### Notes on Correlations among transaction network properties
 
OBSERVATIONS
 
 Inventory correlations:
 
 - Communities where the average seller has a high number of buyers are also likely to have more inventory (0.658)
 - density is inversly correlated with inventory (row 17, -0.596)