In [3]:
import pandas as pd
import pandas_gbq as pgbq
from scipy import stats

# Project id handed to the pandas_gbq read_gbq / to_gbq calls in this notebook.
google_project_id = 'solid-ridge-104914'

In [5]:
# Shortcut: read the previously computed network properties from the cached
# CSV rather than re-running the slow query.

message_net = pd.read_csv(
    filepath_or_buffer='~/community_health/data/message_network_properties.csv',
)

In [8]:
# Or, calculate network properties
# WARNING: This query takes over 30 mins, and the upload at the end REPLACES
# the destination BigQuery table.

with open('../sql/cumulative_networks/message_net.txt') as query_file:
    message_query = query_file.read()

message_net = pgbq.read_gbq(message_query, project_id=google_project_id, dialect='standard')

# Store the result under a message-specific table name. The destination used
# to be 'community_networks.transaction_net', which, combined with
# if_exists='replace', clobbered the transaction network table with message data.
pgbq.to_gbq(
    message_net,
    'community_networks.message_net',
    project_id=google_project_id,
    if_exists='replace',
)

ServerNotFoundError: Unable to find the server at www.googleapis.com

In [6]:
# Show the network-properties frame produced by one of the two cells above.
message_net

Unnamed: 0,edge_type,cid,name,possible_edges,actual_edges,nodes,network_density_X_100,avg_degree,avg_weight,total_transactions
0,transaction,1,Vaudreuil-Dorion & Surrounding Areas,836384550,4245412,40900,0.507591,207.599609,5.855542,49718378
1,transaction,2,Turks and Caicos Islands,73536,89632,384,121.888599,466.833333,3.403996,610214
2,transaction,3,West Island of Montreal,2228415420,4737508,66760,0.212595,141.926543,6.169650,58457534
3,transaction,4,Laval,364783555,4064968,27011,1.114351,300.986117,4.837209,39326202
4,transaction,5,Valleyfield et environs,115801371,3730028,15219,3.221057,490.180432,4.480231,33422772
5,transaction,6,Casa Grande Buy-Sell-Trade,144453,23379,538,16.184503,86.910781,2.672912,124980
6,transaction,7,"Whidbey Island Neighbors' Classifieds, WA",4385241,1258441,2962,28.697191,849.723835,1.916677,4824050
7,transaction,8,"Mount Forest, Arthur & Fergus Ontario Buy and ...",25180156,1694951,7097,6.731297,477.652811,2.795346,9475950
8,transaction,11,"Fort Lewis Flea Market, WA",25514796,189178,7144,0.741444,52.961366,3.757371,1421624
9,transaction,12,New Richmond Wisconsin Buy and Sell,26875446,600925,7332,2.235963,163.918440,2.678076,3218646


In [7]:
# Cross correlate community facts and output key correlations

# Every column from position 4 onward is treated as a variable to correlate.
# The slice is open-ended on purpose: bounding it with len(message_net) (the
# ROW count) silently dropped columns whenever the frame had fewer rows than
# columns.
# NOTE(review): selection is still position-based, so it depends on the column
# order of message_net — confirm the first four columns are the non-numeric ones.
variables = list(message_net.columns[4:])

# Full Spearman matrix (float dtype so it can be used numerically / uploaded as numbers).
message_corr_matrix = pd.DataFrame(index=variables, columns=variables, dtype=float)

# Collected [pair, coefficient] rows; turned into a DataFrame once, after the loop.
key_rows = []

for i, var_a in enumerate(variables):
    for j, var_b in enumerate(variables):
        rho, _pval = stats.spearmanr(message_net[[var_a, var_b]])
        # .at replaces DataFrame.set_value, which was removed in pandas 1.0.
        message_corr_matrix.at[var_a, var_b] = rho
        # Report each unordered pair once (i < j). Keep only strong correlations
        # (|rho| > 0.5) and skip near-perfect ones (rho >= 0.99), which also
        # excludes a variable paired with itself. NaN fails both comparisons.
        if i < j and rho < 0.99 and abs(rho) > 0.5:
            key_rows.append([var_a + ',' + var_b, rho])

# Strongest positive correlations first, with a clean 0..n-1 index.
message_key_correlations = (
    pd.DataFrame(key_rows, columns=['pair', 'coefficient'])
    .sort_values(by='coefficient', ascending=False)
    .reset_index(drop=True)
)

#Save to BigQuery
#pgbq.to_gbq(message_corr_matrix, 'community_networks.message_corr_matrix', google_project_id, if_exists='replace')
#pgbq.to_gbq(message_key_correlations, 'community_networks.message_key_correlations', google_project_id, if_exists='replace')
message_key_correlations

Unnamed: 0,pair,coefficient
0,"nodes,avg_weight",0.805421
1,"actual_edges,total_transactions",0.803515
2,"avg_degree,avg_weight",-0.82763
3,"network_density_X_100,avg_weight",-0.832252
4,"nodes,avg_degree",-0.952044
5,"nodes,network_density_X_100",-0.979852


Transaction-Message observations:

 - Node count and average edge weight are NOT correlated in transactions, but ARE strongly correlated in messages,
         i.e., the bigger the community, the more messages between pairs. Almost certainly confounded by time.
 - Node count and average degree are NOT correlated in transactions, but ARE strongly inversely correlated in messages,
         i.e., the bigger the community, the fewer members each user has messaged. Likely confounded by inactive users, and possibly by ease of shopping due to more selection.
       