## The realm of possibility on VarageSale is more like MineCraft than Airbnb.

#### TODO:

- adapt SQL for other network types
- are inweight and outweight useful?

#### Edge Types

- Transaction
- Message
- Praise
- Comment
- Interest

In [1]:
# initializing

import pandas as pd
import pandas_gbq as pgbq
from multiprocessing import Pool
from scipy import stats
import os
import time
import numpy as np
import re
from get_features import *

In [2]:
# Project id passed to every pandas-gbq read/write call in this notebook.
google_project_id = 'solid-ridge-104914'
# BigQuery dataset name used as the prefix when generated module tables are written.
dataset = 'community_networks'
# Module types iterated when the per-community feature table is assembled.
sql_module_types = ['community_facts','transaction','follow']

In [3]:
# Seed the table with the community ids from vs_reporting.communities, indexed
# by `cid` (drop=False keeps `cid` as a regular column too).
data = pgbq.read_gbq('select id cid from vs_reporting.communities', google_project_id, dialect='standard', verbose=False).set_index('cid', drop=False)

# For each module type: list its active SQL modules, build that type's table and
# left-join it onto the community rows.
# NOTE(review): build_table is not defined in this notebook; presumably it comes
# from get_features and relies on the `sql_modules` / `module_type` globals bound
# here -- confirm.
for module_type in sql_module_types:
    sql_modules = open_active_modules(module_type)
    module_table = build_table(module_type)
    # Overlapping column names coming from the left side get a '_left' suffix.
    data = data.join(module_table, how='left', lsuffix='_left')

# Turn +/-inf into NaN, then replace every NaN with 0.
feature_set = data.replace([np.inf,-np.inf], np.nan)
feature_set.fillna(0, inplace=True)
# Discard the first two columns (presumably id columns -- TODO confirm).
feature_set = feature_set.iloc[:,2:]
# Remove every row whose index label belongs to a row with avg_communities_per_user == 0.
feature_set.drop(feature_set[feature_set.avg_communities_per_user == 0].index, inplace=True)


community_facts


UnboundLocalError: local variable 'result' referenced before assignment

In [None]:


def open_active_modules(module_type, base_dir='/Users/joshharris/community_health/sql/module_types/'):
    """Return the names of the active SQL modules for ``module_type``.

    A module counts as active when its entry in ``<base_dir>/<module_type>``
    has the extension ``.on``; the returned names have that extension removed.

    Args:
        module_type: Name of the sub-directory to scan.
        base_dir: Directory that holds one sub-directory per module type.
            Defaults to the original hard-coded location.

    Returns:
        Alphabetically sorted list of module names. ``os.listdir`` gives no
        ordering guarantee, so sorting keeps module indices stable between runs.

    Raises:
        OSError: If the module-type directory cannot be listed.
    """
    directory = os.path.join(base_dir, module_type)
    name_parts = (os.path.splitext(entry) for entry in os.listdir(directory))
    return sorted(stem for stem, extension in name_parts if extension == '.on')

def bigquery_worker(i):
    """Return the result table for the ``i``-th active SQL module.

    First tries to read the module's table from BigQuery. If that read fails,
    the module's query file is executed instead and its result is written back
    to BigQuery (replacing any existing table) before being returned.

    Relies on the notebook-level names ``sql_modules``, ``module_type``,
    ``dataset``, ``google_project_id`` and ``directory``.

    Args:
        i: Index into ``sql_modules``.

    Returns:
        The DataFrame read from the existing table or produced by the query.
    """
    sql_module = sql_modules[i]
    # Read and write now address the same table through `dataset`.
    table = dataset+'.'+module_type+'_'+sql_module
    print('Starting module ',i,': ',sql_module, sep='')
    try:
        result = pgbq.read_gbq('select * from '+table, google_project_id, dialect='standard', verbose=False)
    except Exception:
        # Any read failure is treated as "table missing" (best effort), but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print(module_type,'_',sql_module,' table does not exist, generating now', sep='')
        # NOTE(review): `directory` is not defined in this notebook cell (it is a
        # local of open_active_modules); presumably get_features provides it -- confirm.
        # NOTE(review): open_active_modules strips the '.on' extension, so this
        # opens the module name without it -- verify the file exists under that name.
        with open(directory+'/'+sql_module) as query_file:
            query = query_file.read()
        result = pgbq.read_gbq(query, google_project_id, dialect='standard', verbose=False)
        print('Creating new ',module_type,'_',sql_module,' table', sep='')
        pgbq.to_gbq(result, table, google_project_id, if_exists='replace', verbose=False)
        print('Module ',i,': ',sql_module,' query complete', sep='')
    else:
        print(module_type,'_',sql_module,' table found.', sep='')
    return result

def add_network_stats(df, sql_module_type):
    """Add density and in-degree skew columns for one network type, in place.

    Reads the ``<type>_edges``, ``<type>_nodes``, ``<type>_avg_indegree`` and
    ``<type>_avg_outdegree`` columns, writes ``<type>_network_density_X_100``
    and ``<type>_indegree_skew``, then removes the edges and nodes columns.

    Args:
        df: DataFrame holding the four input columns; it is modified in place.
        sql_module_type: Column-name prefix identifying the network type.

    Returns:
        The same ``df`` object that was passed in.
    """
    prefix = sql_module_type + '_'
    edge_col = prefix + 'edges'
    node_col = prefix + 'nodes'

    # Fetch every input column before writing anything, so a missing column
    # raises before the frame has been touched.
    edge_count = df[edge_col]
    node_count = df[node_col]
    mean_in = df[prefix + 'avg_indegree']
    mean_out = df[prefix + 'avg_outdegree']

    # Edges as a percentage of the n*(n-1)/2 possible node pairs.
    df[prefix + 'network_density_X_100'] = (edge_count / (node_count * (node_count - 1) / 2)) * 100
    # Relative gap between average in-degree and average out-degree.
    df[prefix + 'indegree_skew'] = (mean_in - mean_out) / mean_out

    df.drop([edge_col, node_col], axis=1, inplace=True)
    return df

In [None]:
# Cross correlate community facts and output key correlations

# Every feature column takes part. (The previous slice by len(feature_set) cut
# the column list at the number of rows, which only worked by accident.)
variables = list(feature_set.columns)
print(variables)
corr_matrix = pd.DataFrame(index=variables, columns=variables)
key_correlations = pd.DataFrame(columns=['pair','coefficient'])
# Holds the reversed "b,a" key of every pair already reported, so each
# unordered pair appears only once; a set keeps the membership test O(1).
corr_checked = set()

for row_var in variables:
    for col_var in variables:
        pair = row_var+','+col_var
        rho, _ = stats.spearmanr(feature_set[[row_var, col_var]])
        # .at replaces DataFrame.set_value, which newer pandas no longer has.
        corr_matrix.at[row_var, col_var] = rho
        # Keep moderately strong correlations; rho >= 0.99 is skipped, which
        # excludes each variable's correlation with itself.
        if pair not in corr_checked and rho < 0.99 and (rho > 0.4 or rho < -0.4):
            key_correlations.loc[len(key_correlations)] = [pair, rho]
            corr_checked.add(col_var+','+row_var)

corr_matrix.reset_index(inplace=True)
key_correlations.sort_values(by='coefficient', ascending=False, inplace=True)
key_correlations.reset_index(drop=True, inplace=True)

#Save to BigQuery
#pgbq.to_gbq(transaction_corr_matrix, 'community_networks.transaction_corr_matrix_over_1000', google_project_id, if_exists='replace')
#pgbq.to_gbq(transaction_key_correlations, 'community_networks.transaction_key_correlations_over_1000', google_project_id, if_exists='replace')


In [None]:
key_correlations

In [None]:
corr_matrix

## Notes

pct_with_about_me correlates with pct_female_mau
 - action: encourage men to fill in their about me
 


In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from pandas.tools.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline
# Switch matplotlib to the 'MacOSX' backend; this runs right after the
# `%matplotlib inline` magic on the previous line.
plt.switch_backend('MacOSX') 

In [None]:
# Clustering features: every feature_set column from the fourth one onward.
# .copy() yields an independent frame with feature_set's index, which is all the
# former single-frame pd.concat(..., join_axes=...) did (join_axes is gone from
# current pandas).
drop = feature_set.iloc[:, 3:].copy()
# scale() returns a plain array, so `data` gets a fresh 0..n-1 index.
data = pd.DataFrame(scale(drop))
data3D = pd.DataFrame(PCA(n_components=3).fit_transform(data)) # Project onto 3 principal components for the 3D plot
data2D = pd.DataFrame(PCA(n_components=2).fit_transform(data)) # Project onto 2 principal components for the 2D plot

# run kmeans with 100 k-means++ initialisations
# extract classification labels and cluster centers
# do some prep for visualization
def cluster_it_up(N):
    """Cluster the feature rows into ``N`` groups and build plotting frames.

    Uses the notebook-level frames ``drop``, ``feature_set``, ``data``,
    ``data3D`` and ``data2D``.

    Args:
        N: Number of clusters.

    Returns:
        Tuple ``(clust_out, centers, N, feature_set, data, labels, viz3D, viz2D)``:
        ``clust_out`` is ``feature_set`` plus a ``label`` column, ``centers``
        holds the cluster centers under ``drop``'s column names, ``labels`` is a
        one-column frame indexed 0..n-1, and ``viz3D`` / ``viz2D`` are the PCA
        projections with the ``label`` column appended.
    """
    # NOTE(review): the model is fit on the unscaled `drop`, while the PCA
    # projections are computed from the scaled `data` -- confirm this is intended.
    kmeans = KMeans(init = 'k-means++', n_clusters = N, n_init = 100).fit(drop)
    labels = pd.DataFrame(kmeans.labels_, columns=['label']) # positional (0..n-1) index
    centers = pd.DataFrame(kmeans.cluster_centers_, columns=drop.columns) # readable column names

    # Attach labels by row position. feature_set keeps its own (cid) index, so
    # concatenating the 0..n-1 indexed `labels` onto it would align on index
    # values and put labels on the wrong rows.
    clust_out = feature_set.assign(label=kmeans.labels_)

    # The PCA frames share the positional index of `labels`, so a plain
    # column-wise concat lines them up row for row.
    viz3D = pd.concat([data3D, labels], axis = 1)
    viz2D = pd.concat([data2D, labels], axis = 1)
    return clust_out, centers, N, feature_set, data, labels, viz3D, viz2D

In [None]:
# Cluster into 5 groups and unpack the results. The fourth element (the
# feature_set frame returned by cluster_it_up) is bound to `strip`, and the
# notebook-level name `data` is rebound to the returned frame.
clust_out, centers, N, strip, data, labels, viz3D, viz2D  = cluster_it_up(5)

In [None]:
%matplotlib inline
# One colour per cluster, spread evenly over the rainbow colormap.
colors = plt.cm.rainbow(np.linspace(0, 1, len(centers)))

#2D viz
fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
for i, c in enumerate(colors):
    single_clus = viz2D.loc[viz2D['label'] == i]
    # color= (not c=): a single RGBA row passed as `c` is ambiguous and is
    # value-mapped when a cluster happens to contain exactly four points.
    plt.scatter(single_clus[0], single_clus[1], s=3, color=c, label=str(i))
plt.legend()
plt.show()

#3D viz
fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection='3d')
for i, c in enumerate(colors):
    single_clus = viz3D.loc[viz3D['label'] == i]
    # Components 1 and 2 go on x/y and component 0 on z, as before.
    ax.scatter(single_clus[1], single_clus[2], single_clus[0], s=20, color=c, label=str(i))
ax.legend()
plt.show()

#scatter matrix
#scatter_matrix(data, alpha=0.1, figsize=(6, 6), diagonal='kde')
#plt.show()

# Display the cluster centers as the cell output.
centers