In [16]:
from regression import citylist, loadData
import statsmodels.api as sm, itertools, networkx as nx, community, numpy as np
from sklearn.cluster import affinity_propagation, spectral_clustering
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = loadData(citylist)
cols = list(df.columns.difference(['lat', 'lon', 'name', 'riders']))
combos = [(a, b) for a, b in itertools.product(cols, cols) if a != b]

In [3]:
data = []
for a, b in combos:
#    a, b = 'near_population', 'near_family'
    y = df[a]
    X = sm.add_constant(df[b])
    result = sm.OLS(y, X).fit()
    rs = result.rsquared
    data.append((a, b, rs))
    


In [6]:
adj = np.zeros([len(cols), len(cols)])
for istr, jstr, score in data:
    i = cols.index(istr)
    j = cols.index(jstr)
    adj[i][j] = score
    adj[j][i] = score
    

In [17]:
centers, labels = affinity_propagation(adj)
clusters = {}
for name, c in zip(cols, labels):
    if c in clusters:
        clusters[c].append([name])
    else:
        clusters[c] = [name]
for c, names in clusters.items():
    print(c, names)

0 ['15net_business', ['15net_emp_pay'], ['15net_employment'], ['15net_entertainment'], ['15net_finance'], ['15net_hospitality'], ['15net_hunits_new']]
1 ['15net_bachelors', ['15net_emp_full_time'], ['15net_employed'], ['15net_family'], ['15net_household'], ['15net_hunits'], ['15net_hunits_attached'], ['15net_hunits_large'], ['15net_hunits_medium'], ['15net_hunits_old'], ['15net_hunits_owner'], ['15net_hunits_renter'], ['15net_hunits_vacant'], ['15net_labor_force'], ['15net_pop_old'], ['15net_pop_rich'], ['15net_population']]
2 ['15net_house_w_child', ['15net_hunits_detached'], ['15net_pop_child'], ['15net_pop_poor'], ['30net_hunits_detached']]
3 ['30net_business', ['30net_emp_pay'], ['30net_employment'], ['30net_entertainment'], ['30net_finance'], ['30net_hospitality']]
4 ['30net_bachelors', ['30net_emp_full_time'], ['30net_employed'], ['30net_family'], ['30net_house_w_child'], ['30net_household'], ['30net_hunits'], ['30net_hunits_attached'], ['30net_hunits_large'], ['30net_hunits_medi

In [19]:
labels = spectral_clustering(adj)
for name, c in zip(cols, labels):
    if c in clusters:
        clusters[c].append([name])
    else:
        clusters[c] = [name]
for c, names in clusters.items():
    print(c, names)


0 ['15net_business', ['15net_emp_pay'], ['15net_employment'], ['15net_entertainment'], ['15net_finance'], ['15net_hospitality'], ['15net_hunits_new'], ['15net_medical'], ['15net_university'], ['30net_medical'], ['30net_university'], ['60net_medical'], ['60net_university'], ['near_medical'], ['near_university'], ['parking'], ['walk_medical'], ['walk_university']]
1 ['15net_bachelors', ['15net_emp_full_time'], ['15net_employed'], ['15net_family'], ['15net_household'], ['15net_hunits'], ['15net_hunits_attached'], ['15net_hunits_large'], ['15net_hunits_medium'], ['15net_hunits_old'], ['15net_hunits_owner'], ['15net_hunits_renter'], ['15net_hunits_vacant'], ['15net_labor_force'], ['15net_pop_old'], ['15net_pop_rich'], ['15net_population'], ['near_bachelors'], ['near_emp_full_time'], ['near_employed'], ['near_household'], ['near_hunits'], ['near_hunits_large'], ['near_hunits_medium'], ['near_hunits_new'], ['near_hunits_old'], ['near_hunits_owner'], ['near_hunits_renter'], ['near_hunits_vacan

In [7]:
result = AgglomerativeClustering(n_clusters=4).fit(adj)
part_agg = {}
for part, name in zip(result.fit_predict(adj), cols):
    if part in part_agg:
        part_agg[part].append(name)
    else:
        part_agg[part] = [name]

  return linkage(y, method='ward', metric='euclidean')


In [5]:
nodes_part = community.best_partition(G, weight='weight')
part_louvain = {}
for k, v in nodes_part.items():
    if v in part_louvain:
        part_louvain[v].append(k)
    else:
        part_louvain[v] = [k]

In [9]:
result = KMeans(n_clusters=4).fit(adj)
part_kmeans = {}
for part, name in zip(result.predict(adj), cols):
    if part in part_kmeans:
        part_kmeans[part].append(name)
    else:
        part_kmeans[part] = [name]

In [15]:
output = []
for i in range(4):
    for j in range(4):
        for k in range(4):
            comlen = len(set(part_kmeans[i]) & set(part_louvain[j]) & set(part_agg[k]))
            avglen = len(part_kmeans[i])+len(part_louvain[j]) + len(part_agg[k])
            output.append((i, j, k, format(comlen*2/avglen)))
#for i, j, k, score in sorted(output, key = lambda x: x[3], reverse = True):
#    print(i, j, k, score)

In [70]:
row = adj[0]
rf = RandomForestRegressor()
rf.fit(df[cols], df['riders'])
featScore = {name: score for score, name in zip(rf.feature_importances_, cols)}

In [101]:
workinglist = []
for i, name, row in zip(range(len(cols)), cols, adj):
    maxIndex = list(row).index(max(row))
    highInds = [cols[i] for i,x in enumerate(row) if x > 0.99]
    minScore = min([featScore[x] for x in highInds]) if highInds else 0
    workinglist.append((name, featScore[name], len(highInds), minScore, featScore[name]<minScore))

In [58]:
rcols = [x for x in cols]
radj = np.array(adj)

In [103]:
sorted(workinglist, key=lambda x: x[4], reverse=True)

[('15net_hunits', 0.00026336082820744568, 1, 0.0019689980499935258, True),
 ('15net_labor_force', 0.00041193117738682315, 2, 0.0010508059390040038, True),
 ('30net_hunits', 2.8711893337580935e-05, 2, 0.000146978083384718, True),
 ('30net_labor_force', 0.00010440695566416357, 3, 0.000146978083384718, True),
 ('30net_pop_rich', 0.00028799923705843695, 1, 0.001146499088693286, True),
 ('30net_population', 0.00013163538459463393, 1, 0.0011910796363626534, True),
 ('60net_hunits', 3.2045386832697929e-05, 2, 6.6787926549970289e-05, True),
 ('60net_pop_rich', 1.976004044229574e-05, 2, 0.00011314609507415014, True),
 ('near_employed', 0.0012475036599142854, 1, 0.0041732792569205814, True),
 ('near_hunits', 0.0024649207744563608, 1, 0.016070561388668712, True),
 ('walk_household', 0.0016135731308468502, 1, 0.0097343576285956294, True),
 ('walk_labor_force', 0.0039010567639515851, 1, 0.0040884198390042656, True),
 ('15net_bachelors', 0.018687412713852502, 0, 0, False),
 ('15net_business', 0.0575

In [77]:
for k, v in sorted([(k, v) for k, v in featScore.items()], key=lambda x: x[1], reverse=True):
    print(k, v)
    

15net_family 0.110946150931
15net_house_w_child 0.105154991853
15net_business 0.0575889291829
15net_population 0.0532849317182
15net_pop_old 0.0525248368172
15net_emp_pay 0.0412963177918
15net_hunits_attached 0.0412799727111
30net_emp_pay 0.0315679313752
near_bachelors 0.0312288917137
near_hunits_owner 0.0305472347152
walk_hunits_detached 0.0244507812997
15net_bachelors 0.0186874127139
walk_hunits_attached 0.0183399272695
15net_hunits_detached 0.0169487263904
near_household 0.0160705613887
walk_pop_child 0.0159031731173
30net_medical 0.0158683149431
near_pop_old 0.0137567155245
near_business 0.0131786288042
15net_employment 0.0119859222617
15net_university 0.0111181740261
near_finance 0.0105359771977
30net_hunits_detached 0.0104608559076
walk_hunits 0.0097343576286
30net_hunits_owner 0.00877828918314
walk_pop_poor 0.0087616470689
near_emp_full_time 0.00830053614058
near_employment 0.00825353307889
15net_hospitality 0.00740298902437
walk_hunits_new 0.00642398397195
near_university 0.006