In [1]:
from regression import citylist, loadData
import itertools, numpy as np, scipy.stats as stats
from sklearn.cluster import affinity_propagation, spectral_clustering
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load data
df = loadData(citylist)
cols = list(df.columns.difference(['lat', 'lon', 'name', 'riders']))

In [3]:
# Regression for each possible combo of features
combos = [(a, b) for a, b in itertools.combinations(cols, 2) if a != b]
data = []
for a, b in combos:
    y = df[a]
    x = df[b]
    slope, intercept, rval, pval, stderr = stats.linregress(x, y)
    rs = rval**2
    data.append((a, b, rs))

In [4]:
# Build affinity matrix; default affinity is 1 (for value to itself)
adj = np.ones([len(cols), len(cols)])
for istr, jstr, score in data:
    i = cols.index(istr)
    j = cols.index(jstr)
    adj[i][j] = score
    adj[j][i] = score

In [36]:
df2 = df[clusters[9]]

In [46]:
scores = {c: [] for c in clusters[9]}
for _ in range(100):
    rf = RandomForestRegressor(max_features='log2')
    rf.fit(df2, df['riders'])
    for name, s in zip(clusters[9], rf.feature_importances_):
        scores[name].append(s)

out = [(name, np.mean(s)) for name, s in scores.items()]
for name, score in sorted(out, key=lambda x: x[1]):
    print(name, score)

near_house_w_child 0.179022183915
near_family 0.185022164597
near_pop_poor 0.18637702309
near_population 0.215517532877
near_pop_child 0.234061095522


In [26]:
data = [(x, y) for x, y in enumerate(adj[cols.index('near_pop_child')])]

In [32]:
cols[118]

'near_population'

In [27]:
sorted(data, key=lambda x: x[1], reverse = True)

[(114, 1.0),
 (100, 0.94472953065075882),
 (116, 0.86290240901723181),
 (97, 0.82568380589605916),
 (118, 0.75125447725030947),
 (145, 0.64041147589977476),
 (112, 0.53795030373495589),
 (115, 0.51305995546681649),
 (94, 0.48437670466484439),
 (104, 0.43602156332772668),
 (92, 0.42398754211854101),
 (131, 0.41624171705991037),
 (135, 0.40134654326055358),
 (101, 0.39264663866241728),
 (147, 0.39081142873171743),
 (102, 0.37891150059570022),
 (103, 0.37655430928535982),
 (110, 0.36445216327291591),
 (108, 0.29384795044250844),
 (86, 0.2744382110735587),
 (24, 0.27027961904166337),
 (74, 0.26016109571719653),
 (14, 0.23545958587790006),
 (106, 0.23534856190870224),
 (84, 0.22839072105195246),
 (26, 0.21764518480731476),
 (109, 0.20749237117196243),
 (70, 0.1863329182565123),
 (10, 0.18123566661230181),
 (128, 0.1533703034673346),
 (67, 0.14901569326538941),
 (111, 0.14720651133839949),
 (88, 0.14171511464658582),
 (80, 0.13221171426024858),
 (127, 0.1266322736817895),
 (82, 0.11456448986

In [5]:
centers, labels = affinity_propagation(adj)
clusters = {}
for name, c in zip(cols, labels):
    if c in clusters:
        clusters[c].append(name)
    else:
        clusters[c] = [name]
for c in clusters:
    print(c)
    print(clusters[c])

0
['15net_business', '15net_emp_pay', '15net_employment', '15net_entertainment', '15net_finance', '15net_hospitality', '15net_hunits_new']
1
['15net_bachelors', '15net_emp_full_time', '15net_employed', '15net_family', '15net_household', '15net_hunits', '15net_hunits_attached', '15net_hunits_large', '15net_hunits_medium', '15net_hunits_old', '15net_hunits_owner', '15net_hunits_renter', '15net_hunits_vacant', '15net_labor_force', '15net_pop_old', '15net_pop_rich', '15net_population']
2
['15net_house_w_child', '15net_hunits_detached', '15net_pop_child', '15net_pop_poor', '30net_hunits_detached']
3
['30net_business', '30net_emp_pay', '30net_employment', '30net_entertainment', '30net_finance', '30net_hospitality']
4
['30net_bachelors', '30net_emp_full_time', '30net_employed', '30net_family', '30net_house_w_child', '30net_household', '30net_hunits', '30net_hunits_attached', '30net_hunits_large', '30net_hunits_medium', '30net_hunits_new', '30net_hunits_old', '30net_hunits_owner', '30net_hunit

In [12]:
def remove_cols(to_remove, rcols, radj):
    i_to_remove = [rcols.index(name) for name in to_remove]
    radj = np.delete(radj, i_to_remove, axis = 0)
    radj = np.delete(radj, i_to_remove, axis = 1)
    rcols = [name for i, name in enumerate(rcols) if i not in i_to_remove]
    return rcols, radj       

In [21]:
rf = RandomForestRegressor(max_features='log2')
rf.fit(df[cols], df['riders'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [22]:
data = sorted([(name, score) for name, score in zip(cols, rf.feature_importances_)], key=lambda x:x[1], reverse=True)
for d in data:
    print(d)

('30net_hunits_detached', 0.093772865259808047)
('15net_population', 0.063509592785467112)
('15net_pop_poor', 0.045356127577798652)
('near_employment', 0.035898281956300612)
('15net_entertainment', 0.033363939465154376)
('15net_hunits_attached', 0.029527585878582129)
('walk_entertainment', 0.029358971266068356)
('walk_hunits_vacant', 0.02701120881341354)
('30net_hunits_vacant', 0.024946845829902406)
('15net_pop_old', 0.024126224430420552)
('walk_employment', 0.024035931528499557)
('15net_hunits_renter', 0.022747902126581072)
('15net_emp_pay', 0.022181532305590994)
('30net_hunits_new', 0.021284251156469647)
('60net_employed', 0.015350575253033415)
('15net_hunits_owner', 0.015300120836224488)
('near_pop_rich', 0.015271730959104415)
('near_household', 0.013574203407501775)
('near_pop_old', 0.013511066890044878)
('near_business', 0.012973389561307109)
('near_hospitality', 0.01265711142754947)
('15net_employment', 0.012600174167173017)
('30net_pop_poor', 0.012491732446172975)
('near_hunits_

In [25]:
def choose_best():
    rcols = [x for x in cols]
    radj = np.array(adj)
    keep = []
    while len(rcols) > 1:
        keep, rcols, radj = one_iter(keep, rcols, radj)
    
    keep.extend(rcols)
    return keep

In [28]:
def one_iter(keep, rcols, radj):
    rf = RandomForestRegressor()
    rf.fit(df[rcols], df['riders'])
    featScore = {name: score for score, name in zip(rf.feature_importances_, rcols)}

    if len(rcols) == 2:
        if featScore[rcols[0]] > featScore[rcols[1]]:
            del rcols[1]
            return keep, rcols, radj
        else:
            del rcols[0]
            return keep, rcols, radj


    centers, labels = affinity_propagation(radj)
    clusters = {}
    #print(len(rcols), len(labels))
    try:
        for name, c in zip(rcols, labels):
            if c in clusters:
                clusters[c].append(name)
            else:
                clusters[c] = [name]
    except TypeError as e:
        print(rcols, labels)
        raise e
    drop_clusters = []
    for c, names in clusters.items():
        if len(names) == 1:
            #print("Single cluster, keeping: ", names)
            drop_clusters.append(c)
            keep.extend(names)
            rcols, radj = remove_cols(names, rcols, radj)
    for to_drop in drop_clusters:
        del clusters[to_drop]

    to_remove = []
    for c, names in clusters.items():
        #print("Cluster") # Remove
        #for n in names:
        #    print(n, featScore[n])
        name, minVal = min(zip(names, [featScore[n] for n in names]), key=lambda x: x[1])
        #print("Lowest in cluster, dropping: ", name, minVal)
        to_remove.append(name)
    rcols, radj = remove_cols(to_remove, rcols, radj)
    
    return keep, rcols, radj

In [7]:
choose_best()

Single cluster, keeping:  ['parking']
Cluster
15net_business 0.0166224719
15net_emp_pay 0.000106013773345
15net_employment 0.00131183198163
15net_entertainment 0.00221766924499
15net_finance 0.0057302579702
15net_hospitality 0.00216665341327
15net_hunits_new 0.00034177438936
Lowest in cluster, dropping:  15net_emp_pay 0.000106013773345
Cluster
15net_bachelors 0.000275723378745
15net_emp_full_time 0.00369076443603
15net_employed 0.000342906930573
15net_family 0.0537819600326
15net_household 0.000911320426072
15net_hunits 0.000818656682765
15net_hunits_attached 0.00114731206639
15net_hunits_large 0.00286425183029
15net_hunits_medium 0.0123275563621
15net_hunits_old 0.0013300520094
15net_hunits_owner 0.010411133919
15net_hunits_renter 0.00048220510755
15net_hunits_vacant 0.00107922556538
15net_labor_force 0.000615102727736
15net_pop_old 0.0380590575648
15net_pop_rich 0.00020890283751
15net_population 0.16833039848
Lowest in cluster, dropping:  15net_pop_rich 0.00020890283751
Cluster
15net

KeyboardInterrupt: 

In [29]:
totalcounts = {}
for i in range(10):
    print("Iteration: {0}\r".format(i+1))
    selects = choose_best() 
    for name in selects:
        if name in totalcounts:
            totalcounts[name] += 1
        else:
            totalcounts[name] = 1
            
countlist = sorted([(name, count) for name, count in totalcounts.items()], key=lambda x:x[1], reverse=True)
for name, count in countlist:
    print("{0}: {1}%".format(name, count))

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
15net_hunits_detached: 10%
parking: 10%
near_university: 8%
walk_hunits_detached: 7%
near_medical: 6%
near_employment: 6%
walk_pop_old: 6%
15net_house_w_child: 6%
walk_hunits_attached: 6%
near_hunits_new: 5%
15net_university: 4%
near_household: 4%
near_hospitality: 4%
walk_business: 3%
near_bachelors: 3%
near_hunits_detached: 3%
near_entertainment: 3%
15net_business: 2%
near_hunits_owner: 2%
60net_emp_pay: 2%
15net_family: 2%
walk_university: 2%
15net_population: 2%
near_hunits_vacant: 1%
60net_hospitality: 1%
60net_university: 1%
60net_entertainment: 1%
walk_pop_poor: 1%
walk_hunits: 1%
30net_emp_pay: 1%
60net_family: 1%
walk_population: 1%
near_labor_force: 1%
walk_pop_child: 1%
30net_university: 1%
60net_hunits_detached: 1%


In [29]:
selects   

['parking',
 'near_entertainment',
 '15net_hunits_detached',
 'walk_hunits_detached',
 'near_employment',
 'walk_medical',
 'near_hunits_vacant',
 'walk_pop_poor',
 '30net_university',
 'walk_hunits_attached',
 'near_hunits_owner',
 '15net_house_w_child',
 '15net_business']

[0 1 0 1 2 1 1 0]
