In [1]:
from regression import citylist, loadData
import itertools, numpy as np, scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

In [14]:
# Load the dataset for the cities in `citylist`.
# NOTE(review): `loadData` is project-local; the rest of this notebook indexes
# its result by column name like a DataFrame — confirm the return type.
df = loadData(citylist)

# Feature columns used by the pairwise regressions and the Lasso fit below.
cols = [
    '60net_bachelors',
    '60net_business',
    '60net_emp_pay',
    '60net_employment',
    '60net_finance',
    '60net_hospitality',
    '60net_hunits_attached',
    '60net_hunits_medium',
    '60net_hunits_new',
    '60net_hunits_old',
    '60net_hunits_owner',
    '60net_hunits_vacant',
    '60net_pop_rich',
]

In [15]:
# Simple linear regression for every unordered pair of feature columns.
# Each entry of `data` is (response column, predictor column, R^2).
combos = [pair for pair in itertools.combinations(cols, 2) if pair[0] != pair[1]]
data = []
for a, b in combos:
    # linregress(x, y): column `b` is the predictor, column `a` the response.
    fit = stats.linregress(df[b], df[a])
    # Element 2 of the result is the correlation coefficient r; store r squared.
    data.append((a, b, fit[2] ** 2))

In [16]:
# Symmetric affinity matrix of pairwise R^2 scores, laid out in `cols` order.
# Starting from all ones leaves the diagonal (a column against itself) at 1.
n_features = len(cols)
adj = np.ones((n_features, n_features))
for name_a, name_b, r_squared in data:
    row, col = cols.index(name_a), cols.index(name_b)
    # The score does not depend on pair order, so mirror it across the diagonal.
    adj[row, col] = adj[col, row] = r_squared

In [17]:
# Show the affinity matrix together with the column order that indexes its
# rows and columns.
(adj, cols)

(array([[ 1.        ,  0.96813219,  0.83601712,  0.94710539,  0.89800918,
          0.97361911,  0.97911346,  0.99259078,  0.78883309,  0.9809996 ,
          0.95702245,  0.8470023 ,  0.99823079],
        [ 0.96813219,  1.        ,  0.87429026,  0.93977099,  0.93398219,
          0.97621814,  0.91568084,  0.94736367,  0.75809305,  0.91196   ,
          0.89933221,  0.78606432,  0.9741136 ],
        [ 0.83601712,  0.87429026,  1.        ,  0.94537893,  0.98668293,
          0.91495926,  0.7692797 ,  0.81659029,  0.42786328,  0.78327582,
          0.67131394,  0.48396832,  0.84452579],
        [ 0.94710539,  0.93977099,  0.94537893,  1.        ,  0.96542469,
          0.9847186 ,  0.91041211,  0.93545543,  0.61944907,  0.92176914,
          0.84246562,  0.6821071 ,  0.94603235],
        [ 0.89800918,  0.93398219,  0.98668293,  0.96542469,  1.        ,
          0.95603906,  0.83353558,  0.87750832,  0.52518423,  0.84405117,
          0.75309524,  0.57694171,  0.90591694],
        [ 0.973

In [18]:
# Lasso regression of 'riders' on the feature columns, with feature scaling.
#
# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the scaling is done explicitly here. The old option divided each centered
# column by its L2 norm. Dividing by the population standard deviation and
# multiplying alpha by sqrt(n_samples) gives the same optimisation problem
# (this is the migration recipe from the scikit-learn deprecation notice).
features = df[cols]
target = df['riders']
n_samples = len(features)

# Population std (ddof=0). Constant columns get a scale of 1 so they are left
# unscaled rather than divided by zero.
feature_scale = features.std(ddof=0).replace(0, 1.0)

# Default alpha is 1.0; rescale it to match the old normalize=True penalty.
mdl = Lasso(alpha=1.0 * np.sqrt(n_samples))
result = mdl.fit(features / feature_scale, target)

# Map the coefficients back to the original feature units so `mdl` can be used
# on unscaled data, as the normalize=True model could. The fitted intercept is
# already consistent with these rescaled coefficients
# (y_mean - X_mean . coef is unchanged), so it needs no adjustment.
mdl.coef_ = mdl.coef_ / feature_scale.values
result.coef_

array([ 0.        , -0.        ,  0.        ,  0.01411023, -0.08386563,
        0.        ,  0.        , -0.        , -0.06636101, -0.        ,
        0.        , -0.        ,  0.01231286])