In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm

### Simple SIS (sure independence screening) on 20 parameters, 2 of which form the true model

In [58]:
#set up parameters and model
# Seed the global NumPy generator so that Restart & Run All reproduces the
# same design matrix and noise (and therefore the same ranking below).
SEED = 0
N_SAMPLES = 100  # number of observations (rows of x)
N_PARAMS = 20    # number of candidate parameters (columns of x)
np.random.seed(SEED)

x = np.random.normal(size=(N_SAMPLES, N_PARAMS))
#scale & center such that each parameter has mean 0 and std 1
x_mean = np.mean(x,axis=0)
x = x - x_mean
x_std = np.std(x,axis=0)
x = x/x_std
# sanity check: every column should now report std 1 and mean 0
print('x std:',np.std(x,axis=0))
print('x mean:',np.round(np.mean(x,axis=0),10))

# true model uses only the first two parameters (coefficients 5 and -1)
true_y = x[:,0]*5 - x[:,1]*1
# unit-variance Gaussian noise, one draw per observation
noise = np.random.normal(size=N_SAMPLES)
obs_y = true_y + noise

x std: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
x mean: [-0. -0. -0. -0. -0. -0.  0.  0. -0.  0.  0. -0.  0. -0.  0. -0. -0.  0.
  0. -0.]


In [59]:
# Componentwise regression: one weight per parameter, obtained as the inner
# product of that parameter's column of x with the observed response.
w = x.T.dot(obs_y)
w

array([ 505.21168657, -105.92353261,  -46.16309252,  156.43601909,
         36.64172262,  -34.22914895,  -64.40352807,  -71.63951447,
        -48.89145366,   63.34449369,  -54.02042548,  -31.75148389,
         21.87834443,   20.49610815,  -72.91190339,   50.08186075,
         26.060757  ,   50.13672791,   30.09756258,   -1.04828851])

In [60]:
# Rank the parameters by |w|, largest first.
ind = np.arange(len(w))  # parameter ids, paired with w so they survive the reordering
w_ind = np.column_stack((ind, w))
w_ind[np.argsort(np.abs(w))[::-1]]

array([[   0.        ,  505.21168657],
       [   3.        ,  156.43601909],
       [   1.        , -105.92353261],
       [  14.        ,  -72.91190339],
       [   7.        ,  -71.63951447],
       [   6.        ,  -64.40352807],
       [   9.        ,   63.34449369],
       [  10.        ,  -54.02042548],
       [  17.        ,   50.13672791],
       [  15.        ,   50.08186075],
       [   8.        ,  -48.89145366],
       [   2.        ,  -46.16309252],
       [   4.        ,   36.64172262],
       [   5.        ,  -34.22914895],
       [  11.        ,  -31.75148389],
       [  18.        ,   30.09756258],
       [  16.        ,   26.060757  ],
       [  12.        ,   21.87834443],
       [  13.        ,   20.49610815],
       [  19.        ,   -1.04828851]])

A less important parameter in the model may not be near the top of the correlation list if its linear coefficient is small. However, given a large enough submodel, the parameter should still make the cut, and it can then be identified by the subsequent application of a more careful selector.

In [29]:
# Parameter indices ranked by weight magnitude, largest first.
# The second column of w_ind holds signed weights, so take the absolute value
# before sorting; otherwise a strongly negative weight would be ranked last.
np.abs(w_ind[:,1]).argsort()[::-1]

array([ 0,  1, 19,  7, 11, 14, 18,  5, 13,  3,  8, 16, 15, 10,  9, 12,  2,
       17,  4,  6], dtype=int64)

In [28]:
# Display the (parameter index, weight) table in its original, unsorted order.
# NOTE(review): the recorded output appears to come from an earlier run (the
# execution count precedes the cells above) — re-run to refresh it.
w_ind

array([[0.00000000e+00, 5.48099155e+02],
       [1.00000000e+00, 2.59171537e+02],
       [2.00000000e+00, 6.43993590e+00],
       [3.00000000e+00, 4.73470286e+01],
       [4.00000000e+00, 4.14251087e+00],
       [5.00000000e+00, 5.70147005e+01],
       [6.00000000e+00, 4.56981213e-02],
       [7.00000000e+00, 9.30293493e+01],
       [8.00000000e+00, 4.10357082e+01],
       [9.00000000e+00, 1.25038788e+01],
       [1.00000000e+01, 2.56054111e+01],
       [1.10000000e+01, 7.67849420e+01],
       [1.20000000e+01, 7.56933827e+00],
       [1.30000000e+01, 5.21806272e+01],
       [1.40000000e+01, 7.49144042e+01],
       [1.50000000e+01, 3.40553115e+01],
       [1.60000000e+01, 3.92202826e+01],
       [1.70000000e+01, 6.33617168e+00],
       [1.80000000e+01, 6.89774951e+01],
       [1.90000000e+01, 1.41879500e+02]])