# Introduction

$ f(X) = Y^{T} Y $

In [1]:
# Initialize / import all modules

%load_ext autoreload
%autoreload 2

import sys
import os
import glob
import utm

import numpy as np
import pandas as pd
import scipy
import scipy.signal
import scipy.stats
import sklearn as skl
import sklearn.datasets as ds
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler


import colorlover as cl 
# Matplotlib
%matplotlib inline 
import matplotlib.pyplot as plt

# Plotly
import plotly.offline as ply
import plotly.tools as plytools
import plotly.graph_objs as plygo
ply.init_notebook_mode(connected=False)



import sys
import os
import glob
import numpy as np
import pandas as pd
import scipy.signal

from tqdm import tqdm_notebook as progress
from IPython.core.display import display, HTML

# The Dataset

In [2]:
#
dataset = ds.fetch_california_housing()
df = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])

zeroLat, zeroLon = 37.133615, -126.774271
zeroX, zeroY, _, _ =  utm.from_latlon(37.133615, -126.774271)

dists = []
for i,r in df[['Latitude','Longitude']].iterrows():    
    lat  = r['Latitude']
    lon  = r['Longitude']
    x,y,_,_ = utm.from_latlon(lat, lon)
    x = x-zeroX
    y = y-zeroY
    dist = np.sqrt(x**2 + y**2)
    #dist = np.sqrt(x**2)
    dists.append(dist)
    
df['Distance'] = dists

df["target"] = dataset["target"]
display(df.tail())
print(dataset["DESCR"])
display(df.describe())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Distance,target
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,261914.724945,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,264320.285968,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,257863.109941,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,259325.485208,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,251559.823724,0.894


California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.




Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Distance,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,346575.162732,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,158603.959969,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,6112.701811,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,165694.61692,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,437586.174798,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,472599.853853,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,607694.701956,5.00001


## Select Features

In [3]:
#features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'Distance']
features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

## Average Income x Targer Price

In [4]:
traces=[]
for f in features[0:1]:
    traces.append(plygo.Scatter(x=df[f].values[::5], y=df['target'].values[::5], name=f, mode="markers"))

data = plygo.Data(traces)

layout = plygo.Layout(
    xaxis=dict(
        title='Average Income',
        autorange=True        
    ),
    yaxis=dict(
        title='Target Price (x100.000)',
        autorange=True        
    )
)

fig = plygo.Figure(data=data, layout=layout)
ply.iplot(fig)

## Locations and Prices

In [5]:
#
data = [ dict(
        type = 'scattergeo',
        lon = df['Longitude'][::3],
        lat = df['Latitude'][::3],
        text = df['target'][::3]*100000,
        mode = 'markers',
        marker = dict(
            size = 5,
            opacity = 0.7,
            autocolorscale = False,
            symbol = 'square',
            colorscale = 'Jet',
            color=df['target'][::3],
            cmin = df['target'].min(),
            cmax = df['target'].max(),
        ))]

layout = dict(
        title = 'House Locations',
        margin=plygo.Margin(
            l=0,
            r=0,
            b=0,
            t=0),
        geo = dict(
            resolution=50,
            scope='usa',
            projection=dict(type='albers usa', scale=1),
            landcolor = "rgb(200, 200,200)",
            subunitcolor = "rgb(255,255,255)",
            countrycolor = "rgb(255,255,255)",
            countrywidth = 1,
            subunitwidth = 3,
            lonaxis=dict(range=[-160,-80]), 
            lataxis=dict(range=[30,50]), 
            showland=True, 
            showocean=True, 
            showcountries=False,
            showcoastlines=True,
        ),

    )


fig = dict( data=data, layout=layout )
ply.iplot(fig, validate=True)

# Whitened data

In [6]:
X = df[features].values
y = df['target'].values.reshape(-1,1)

# Manual "Whithen data formula" was replaced by a more efficient [StandarScaler] for data and [RobustScaler] for target.
#wdf = np.dot(scipy.linalg.sqrtm(np.dot(X.T,X)),X.T).T
#wdf = pd.DataFrame(data=wdf, columns=features)
#wdf["target"] = y

wdf = StandardScaler().fit(X).transform(X)
wdf = pd.DataFrame(data=wdf, columns=features)
wdf["target"] = RobustScaler().fit(y).transform(y)

display(wdf.tail())

traces=[]
for f in features[:]:
    traces.append(plygo.Scatter(x=wdf[f].values[:1000], y=wdf['target'].values[:1000], name=f, mode="markers", opacity=1))


data = plygo.Data(traces)

layout = plygo.Layout(
    xaxis=dict(
    ),
    yaxis=dict(
        title="target"

    )
)
fig = plygo.Figure(data=data,layout=layout)
ply.iplot(fig)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.04911,1.801647,-0.758826,-0.700086
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722,-0.706977
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713,-0.602239
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626,-0.654608
20639,-0.780129,-1.004309,-0.070443,0.138403,-0.033977,-0.043682,1.750146,-0.833696,-0.622222


# Ridge Regression

In [7]:
def getTheta(X, y, d2):
    ridge = Ridge(alpha=d2)
    ridge.fit(X, y)
    return ridge.coef_

size = len(wdf)
nTrain = int(size* 1)
nTest = len(wdf) - nTrain 

# Only part of the data ?
# Move this approach to Cross Validation or Leave one Out
X = wdf[features].values[0:nTrain,:]
y = wdf[['target']].values[0:nTrain]

_thetas=[]

for _d2 in np.logspace(0,8, num=30):
    theta = getTheta(X,y,_d2)    
    l = [_d2]
    l.extend(theta.tolist()[0])
    _thetas.append(l)


_thetas = pd.DataFrame(_thetas, columns=["d2"] + ["theta{0}".format(i) for i in range(len(features))])
_thetas.set_index("d2", inplace=True)


display(_thetas.head())
display(_thetas.tail())

traces=[]
for c in _thetas:
    traces.append(plygo.Scatter(x=_thetas[c].index, y=_thetas[c].values, name=c, mode="lines", opacity=.5))


data = plygo.Data(traces)

layout = plygo.Layout(
    xaxis=dict(
        title='d^2',
        type='log',
        #autorange='reversed',    
    )
)
fig = plygo.Figure(data=data,layout=layout)
ply.iplot(fig)

Unnamed: 0_level_0,theta0,theta1,theta2,theta3,theta4,theta5,theta6,theta7
d2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,0.57164,0.081872,-0.182875,0.210525,-0.003087,-0.027101,-0.61965,-0.599425
1.887392,0.571624,0.081912,-0.182795,0.21042,-0.003073,-0.027103,-0.619272,-0.599044
3.562248,0.571592,0.081987,-0.182645,0.210222,-0.003047,-0.027107,-0.61856,-0.598325
6.723358,0.571533,0.082128,-0.182361,0.209848,-0.002997,-0.027114,-0.617221,-0.596974
12.68961,0.571418,0.082391,-0.181823,0.209144,-0.002904,-0.027128,-0.614713,-0.594442


Unnamed: 0_level_0,theta0,theta1,theta2,theta3,theta4,theta5,theta6,theta7
d2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7880463.0,0.001429,0.00022,0.000315,-9.7e-05,-5.1e-05,-4.9e-05,-0.000299,-9.6e-05
14873520.0,0.000758,0.000117,0.000167,-5.2e-05,-2.7e-05,-2.6e-05,-0.000159,-5.1e-05
28072160.0,0.000402,6.2e-05,8.9e-05,-2.7e-05,-1.4e-05,-1.4e-05,-8.4e-05,-2.7e-05
52983170.0,0.000213,3.3e-05,4.7e-05,-1.4e-05,-8e-06,-7e-06,-4.5e-05,-1.4e-05
100000000.0,0.000113,1.7e-05,2.5e-05,-8e-06,-4e-06,-4e-06,-2.4e-05,-8e-06


In [8]:
X = wdf[features].values
y = wdf['target'].values.reshape(-1,1)

scores = []
nChunks = 3
hChunks = ['#{0}'.format(c) for c in range(nChunks)]


for d2 in np.logspace(0,7):    
    ridge = Ridge(alpha=d2)
    score = cross_val_score(ridge, X=X, y=y, cv=nChunks, scoring='r2')
    score = [d2] + score.tolist()
    scores.append(score)

columns=['d2'] + hChunks
scores = pd.DataFrame(scores, columns=columns)
display(scores.head(),HTML('<hr>'))

traces=[]
traces.append(plygo.Scatter(x=scores['d2'].values, y=np.mean(scores[hChunks].values, axis=1), name='Score', mode="lines", opacity=.7))

data = plygo.Data(traces)

layout = plygo.Layout(
    xaxis=dict(
        title='d^2',
        type='log',
        #autorange='reversed',    
    )
)
fig = plygo.Figure(data=data,layout=layout)
ply.iplot(fig)


Unnamed: 0,d2,#0,#1,#2
0,1.0,0.555051,0.588394,0.585385
1,1.389495,0.555063,0.5884,0.585361
2,1.930698,0.555079,0.588409,0.585327
3,2.682696,0.555101,0.58842,0.585281
4,3.727594,0.555132,0.588436,0.585216


# Conclusions

# References