<a href="https://colab.research.google.com/github/jburchfield76/datasharing/blob/master/Deep_Learning_CA_Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from sklearn.datasets import fetch_california_housing #diff from book code, ethical concerns with Boston df
housing = fetch_california_housing()    #may use Boston in future, along with study links, to understand ethical concerns

In [5]:
from sklearn.preprocessing import scale

In [6]:
X, y = scale(housing.data), housing.target

In [7]:
from sklearn.linear_model import LinearRegression

regression = LinearRegression()
regression.fit(X, y)

print('R2 %0.3f' % regression.score(X, y))

R2 0.606


In [8]:
print([a + ':' + str(round(b, 1)) for a, b in
       zip(housing.feature_names, regression.coef_)])

['MedInc:0.8', 'HouseAge:0.1', 'AveRooms:-0.3', 'AveBedrms:0.3', 'Population:-0.0', 'AveOccup:-0.0', 'Latitude:-0.9', 'Longitude:-0.9']


In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
lbl = LabelEncoder()
enc = OneHotEncoder()
qualitative = ['red', 'red', 'green', 'blue',
               'red', 'blue', 'blue', 'green']
labels = lbl.fit_transform(qualitative).reshape(8,1)
print(enc.fit_transform(labels).toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection  import train_test_split
from sklearn.metrics import r2_score

pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(poly_X,
                    y, test_size=0.33, random_state=42)

from sklearn.linear_model import Ridge #the below comment is changed code from book
# The 'normalize' parameter has been removed. If you need to normalize your data,
# you should do it before fitting the model using StandardScaler or MinMaxScaler.
reg_regression = Ridge(alpha=0.1)
reg_regression.fit(X_train,y_train)
print ('R2: %0.3f' % r2_score(y_test,reg_regression.predict(X_test)))

R2: 0.659


In [12]:
import numpy as np

a = np.array([0, 0, 0, 0, 1, 1, 1, 1])
b = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(8,1)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(b,a)
print (regression.predict(b)>0.5)

[False False False False  True  True  True  True]


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Adjust the threshold to create a more balanced binary target variable
binary_y = np.array(y >= np.median(y)).astype(int)  # DIFF FROM BOOK VALUE Use median as threshold

# Alternatively, explore other ways to create a binary target variable with sufficient samples in each class

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    binary_y,
                                                    test_size=0.33,
                                                    random_state=5)
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
from sklearn.metrics import accuracy_score

print('In-sample accuracy: %0.3f' %
      accuracy_score(y_train, logistic.predict(X_train)))
print('Out-of-sample accuracy: %0.3f' %
      accuracy_score(y_test, logistic.predict(X_test)))

In-sample accuracy: 0.838
Out-of-sample accuracy: 0.841


In [15]:
for var,coef in zip(housing.feature_names,
                    logistic.coef_[0]):
        print ("%7s : %7.3f" %(var, coef))

 MedInc :   2.479
HouseAge :   0.259
AveRooms :  -0.800
AveBedrms :   0.971
Population :   0.131
AveOccup :  -5.711
Latitude :  -3.632
Longitude :  -3.390


In [16]:
print('\nclasses:',logistic.classes_)
print('\nProbs:\n',logistic.predict_proba(X_test)[:3,:])


classes: [0 1]

Probs:
 [[0.69845053 0.30154947]
 [0.46476874 0.53523126]
 [0.96476771 0.03523229]]


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X_train, X_test, y_train, y_test = train_test_split(X,
                y, test_size=0.33, random_state=42)
check = [2**i for i in range(8)]
for i in range(2**7+1):
    X_train = np.column_stack((X_train,np.random.random(
        X_train.shape[0])))
    X_test = np.column_stack((X_test,np.random.random(
        X_test.shape[0])))
    regression.fit(X_train, y_train)
    if i in check:
        print ("Random features: %i -> R2: %0.3f" %
               (i, r2_score(y_train,regression.predict(X_train))))

Random features: 1 -> R2: 0.609
Random features: 2 -> R2: 0.609
Random features: 4 -> R2: 0.609
Random features: 8 -> R2: 0.610
Random features: 16 -> R2: 0.610
Random features: 32 -> R2: 0.610
Random features: 64 -> R2: 0.611
Random features: 128 -> R2: 0.613


In [18]:
regression.fit(X_train, y_train)
print ('R2 %0.3f'
   % r2_score(y_test,regression.predict(X_test)))
# Please notice that the R2 result may change from run to
# run due to the random nature of the experiment

R2 0.592


In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split #below comment-removed normalize=True from book code
from sklearn.preprocessing import StandardScaler # Import StandardScaler for normalization

pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(poly_X,
                    y, test_size=0.33, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import Ridge
reg_regression = Ridge(alpha=0.1) # Remove normalize=True
reg_regression.fit(X_train,y_train)
print ('R2: %0.3f'
   % r2_score(y_test,reg_regression.predict(X_test)))

R2: 0.662


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

# reset X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X,
                    y, test_size=0.33, random_state=42)

SGD = SGDRegressor(penalty=None,
                   learning_rate='invscaling',
                   eta0=0.01, power_t=0.25,
                   max_iter=5, tol=None)

power = 17
check = [2**i for i in range(power+1)]
for i in range(400):
    for j in range(X_train.shape[0]): #below is changed from book-shape of df is now different
        # Reshape using the actual number of features in X_train
        SGD.partial_fit(X_train[j,:].reshape(1, X_train.shape[1]),
                        y_train[j].reshape(1,))
        count = (j+1) + X_train.shape[0] * i
        if count in check:
            R2 = r2_score(y_test,SGD.predict(X_test))
            print ('Example %6i R2 %0.3f coef: %s' %
            (count, R2, ' '.join(map(lambda x:'%0.3f' %x, SGD.coef_))))

Example      1 R2 -3.209 coef: -0.018 -0.009 -0.011 0.001 -0.007 -0.001 -0.009 0.008
Example      2 R2 -3.112 coef: -0.013 0.001 -0.002 0.000 -0.019 -0.003 0.017 -0.033
Example      4 R2 -3.056 coef: -0.019 0.017 -0.007 -0.000 -0.023 -0.000 0.004 -0.022
Example      8 R2 -2.937 coef: -0.015 0.006 -0.013 -0.005 0.020 -0.001 0.023 -0.051
Example     16 R2 -2.701 coef: -0.035 0.044 -0.023 -0.009 0.003 -0.000 0.017 -0.053
Example     32 R2 -2.286 coef: -0.006 0.096 -0.012 -0.009 0.013 -0.006 -0.020 -0.018
Example     64 R2 -1.610 coef: 0.151 0.130 0.005 -0.022 -0.001 -0.015 -0.009 -0.053
Example    128 R2 -0.938 coef: 0.250 0.167 0.017 -0.029 0.005 -0.024 -0.080 -0.003
Example    256 R2 -0.216 coef: 0.412 0.216 0.070 -0.037 0.069 -0.037 -0.116 -0.009
Example    512 R2 0.298 coef: 0.579 0.194 0.080 -0.033 0.043 -0.058 -0.134 -0.082
Example   1024 R2 0.502 coef: 0.685 0.200 0.036 -0.039 0.043 -0.054 -0.198 -0.135
Example   2048 R2 0.547 coef: 0.776 0.204 0.021 0.001 0.049 -0.087 -0.239 -0.25

KeyboardInterrupt: 