In [1]:
import pandas as pd
import numpy as np

In [2]:
file = "kc_house_data.csv"
data = pd.read_csv(file)

In [3]:
feature = data.drop(columns=['price'])

In [4]:
columns = ['bedrooms', 'bathrooms', 
           'sqft_living', 'sqft_lot', 'floors', 
           'waterfront', 'view', 'condition', 
           'grade', 'sqft_above', 'sqft_basement', 
           'yr_built', 'yr_renovated', 
           'lat', 'long', 'sqft_living15', 'sqft_lot15']

## Question 1

In [5]:
for c in columns:
    col = feature[c]
    print(f'Column {c} summary:')
    print(f'- maximum {np.amax(col)}')
    print(f'- minimum {np.amin(col)}')
    print(f'- average {np.average(col)}')
    print(f'- variance {np.var(col)}')
    print('\n')

Column bedrooms summary:
- maximum 33
- minimum 0
- average 3.37084162309721
- variance 0.8649749868540167


Column bathrooms summary:
- maximum 8.0
- minimum 0.0
- average 2.1147573219821405
- variance 0.5931238445451253


Column sqft_living summary:
- maximum 13540
- minimum 290
- average 2079.8997362698374
- variance 843494.6523725765


Column sqft_lot summary:
- maximum 1651359
- minimum 520
- average 15106.967565816869
- variance 1715579393.3040423


Column floors summary:
- maximum 3.5
- minimum 1.0
- average 1.4943089807060566
- variance 0.2915745155520679


Column waterfront summary:
- maximum 1
- minimum 0
- average 0.007541757275713691
- variance 0.007484879172907909


Column view summary:
- maximum 4
- minimum 0
- average 0.23430342849211122
- variance 0.5872154461720236


Column condition summary:
- maximum 5
- minimum 1
- average 3.4094295100171195
- variance 0.4234469192550091


Column grade summary:
- maximum 13
- minimum 1
- average 7.656873178179799
- variance 1.381639

In [6]:
corr = data.corr()['price']
for c in columns:
    print(f'Correlation price vs. {c}: {corr[c]}')

Correlation price vs. bedrooms: 0.3083495981456364
Correlation price vs. bathrooms: 0.5251375054139724
Correlation price vs. sqft_living: 0.7020350546118009
Correlation price vs. sqft_lot: 0.08966086058710003
Correlation price vs. floors: 0.25679388755070176
Correlation price vs. waterfront: 0.26636943403055346
Correlation price vs. view: 0.3972934882944871
Correlation price vs. condition: 0.03636178912899409
Correlation price vs. grade: 0.667434256020255
Correlation price vs. sqft_above: 0.6055672983560842
Correlation price vs. sqft_basement: 0.323816020712004
Correlation price vs. yr_built: 0.05401153149478604
Correlation price vs. yr_renovated: 0.12643379344092243
Correlation price vs. lat: 0.307003479995218
Correlation price vs. long: 0.02162624103930622
Correlation price vs. sqft_living15: 0.5853789035795697
Correlation price vs. sqft_lot15: 0.08244715251948594


In [7]:
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()

In [8]:
print(data['price'].shape)
print(feature.shape)

(21613,)
(21613, 20)


In [9]:
print(feature.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 20 columns):
id               21613 non-null int64
date             21613 non-null object
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(4), int64(15), object(1)
memory usage: 3.3+ MB
None


In [10]:
train = pd.read_csv('train.csv')
lreg.fit(train.drop(columns=['Unnamed: 0', 'price', 'zipcode']), train['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

price_predicted = lreg.predict(train.drop(columns=['Unnamed: 0', 'price', 'zipcode']))

mse = mean_squared_error(train['price'], price_predicted)
rse = np.sqrt(mse)
r2 = r2_score(train['price'], price_predicted)

print(f'MSE {mse}')
print(f'RSE {rse}')
print(f'R2 {r2}')

MSE 31486167775.794903
RSE 177443.42133704168
R2 0.7265334318706016


In [12]:
print(f'Coefficients for linear regression\n {lreg.coef_}')
print(f'Coeff length {len(lreg.coef_)}')

Coefficients for linear regression
 [-1.47042805e+04  2.56877840e+04  8.30842103e+01  3.75929764e-01
  1.55555810e+04  7.15535170e+05  6.30278980e+04  1.88164028e+04
  7.95346027e+04  4.20104951e+01  4.10737151e+01 -2.40066933e+03
  4.36829418e+01  5.53505032e+05 -7.42402712e+03  6.80157923e+01
 -5.15527568e-01]
Coeff length 17


In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)

In [14]:
scaler.fit_transform(train)

array([[-1.73031962, -0.87974769, -0.40982347, ..., -0.35519332,
        -0.96563661, -0.3128578 ],
       [-1.72685552,  0.05182493, -0.40982347, ..., -0.7998304 ,
        -0.44332966, -0.23355563],
       [-1.72339142, -1.00323042, -1.58410276, ..., -0.18307573,
         1.09374507, -0.21669046],
       ...,
       [ 1.72339142,  0.0975047 , -1.58410276, ..., -0.86437449,
        -1.02532883, -0.4185143 ],
       [ 1.72685552, -0.97390696, -1.58410276, ...,  1.07911986,
        -0.80148299, -0.40352304],
       [ 1.73031962, -0.68199849, -0.40982347, ..., -0.46993837,
         0.39236146, -0.15736334]])

In [15]:
print(scaler.mean_)

[ 5.00500000e+02  5.20414834e+05  3.34900000e+00  2.04575000e+00
  2.05119600e+03  1.47020850e+04  1.44650000e+00  8.00000000e-03
  2.37000000e-01  3.46400000e+00  7.60600000e+00  1.75033300e+03
  3.00863000e+02  1.96904900e+03  8.17490000e+01  9.80743680e+04
  4.75494927e+01 -1.22207472e+02  1.98707700e+03  1.34968740e+04]


In [16]:
print(scaler.var_)

[8.33332500e+04 1.15137174e+11 7.25199000e-01 5.20219437e-01
 7.87629886e+05 8.37902562e+08 2.67387750e-01 7.93600000e-03
 5.84831000e-01 4.74704000e-01 1.34476400e+00 6.23598196e+05
 2.03105874e+05 7.93930599e+02 1.56325670e+05 2.76093058e+03
 2.00502655e-02 1.94433912e-02 4.49039437e+05 6.29070578e+08]


In [17]:
feature = train.drop(columns=['Unnamed: 0', 'price', 'zipcode'])
scaled = scaler.fit_transform(feature)
lreg.fit(feature, train['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
predicted = lreg.predict(scaled)

In [19]:
mse = mean_squared_error(predicted, train['price'])
rse = np.sqrt(mse)
r2 = r2_score(predicted, train['price'])

print(f'Scaled MSE {mse}')
print(f'Scaled rse {rse}')
print(f'Scaled R2 {r2}')

Scaled MSE 558042575104906.6
Scaled rse 23622924.778801344
Scaled R2 -631.7338415017563


In [20]:
help(lreg.fit)

Help on method fit in module sklearn.linear_model.base:

fit(X, y, sample_weight=None) method of sklearn.linear_model.base.LinearRegression instance
    Fit linear model.
    
    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples,n_features]
        Training data
    
    y : numpy array of shape [n_samples, n_targets]
        Target values. Will be cast to X's dtype if necessary
    
    sample_weight : numpy array of shape [n_samples]
        Individual weights for each sample
    
        .. versionadded:: 0.17
           parameter *sample_weight* support to LinearRegression.
    
    Returns
    -------
    self : returns an instance of self.



In [21]:
test = pd.read_csv('test.csv')
nonScaled = LinearRegression()
scaled = LinearRegression()
dropped = train.drop(columns=['Unnamed: 0', 'zipcode', 'price'])
targettedY = train['price']

In [22]:
nonScaled.fit(dropped, targettedY)
scaled.fit(scaler.fit_transform(dropped), targettedY)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
def printMetrics(linreg):
    testDrop = test.drop(columns=['Unnamed: 0', 'zipcode', 'price', 'date', 'id'])
    testYPredict = linreg.predict(testDrop)
    targettedYTest = test['price']
    
    mse = mean_squared_error(testYPredict, targettedYTest)
    r2 = r2_score(testYPredict, targettedYTest)
    
    print(f'MSE score {mse}')
    print(f'RSE score {np.sqrt(mse)}')
    print(f'R2 score {r2}')

In [24]:
printMetrics(nonScaled)

MSE score 57628154705.66816
RSE score 240058.64847088547
R2 score 0.3538380835910615


In [25]:
printMetrics(scaled)

MSE score 2.3173058704076886e+17
RSE score 481384032.80620855
R2 score -0.1522352461656029


## Question 3: Linear Regression - Closed form

### 3.1 sqft_living and price

In [26]:
x_train = train['sqft_living']
y_train = train['price']

In [27]:
x_hat = sum(x_train)/len(x_train)
y_hat = sum(y_train)/len(y_train)

In [28]:
theta_1 = sum([(x - x_hat)*(y - y_hat) for (x,y) in zip(x_train,y_train)]) / sum([(x - x_hat)**2 for x in x_train])
theta_0 = y_hat - theta_1*x_hat

In [29]:
theta_1, theta_0

(269.46205468469446, -32304.6547210266)

In [30]:
x_test = test['sqft_living']
y_test = test['price']

In [31]:
y_pred = x_test * theta_1 + theta_0

In [32]:
mse = mean_squared_error(y_pred, y_test)
r2 = r2_score(y_pred,y_test)

In [34]:
print(f'MSE {mse}')
print(f'RMSE {np.sqrt(mse)}')
print(f'R2 {r2}')

MSE 88575978543.09607
RMSE 297617.16775598825
R2 -0.38581772448215945


### 3.2 Multiple linear regression

In [35]:
x_train = train.drop(columns=['Unnamed: 0','price', 'zipcode'])
x_train['bias'] = 1
y_train = np.array([train['price']]).T

In [36]:
print(x_train.shape, y_train.shape)

(1000, 18) (1000, 1)


In [37]:
# x_transpose = np.transpose(x_train)
# p1 = np.linalg.inv(np.matmul(x_transpose, x_train))
# theta = np.matmul(np.matmul(p1, x_transpose), y_train)

transpose = x_train.transpose()
theta = np.linalg.pinv(transpose.dot(x_train)).dot(transpose).dot(y_train)

In [38]:
print(theta.shape)
print(theta.T.shape)

(18, 1)
(1, 18)


In [39]:
x_test = test.drop(columns=['Unnamed: 0', 'zipcode', 'price', 'date', 'id'])
x_test['bias'] = 1
y_test = test['price']

In [40]:
y_predict = x_test.dot(theta)
mse = mean_squared_error(y_predict, y_test)
rse = np.sqrt(mse)
r2 = r2_score(y_predict, y_test)

In [41]:
print(f'Mean square error {mse}')
print(f'Root mean square error {rse}')
print(f'R2 error {r2}')

Mean square error 58089533328.27369
Root mean square error 241017.7033503425
R2 error 0.35001837493692867


## 4 Gradient descent

In [46]:
train_normal = train.drop(columns=['zipcode', 'price'])

In [48]:
train_normal['Unnamed: 0'] = 1
train_normal.shape

(1000, 18)

In [49]:
std_scale = StandardScaler()

In [50]:
train_normal_scaled = std_scale.fit_transform(train_normal)

In [51]:
train_normal_scaled.shape

(1000, 18)

In [53]:
print(train_normal_scaled)

[[ 0.         -0.40982347 -1.44988843 ... -0.35519332 -0.96563661
  -0.3128578 ]
 [ 0.         -0.40982347  0.28318404 ... -0.7998304  -0.44332966
  -0.23355563]
 [ 0.         -1.58410276 -1.44988843 ... -0.18307573  1.09374507
  -0.21669046]
 ...
 [ 0.         -1.58410276 -1.44988843 ... -0.86437449 -1.02532883
  -0.4185143 ]
 [ 0.         -1.58410276 -1.44988843 ...  1.07911986 -0.80148299
  -0.40352304]
 [ 0.         -0.40982347 -0.06343045 ... -0.46993837  0.39236146
  -0.15736334]]


In [54]:
theta = np.random.rand(train_normal_scaled.shape[1], 1)

In [57]:
theta
print(theta[1])

[0.3930459]


In [68]:
def deltaMse(theta, j, x, y):
    row, col = theta.shape
    total = 0
    for i in range(0, row):
        s = 0
        for k in range(0, col):
            s += theta[j][0] * x[k][i]
        s -= y[i]
        total += s * x[j][i]
    return 2/row * total
    
def descent(alpha, theta, x, y, epsilon=0.001):
    old = theta
    
    row, col = theta.shape
    
    while (True):
        for j in range(0, col):
            theta[j][0] = theta[j][0] - alpha * deltaMse(theta, j, x, y)
        
        norm = np.linalg.norm(old - theta)
        if 0 < norm and norm < epsilon:
            return theta
        else:
            print(theta.T)
            old = theta
    
    return theta

def gradient_descent(x, y, theta, iterations, alpha):
    past_costs = []
    past_thetas = [theta]
    m = y.shape[]
    print(m)
    for i in range(iterations):
        prediction = np.dot(x, theta)
        error = prediction - y
        cost = 1/m * np.dot(error.T, error)
        past_costs.append(cost)
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        past_thetas.append(theta)
        
    return past_thetas, past_costs


In [69]:
past_thetas, past_costs = gradient_descent(x_train, y_train, theta, 2000, 0.01)

1


In [67]:
past_thetas[-1]

array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan]])