## Question 2

In [None]:
pip install -r requirements.txt

In [1]:
## import packages
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
## Read data
df = pd.read_csv("train.csv")

df = df.drop(columns=["zipcode"])

print(df.info())

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1000 non-null   int64  
 1   price          1000 non-null   float64
 2   bedrooms       1000 non-null   int64  
 3   bathrooms      1000 non-null   float64
 4   sqft_living    1000 non-null   int64  
 5   sqft_lot       1000 non-null   int64  
 6   floors         1000 non-null   float64
 7   waterfront     1000 non-null   int64  
 8   view           1000 non-null   int64  
 9   condition      1000 non-null   int64  
 10  grade          1000 non-null   int64  
 11  sqft_above     1000 non-null   int64  
 12  sqft_basement  1000 non-null   int64  
 13  yr_built       1000 non-null   int64  
 14  yr_renovated   1000 non-null   int64  
 15  lat            1000 non-null   float64
 16  long           1000 non-null   float64
 17  sqft_living15  1000 non-null   int64  
 18  sqft_lot15     1000 

In [3]:
## Standardize features and scale price

columns = df.drop(columns=["price"]).columns.tolist()
#print(columns)
df[columns] = (df[columns] - df[columns].mean()) / df[columns].std()
df["price"] = df["price"] / 1000

print(df.describe().round(2))

       Unnamed: 0    price  bedrooms  bathrooms  sqft_living  sqft_lot  \
count     1000.00  1000.00   1000.00    1000.00      1000.00   1000.00   
mean        -0.00   520.41     -0.00       0.00         0.00      0.00   
std          1.00   339.49      1.00       1.00         1.00      1.00   
min         -1.73    80.00     -3.93      -2.83        -1.88     -0.49   
25%         -0.86   309.45     -0.41      -0.76        -0.72     -0.32   
50%          0.00   435.00     -0.41      -0.06        -0.17     -0.23   
75%          0.86   633.69      0.76       0.63         0.47     -0.11   
max          1.73  3075.00      4.29       4.09         4.53     10.38   

        floors  waterfront     view  condition    grade  sqft_above  \
count  1000.00     1000.00  1000.00    1000.00  1000.00     1000.00   
mean      0.00       -0.00    -0.00       0.00     0.00       -0.00   
std       1.00        1.00     1.00       1.00     1.00        1.00   
min      -0.86       -0.09    -0.31      -3.57   

## Question 2.1

In [4]:
## Create model

model = LinearRegression()
X_train = df.drop(columns=["price"])
y_train = df["price"]

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)

#print(y_train_pred)

In [6]:
## Gather metrics

coeff = model.coef_

print("Feature Coefficients:")
for feature, coef in zip(X_train.columns, coeff):
    print(f"    {feature}: {coef.round(2)}")

train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

print("Training MSE:", train_mse)
print("Training R^2:", train_r2)


Feature Coefficients:
    Unnamed: 0: 8.46
    bedrooms: -12.81
    bathrooms: 18.47
    sqft_living: 57.19
    sqft_lot: 11.13
    floors: 8.16
    waterfront: 64.26
    view: 47.63
    condition: 12.65
    grade: 92.56
    sqft_above: 48.46
    sqft_basement: 27.7
    yr_built: -68.08
    yr_renovated: 17.35
    lat: 78.17
    long: -1.44
    sqft_living15: 45.5
    sqft_lot15: -12.91
Training MSE: 31415.74791610087
Training R^2: 0.7271450489303788


## Question 2.2

In [7]:
## load and scale test data

df_test = pd.read_csv("test.csv")

#print(df_test.info)

df_test = df_test.drop(columns=["zipcode", "id", "date"])

columns = df_test.drop(columns=["price"]).columns.tolist()
#print(columns)
df_test[columns] = (df_test[columns] - df_test[columns].mean()) / df_test[columns].std()
df_test["price"] = df_test["price"] / 1000

print(df_test.describe().round(2))

       Unnamed: 0    price  bedrooms  bathrooms  sqft_living  sqft_lot  \
count     1000.00  1000.00   1000.00    1000.00      1000.00   1000.00   
mean        -0.00   541.68     -0.00       0.00        -0.00     -0.00   
std          1.00   408.53      1.00       1.00         1.00      1.00   
min         -1.73    75.00     -2.60      -2.65        -1.77     -0.26   
25%         -0.86   325.00     -0.45      -0.76        -0.68     -0.18   
50%          0.00   453.38     -0.45       0.19        -0.19     -0.14   
75%          0.86   632.75      0.62       0.50         0.47     -0.08   
max          1.73  5350.00      4.91       4.91         6.30     27.87   

        floors  waterfront     view  condition    grade  sqft_above  \
count  1000.00     1000.00  1000.00    1000.00  1000.00     1000.00   
mean     -0.00        0.00    -0.00       0.00    -0.00        0.00   
std       1.00        1.00     1.00       1.00     1.00        1.00   
min      -0.85       -0.11    -0.33      -3.69   

In [8]:
## Evaluate model on test data

X_test = df_test.drop(columns=["price"])
y_test = df_test["price"]

y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Testing MSE:", test_mse)
print("Testing R^2:", test_r2)

Testing MSE: 59887.87242845715
Testing R^2: 0.6408026834022006


## Question 2.3

Looking at the feature coefficients, we can see that the largest positive correlations come from sqft_above, long, and view, telling us that each of these features increase the price of a house. The strongest negative correlation comes from yr_renovated. For our training data, we can see that the R^2 value is 0.727, telling us that 72% of the variance in price is explained by the model, which is a pretty strong fit. When looking at the testing data, the R^2 goes down to 0.640, telling us that only 64% of the variance in testing price is explained the model. This is slightly worse, but only by 0.08, which is still pretty good. Taking a look at the MSE, we can see if goes from around 30,000 in the training data to 60,0000 in the testing data, pratically doubling. If we take the square root of these values, we get 173 and 245 for training and testing respectively. This is showing that on average the model is wrong by around 200, or 200,000 dollars scaled up. For the scale of house prices, this is not very far off showing that our model is fairly accurate at predicting house price.