In [12]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Columns removed from the feature matrix (silently skipped when absent).
NON_FEATURE_COLUMNS = ['id', 'date', 'zipcode', 'price']
# Column used as the regression target.
TARGET_COLUMN = 'price'
# The target is stored as price / PRICE_SCALE, so every error metric and
# coefficient computed from it is in those rescaled units.
PRICE_SCALE = 1000


def _load_split(csv_path):
    """Read one CSV split and drop every column whose name starts with "unnamed".

    The match is case-insensitive. Such columns are presumably index columns
    left behind by an earlier ``to_csv`` call (e.g. ``Unnamed: 0``) -- TODO confirm.
    """
    frame = pd.read_csv(csv_path)
    return frame.loc[:, ~frame.columns.str.contains("^unnamed", case=False)]


def _split_features_target(frame):
    """Return ``(features, target)`` for one split.

    ``features`` is ``frame`` without ``NON_FEATURE_COLUMNS``; ``target`` is
    the ``TARGET_COLUMN`` column divided by ``PRICE_SCALE``.

    Raises:
        KeyError: if ``frame`` has no ``TARGET_COLUMN`` column.
    """
    features = frame.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
    target = frame[TARGET_COLUMN] / PRICE_SCALE
    return features, target


# NOTE(review): housing_data is not used by the cells visible here; it is kept
# because a later cell may rely on it.
housing_data = pd.read_csv('kc_house_data.csv')
test_data = _load_split('test.csv')
train_data = _load_split('train.csv')

# Both splits go through the same helper so their preprocessing cannot drift.
X_train, Y_train = _split_features_target(train_data)
X_test, Y_test = _split_features_target(test_data)

# The scaler and the model are fitted on X_train's columns, so the test matrix
# must expose exactly the same features in exactly the same order. Fail early
# with a message naming the offending columns instead of scaling the wrong
# column with the wrong statistics (or hitting a less specific error later).
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
unexpected_in_test = [col for col in X_test.columns if col not in X_train.columns]
if missing_in_test or unexpected_in_test:
    raise ValueError(
        "test.csv features do not match train.csv: "
        f"missing={missing_in_test}, unexpected={unexpected_in_test}"
    )
X_test = X_test[X_train.columns]

# Fit the scaling statistics on the training split only and reuse them for the
# test split, so no information from the test data enters preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
def _score_split(estimator, split_label, features, target):
    """Predict on one split, print its MSE and R^2, and return the results.

    Args:
        estimator: fitted estimator exposing ``predict``.
        split_label: text placed in front of the metric names in the printout
            (e.g. ``'Training'`` or ``'Test'``).
        features: feature matrix passed to ``estimator.predict``.
        target: true target values compared against the predictions.

    Returns:
        Tuple ``(predictions, mse, r2)``.
    """
    predictions = estimator.predict(features)
    mse = mean_squared_error(target, predictions)
    r2 = r2_score(target, predictions)
    # The label string ends with a space and print() inserts another one as
    # separator, which reproduces the original two-space output format.
    print(f'{split_label} MSE: ', mse)
    print(f'{split_label} R^2: ', r2)
    return predictions, mse, r2


model = LinearRegression()
model.fit(X_train_scaled, Y_train)

# Same evaluation routine for both splits, so the two reports stay comparable.
y_train_pred, train_mse, train_r2 = _score_split(
    model, 'Training', X_train_scaled, Y_train
)
y_test_pred, test_mse, test_r2 = _score_split(
    model, 'Test', X_test_scaled, Y_test
)

# The model was fitted on the scaled matrix, so each coefficient is expressed
# per unit of the scaled feature rather than per raw unit. Labelling with
# X_train.columns relies on X_train_scaled keeping X_train's column order.
coefficients = pd.Series(model.coef_, index=X_train.columns)
print(coefficients)


Training MSE:  31486.16777579488
Training R^2:  0.7265334318706018
Test MSE:  57628.154705670386
Test R^2:  0.6543560876120954
bedrooms        -12.521962
bathrooms        18.527633
sqft_living      56.748837
sqft_lot         10.881868
floors            8.043721
waterfront       63.742900
view             48.200109
condition        12.964269
grade            92.231475
sqft_above       48.290089
sqft_basement    27.137032
yr_built        -67.643117
yr_renovated     17.271380
lat              78.375737
long             -1.035203
sqft_living15    45.577658
sqft_lot15      -12.930091
dtype: float64
