In [667]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

In [668]:
data = pd.read_csv("../data/train.csv")

##### Selecting GarageArea, GarageCars, OverallQual, GrLivArea for training and testing respective of SalePrice

In [669]:
X = data[['GarageArea', 'GarageCars', 'OverallQual','GrLivArea']]
y = data['SalePrice'].values.reshape(-1,1)

In [670]:
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder()
encoder_df_garage_cars = pd.DataFrame(oh_encoder.fit_transform(X[['GarageCars']]).toarray()).add_prefix('Cars_')
encoder_df_overall_qual = pd.DataFrame(oh_encoder.fit_transform(X[['OverallQual']]).toarray()).add_prefix('Qual_')

X = X.join(encoder_df_garage_cars)
X = X.join(encoder_df_overall_qual)
X = X.drop(['GarageCars', 'OverallQual'], axis=1)
X.shape

(1460, 17)

## Model training 

#### Trainining a multiple linear regression with these four features i.e 2 Categorical and 2 Continuous . Splitting the data using sklearn train_test_split, taking test_size 25% with random_state of 0

In [671]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

#### Scaling

In [672]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['GarageArea', 'GrLivArea']] = scaler.fit_transform(X_train[['GarageArea', 'GrLivArea']])
X_train[['GarageArea', 'GrLivArea']].head(5)

Unnamed: 0,GarageArea,GrLivArea
1292,-0.190341,1.647481
1018,-0.332953,-0.095211
1213,-0.513596,-1.086609
1430,-0.475566,0.613484
810,0.056854,-0.410831


###### Model Fitting Using LinearRegression

In [673]:
reg_multiple = LinearRegression()
reg_multiple.fit(X_train, y_train)
 
print('The interception is' , reg_multiple.intercept_)
print('The coefficient is' , reg_multiple.coef_)

The interception is [190469.28003864]
The coefficient is [[  9926.68784599  25446.41892702  -8672.57931505  -5502.38002144
    1290.92845208  22157.78208399  -9273.75119958 -64919.3810503
  -71843.7864603  -71953.21304297 -51986.49360895 -37207.87365953
  -23645.0753255    3269.44554997  43888.14331401 129797.49118382
  144600.74309975]]


### Model Evaluation

### Scaling the test data

In [674]:
X_test[['GarageArea', 'GrLivArea']] = scaler.fit_transform(X_test[['GarageArea', 'GrLivArea']])
X_test[['GarageArea', 'GrLivArea']].head(5)

Unnamed: 0,GarageArea,GrLivArea
529,0.036686,1.846234
491,-1.054953,0.144657
459,-0.553873,-0.536337
279,0.130639,0.950954
655,-0.947579,-0.737911


####  Making Predictions

In [675]:
y_pred = reg_multiple.predict(X_test)
y_pred[:10]

array([[215459.34046956],
       [154530.63078581],
       [128613.04436958],
       [220524.8287364 ],
       [133138.30415778],
       [117051.36712978],
       [267283.4619021 ],
       [158650.98680139],
       [509801.70850714],
       [176901.08331828]])

### Model Evaluation

In [676]:
def compute_rmsle(y_val: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
    return round(rmsle, precision)

In [677]:
compute_rmsle(y_test, y_pred)

0.19