In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
import joblib

In [2]:
# Read the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/train.csv")

In [3]:
def select_clean_data(data):
    """Select the model features (and the target, when present) from a raw frame.

    Rows with a missing value in any of the four feature columns are dropped.
    The same rows are dropped from the target so that ``X`` and ``y`` always
    have the same length and stay row-aligned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'GarageArea', 'GarageCars', 'OverallQual' and 'GrLivArea'.
        May contain 'SalePrice'.

    Returns
    -------
    X : pandas.DataFrame
        The four feature columns with a fresh 0..n-1 index. Returned alone when
        ``data`` has no 'SalePrice' column.
    (X, y) : tuple
        When 'SalePrice' is present; ``y`` is an array of shape (n, 1).
    """
    feature_columns = ['GarageArea', 'GarageCars', 'OverallQual', 'GrLivArea']

    # One mask shared by features and target keeps them aligned after dropping rows.
    complete_rows = data[feature_columns].notna().all(axis=1)
    X = data.loc[complete_rows, feature_columns].reset_index(drop=True)

    if 'SalePrice' not in data.columns:
        return X

    y = data.loc[complete_rows, 'SalePrice'].values.reshape(-1, 1)
    return X, y

## Model training 

In [4]:
# Thin wrapper around train_test_split with a fixed argument order
def train_test_split_data(X_feature,y_feature,size,random):
    """Split features and target into train and test parts.

    ``size`` is forwarded as ``test_size`` and ``random`` as ``random_state``.
    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X_feature,
        y_feature,
        test_size=size,
        random_state=random,
    )
    # Callers unpack four values; keep returning a tuple.
    return tuple(parts)

#### Encoding and scaling

In [5]:
# Shared one-hot encoder: dense output, categories unseen at fit time are ignored.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    oh_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    
# Fit the shared encoder on two columns and replace them with their indicator columns
def encoding_with_one_hot_encoder(data,feature_1 ,feature_2):
    """Fit ``oh_encoder`` on two columns of ``data`` and one-hot encode them.

    The module-level ``oh_encoder`` is (re)fitted as a side effect. The returned
    frame is ``data`` with the encoded columns joined on the same index and the
    two source columns dropped.
    """
    categorical_cols = [feature_1, feature_2]
    subset = data[categorical_cols]

    oh_encoder.fit(subset)
    encoded_values = oh_encoder.transform(subset)
    encoded_names = oh_encoder.get_feature_names_out(input_features=categorical_cols)

    # Reuse the caller's index so the join lines rows up one-to-one.
    encoded_frame = pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
    return data.join(encoded_frame).drop(categorical_cols, axis=1)


In [6]:
# Shared StandardScaler: fitted in scaling_with_standard_scalar, reused by
# scaling_with_standard_scalar_with_transform, and saved with joblib later on.
scaler = StandardScaler()

# Fit the shared scaler on two columns and return a scaled copy of the frame
def scaling_with_standard_scalar(data,feature_1,feature_2):
    """Fit ``scaler`` on two columns of ``data`` and standardize them.

    The module-level ``scaler`` is (re)fitted as a side effect. The input frame
    is left untouched: a copy is made before the scaled values are written, so
    slices returned by ``train_test_split`` are not modified in place.

    Returns the copy with ``feature_1`` and ``feature_2`` replaced by their
    scaled values; all other columns are unchanged.
    """
    numeric_cols = [feature_1, feature_2]
    scaled = data.copy()
    scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    return scaled


In [7]:
# Shared LinearRegression instance: fitted by train_linear_regression, used by
# predict_data, and saved with joblib later in the notebook.
reg_multiple = LinearRegression()

def train_linear_regression(X_data,y_data):
    """Fit the module-level ``reg_multiple`` on ``X_data`` / ``y_data``.

    Returns whatever ``reg_multiple.fit`` returns.
    """
    return reg_multiple.fit(X_data, y_data)

### Model Evaluation

In [8]:
## Transforming with the already-fitted one-hot encoder

def encoding_with_one_hot_encoder_with_transform(data,feature_1 ,feature_2):
    """One-hot encode two columns of ``data`` using the already-fitted ``oh_encoder``.

    The encoder is only applied here, never refitted. The returned frame is
    ``data`` with the encoded columns joined on the same index and the two
    source columns dropped.
    """
    categorical_cols = [feature_1, feature_2]
    encoded_values = oh_encoder.transform(data[categorical_cols])
    encoded_names = oh_encoder.get_feature_names_out(input_features=categorical_cols)

    # Reuse the caller's index so the join lines rows up one-to-one.
    encoded_frame = pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
    return data.join(encoded_frame).drop(categorical_cols, axis=1)


In [9]:
# Apply the already-fitted scaler to two columns and return a scaled copy of the frame
def scaling_with_standard_scalar_with_transform(data,feature_1,feature_2):
    """Standardize two columns of ``data`` using the already-fitted ``scaler``.

    The scaler is only applied here, never refitted. The input frame is left
    untouched: a copy is made before the scaled values are written.

    Returns the copy with ``feature_1`` and ``feature_2`` replaced by their
    scaled values; all other columns are unchanged.
    """
    numeric_cols = [feature_1, feature_2]
    scaled = data.copy()
    scaled[numeric_cols] = scaler.transform(data[numeric_cols])
    return scaled

In [10]:
# Predict with the shared regression model
def predict_data(data_to_predict):
    """Return ``reg_multiple.predict(data_to_predict)``.

    ``reg_multiple`` must already have been fitted (see ``train_linear_regression``).
    """
    return reg_multiple.predict(data_to_predict)

In [11]:
def compute_rmsle(y_val: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Root mean squared log error between ``y_val`` and ``y_pred``.

    The square root of ``mean_squared_log_error(y_val, y_pred)`` is rounded to
    ``precision`` decimal places before being returned.
    """
    msle = mean_squared_log_error(y_val, y_pred)
    root = np.sqrt(msle)
    return round(root, precision)

In [12]:
def model_evaluation(data, test_size=0.25, random_state=0):
    """Train the linear model on a split of ``data`` and return its test RMSLE.

    Steps: select features and target, split into train/test, fit the one-hot
    encoder and the scaler on the training part only, fit the regression, apply
    the fitted encoder and scaler to the test part, predict, and score.

    Side effect: the module-level encoder, scaler and regression model are
    fitted, which is what the inference and joblib cells rely on.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw training frame; must include 'SalePrice'.
    test_size : float, default 0.25
        Forwarded to the train/test split.
    random_state : int, default 0
        Seed forwarded to the train/test split.

    Returns
    -------
    The RMSLE on the held-out part, as rounded by ``compute_rmsle``.
    """
    X, y = select_clean_data(data)
    X_train, X_test, y_train, y_test = train_test_split_data(X, y, test_size, random_state)

    # Fit the preprocessing on the training part only.
    X_train = encoding_with_one_hot_encoder(X_train, 'GarageCars', 'OverallQual')
    X_train = scaling_with_standard_scalar(X_train, 'GarageArea', 'GrLivArea')
    train_linear_regression(X_train, y_train)

    # The test part only goes through transform, never fit.
    X_test = encoding_with_one_hot_encoder_with_transform(X_test, 'GarageCars', 'OverallQual')
    X_test = scaling_with_standard_scalar_with_transform(X_test, 'GarageArea', 'GrLivArea')

    y_predict = predict_data(X_test)
    return compute_rmsle(y_test, y_predict)

In [13]:
# Train and score on the training data; this also fits the shared encoder,
# scaler and regression model that the inference cells use.
model_evaluation(data)

0.19

## Model inference

In [14]:
# Read the test CSV used for inference; the path is relative to the notebook's working directory.
data_test = pd.read_csv("../data/test.csv")

In [15]:
def model_inference(data):
    """Predict for an unlabeled frame using the already-fitted pipeline.

    ``data`` is expected to have no 'SalePrice' column, so that
    ``select_clean_data`` returns only the feature frame. Rows with missing
    feature values are dropped by that step, so the output can have fewer rows
    than ``data``. The encoder, scaler and model are applied as-is; nothing is
    fitted here.
    """
    features = select_clean_data(data)
    encoded = encoding_with_one_hot_encoder_with_transform(features, 'GarageCars', 'OverallQual')
    prepared = scaling_with_standard_scalar_with_transform(encoded, 'GarageArea', 'GrLivArea')
    return predict_data(prepared)

In [16]:
# Predict for the test data with the encoder, scaler and model fitted by model_evaluation.
model_inference(data_test)

array([[129128.15171858],
       [144300.95362238],
       [160335.34552407],
       ...,
       [144815.74151179],
       [ 95156.16010871],
       [247887.37071489]])

## Saving and reloading the fitted objects with joblib

In [17]:
# Save the regression model (joblib's default dump settings)
joblib.dump(reg_multiple, '../models/model.joblib')

['../models/model.joblib']

In [18]:
# Save the one-hot encoder (joblib's default dump settings)
joblib.dump(oh_encoder, '../models/one_hot_encoder.joblib')

['../models/one_hot_encoder.joblib']

In [19]:
# Save the standard scaler (joblib's default dump settings)
joblib.dump(scaler, '../models/standard_scaler.joblib')

['../models/standard_scaler.joblib']

In [20]:
# Reload the saved regression model. joblib.load unpickles, so only load trusted files.
multiple_regression = joblib.load('../models/model.joblib')

In [21]:
# Reload the saved one-hot encoder. joblib.load unpickles, so only load trusted files.
ohe = joblib.load('../models/one_hot_encoder.joblib')

In [22]:
# Reload the saved standard scaler. joblib.load unpickles, so only load trusted files.
sc = joblib.load('../models/standard_scaler.joblib')