## House Price Modeling Using Linear Regression

**Data Science in Production**

###### data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv

**`Adegoke Olanrewaju`**

# Importing the libraries

In [11]:
import numpy as np

import pandas as pd

import sklearn

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, mean_squared_log_error

import joblib

### Model Building Section

#### Model training

In [12]:
dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/train.csv'

In [13]:
def build_model(dataPATH: str) -> dict[str, str]:

    # Loading the train.csv dataset from path
    dataset = load_selected_dataset(dataPATH)

    # Selecting the categorical and continuous columns of interest
    categorical_features, continuous_features = select_features_columns()

    # Defining the features and target
    features, target = features_target_selection(dataset, categorical_features, continuous_features)

    # Splitting of the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

    # Preprocessing and feature engineering of the X_train set
    X_train, X_test = preprocessing(categorical_features, continuous_features, X_train, X_test)

    # Automatic checking of the DataFrame Equality for X_train dataset
    X_train, X_test = DataFrame_Equality(dataPATH, X_train, X_test)

    # Model training and fitting
    LinReg = training_model(X_train, y_train)

    # Model predictions of the X_test
    y_predicted = model_prediction(X_test)

    # Model evaluation and model performance
    rmsle = model_performance(y_test, y_predicted)

    return LinReg, rmsle



def DataFrame_Equality(dataPATH, X_train, X_test):
    X_train = x_train_dataframe_equality_check(dataPATH, X_train)
    X_test = x_test_dataframe_equality_check(dataPATH, X_test)
    return X_train,X_test

def x_test_dataframe_equality_check(dataPATH, X_test):
    X_test_df = save_load_to_parquet_x_test(dataPATH, X_test)
    X_test, X_test_df = resetting_index_parquet_x_test(X_test, X_test_df)       
    pd.testing.assert_frame_equal(X_test_df, X_test)
    return X_test

def x_train_dataframe_equality_check(dataPATH, X_train):
    X_train_df = save_load_to_parquet_x_train(dataPATH, X_train)
    X_train, X_train_df = resetting_index_parquet_x_train(X_train, X_train_df)     
    pd.testing.assert_frame_equal(X_train_df, X_train)
    return X_train

def features_target_selection(dataset, categorical_features, continuous_features):
    features = dataset[categorical_features + continuous_features]
    target = dataset['SalePrice']
    return features,target

def select_features_columns():
    categorical_features = ['MSZoning','HouseStyle']
    continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']
    return categorical_features,continuous_features

def model_performance(y_test: np.ndarray, y_predicted: np.ndarray) -> dict[str,str]:
    msle = mean_squared_log_error(y_test, y_predicted)
    rmsle = round(np.sqrt(msle), 2)
    model_rmsle = {'rmsle' : rmsle}
    return model_rmsle

def load_selected_dataset(dataPATH):
    train_csv_master = pd.read_csv(dataPATH)
    train_csv = train_csv_master.copy()
    dataset = train_csv[['MSZoning','HouseStyle','YearBuilt','TotalBsmtSF','MiscVal','SalePrice']]
    return dataset

def model_prediction(X_test):
    LinReg_model = joblib.load('../models/LinReg.joblib')
    y_predicted = LinReg_model.predict(X_test)
    return y_predicted

def preprocessing(categorical_features, continuous_features, X_train, X_test):
    X_train = preprocessing_x_train(categorical_features, continuous_features, X_train)
    X_test = preprocessing_x_test(categorical_features, continuous_features, X_test)
    return X_train,X_test

def resetting_index_parquet_x_test(X_test, X_test_df):
    X_test_df = X_test_df.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    return X_test,X_test_df

def save_load_to_parquet_x_test(dataPATH, X_test):
    X_test.to_parquet(dataPATH + 'X_test_df.parquet', index=False)
    X_test_df = pd.read_parquet(dataPATH + 'X_test_df.parquet')
    return X_test_df

def preprocessing_x_test(categorical_features, continuous_features, X_test):
    X_test_cat_DF = encoding_categorical_features(categorical_features, X_test)
    X_test_cont_DF = scaling_continuous_features(continuous_features, X_test)
    X_test = pd.concat([X_test_cont_DF, X_test_cat_DF], axis=1)
    return X_test

def scaling_continuous_features(continuous_features, X_test):
    loaded_stdScaler = joblib.load('../models/stdScaler.joblib')
    X_test_cont = loaded_stdScaler.transform(X_test[continuous_features])
    X_test_cont_DF = pd.DataFrame(X_test_cont, columns=continuous_features)
    return X_test_cont_DF

def encoding_categorical_features(categorical_features, X_test):
    loaded_oneHot = joblib.load('../models/oneHot.joblib')
    X_test_cat = loaded_oneHot.transform(X_test[categorical_features])
    X_test_cat_DF = pd.DataFrame(X_test_cat, columns = loaded_oneHot.get_feature_names(categorical_features ))
    return X_test_cat_DF

def training_model(X_train, y_train):
    LinReg = LinearRegression()
    LinReg.fit(X_train, y_train)
    LinReg = LinearRegression()
    LinReg.fit(X_train, y_train)
    joblib.dump(LinReg, '../models/LinReg.joblib')
    return LinReg

def resetting_index_parquet_x_train(X_train, X_train_df):
    X_train_df = X_train_df.reset_index(drop=True)
    X_train = X_train.reset_index(drop=True)
    return X_train,X_train_df

def save_load_to_parquet_x_train(dataPATH, X_train):
    X_train.to_parquet(dataPATH + 'X_train_df.parquet', index=False)
    X_train_df = pd.read_parquet(dataPATH + 'X_train_df.parquet')
    return X_train_df

def preprocessing_x_train(categorical_features, continuous_features, X_train):
    X_train_cat_DF = encoding_categorical_features(categorical_features, X_train)
    X_train_cont_DF = scaling_continuous_features(continuous_features, X_train)
    X_train = pd.concat([X_train_cont_DF, X_train_cat_DF], axis=1)
    return X_train

def scaling_continuous_features(continuous_features, X_train):
    stdScaler = StandardScaler()
    stdScaler.fit(X_train[continuous_features])
    X_train_cont = stdScaler.transform(X_train[continuous_features])
    X_train_cont_DF = pd.DataFrame(X_train_cont, columns=continuous_features)
    joblib.dump(stdScaler, '../models/stdScaler.joblib')
    return X_train_cont_DF

def encoding_categorical_features(categorical_features, X_train):
    oneHot = OneHotEncoder(drop = 'first', sparse=False)
    oneHot.fit(X_train[categorical_features])
    X_train_cat = oneHot.transform(X_train[categorical_features])
    X_train_cat_DF = pd.DataFrame(X_train_cat, columns=oneHot.get_feature_names(categorical_features ))
    joblib.dump(oneHot, '../models/oneHot.joblib')
    return X_train_cat_DF

#returning the root_mean_log_squared_error
build_model(dataPATH)
    
    


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


(LinearRegression(), {'rmsle': 0.27})

# Model Inference

In [14]:
dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/test.csv'

In [15]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Loading and reading the given test.csv dataset
    test_csv = load_test_data()

    # Defining the categorical and continuous features of the test_csv
    categorical_features, continuous_features = selecting_features_columns_test_data()

    # Selecting the categorical and continuous features of the test_csv
    test_csv_features = test_csv[categorical_features + continuous_features]

    # Preprocessing of the test_csv
    test_set = preprocessing_test_data(categorical_features, continuous_features, test_csv_features)

    # Automatic checking of DataFrame Equality for the test_csv data
    dataframe_equality_check_test_data(test_set)

    # Predicting the house prices using the test_csv dataset
    predictions = make_prediction(test_set)

    return predictions



def make_prediction(test_set):
    LinReg_model = joblib.load('../models/LinReg.joblib')
    predictions = LinReg_model.predict(test_set)
    return predictions[:5]

def dataframe_equality_check_test_data(test_set):
    final_test_csv, final_test_csv_df = save_and_load_parquet_test_csv(test_set)
    final_test_csv, final_test_csv_df = resetting_index_of_dataframe(final_test_csv, final_test_csv_df)       
    pd.testing.assert_frame_equal(final_test_csv_df, final_test_csv)

def selecting_features_columns_test_data():
    categorical_features = ['MSZoning','HouseStyle']
    continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']
    return categorical_features,continuous_features

def load_test_data():
    test_csv_master = pd.read_csv(dataPATH)
    test_csv = test_csv_master.copy()
    return test_csv

def resetting_index_of_dataframe(final_test_csv, final_test_csv_df):
    final_test_csv_df = final_test_csv_df.reset_index(drop=True)
    final_test_csv = final_test_csv.reset_index(drop=True)
    return final_test_csv,final_test_csv_df

def save_and_load_parquet_test_csv(test_set):
    final_test_csv = test_set
    final_test_csv.to_parquet(dataPATH + 'final_test_csv_df.parquet', index=False)
    final_test_csv_df = pd.read_parquet(dataPATH + 'final_test_csv_df.parquet')
    return final_test_csv,final_test_csv_df

def preprocessing_test_data(categorical_features, continuous_features, test_csv_features):
    check_and_correct_NaN(test_csv_features)
    test_csv_cat_DF = encoding_categorical_features_test_data(categorical_features, test_csv_features)
    test_csv_cont_DF = scaling_continuous_features_test_data(continuous_features, test_csv_features)
    test_set = pd.concat([test_csv_cont_DF, test_csv_cat_DF],axis=1)
    return test_set

def scaling_continuous_features_test_data(continuous_features, test_csv_features):
    loaded_stdScaler = joblib.load('../models/stdScaler.joblib')
    test_csv_cont = loaded_stdScaler.transform(test_csv_features[continuous_features])
    test_csv_cont_DF = pd.DataFrame(test_csv_cont, columns=continuous_features)
    return test_csv_cont_DF

def encoding_categorical_features_test_data(categorical_features, test_csv_features):
    loaded_oneHot = joblib.load('../models/oneHot.joblib')
    test_csv_cat = loaded_oneHot.transform(test_csv_features[categorical_features])
    test_csv_cat_DF = pd.DataFrame(test_csv_cat, columns=loaded_oneHot.get_feature_names(categorical_features ))
    return test_csv_cat_DF

def check_and_correct_NaN(test_csv_features):
    test_csv_features.isna().sum()
    test_csv_features.dropna(inplace=True)
    
# Making the Predictions for the test_csv dataset returning an array
make_predictions(dataPATH)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_csv_features.dropna(inplace=True)
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


array([110610.50906774, 175679.06302268, 225803.59054318, 226381.68473571,
       199957.53776718])