In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pickle

def create_train_test_data(dataset):
    # load and split the data
    data_train = dataset.sample(frac=0.8, \
    random_state=30).reset_index(drop=True)
    data_test = dataset.drop(data_train.index).reset_index(drop=True)
    # save the data
    data_train.to_csv('train.csv', index=False)
    data_test.to_csv('test.csv', index=False)
    print(f"Train data for modeling: {data_train.shape}")
    print(f"Test data for predictions: {data_test.shape}")
    
def train_model(x_train, y_train):
    print("Training the model ...")
    
    model = Pipeline(steps=[("label encoding", OneHotEncoder(handle_unknown='ignore')), ("tree model", LinearRegression())])
    model.fit(x_train, y_train)

    return model

def accuracy(model, x_test, y_test):
    print("Testing the model ...")
    predictions = model.predict(x_test)
    tree_mse = mean_squared_error(y_test, predictions)
    tree_rmse = np.sqrt(tree_mse)
    return tree_rmse

def export_model(model):
    # Save the model
    pkl_path = 'model.pkl'
    with open(pkl_path, 'wb') as file:
        pickle.dump(model, file)
        print(f"Model saved at {pkl_path}")

def main():
    # Load the whole data
    data = pd.read_csv('cleaned_data.csv', keep_default_na=False, index_col=0)
    # Split train/test
    # Creates train.csv and test.csv
    create_train_test_data(data)
    # Loads the data for the model training
    train = pd.read_csv('train.csv', keep_default_na=False)
    x_train = train.drop(columns=['SalePrice'])
    y_train = train['SalePrice']
    # Loads the data for the model testing
    test = pd.read_csv('test.csv', keep_default_na=False)
    x_test = test.drop(columns=['SalePrice'])
    y_test = test['SalePrice']
    # Train and Test
    model = train_model(x_train, y_train)
    rmse_test = accuracy(model, x_test, y_test)
    
    print(f"Average Price Test: {y_test.mean()}")
    print(f"RMSE: {rmse_test}")
    
    # Save the model
    export_model(model)
    
if __name__ == '__main__':
    main()

Train data for modeling: (934, 73)
Test data for predictions: (234, 73)
Training the model ...
Testing the model ...
Average Price Test: 175652.0128205128
RMSE: 11097.118358566046
Model saved at model.pkl


The model utilized 934 data points for training and 234 data points for testing. In the test set, the average sale price was $175,000. The root-mean-square error (RMSE) is a suitable metric for understanding the model's output because it can be interpreted on the same scale as the dependent variable, which in this case is the sale price. With an RMSE of 10,552, it means that, on average, the model's predictions missed the correct sale prices by a little over $10,000. Considering the average sale price of $175,000, this level of deviation is not significant.


The RMSE is a widely used measure in machine learning and statistics to assess the accuracy of a predictive model. It provides a straightforward interpretation of the overall error of the model, making it accessible even to individuals without a strong statistical background. However, it's important to note that the RMSE is not the only metric used for model evaluation. Depending on the specific problem and context, other metrics such as mean absolute error (MAE), R-squared, or precision and recall may also be relevant.


Overall, the model's performance, as indicated by the RMSE, suggests that it is reasonably accurate in predicting sale prices, with an average deviation of around $10,000 from the actual prices.