### TRAINING MODEL

In [26]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



def create_train_test_data(dataset):
    """Split ``dataset`` into training and validation features and targets.

    The ``SELL_PRICE`` column is the prediction target; every other column
    is treated as a feature. 10% of the rows are held out for validation,
    and the split uses a fixed ``random_state`` of 1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``SELL_PRICE`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    print("Splitting the data...")

    # Everything except the target column is a feature.
    feature_mask = dataset.columns != 'SELL_PRICE'
    features = dataset.loc[:, feature_mask]
    target = dataset.SELL_PRICE

    pieces = train_test_split(features, target, test_size=0.1, random_state=1)
    X_train, X_val, y_train, y_val = pieces

    print(f"Dataset split into {len(X_train)} training data points and {len(X_val)} validation points.")

    return X_train, X_val, y_train, y_val


def encode_cat_data(X_train, X_val):

    print("Encoding categorical data...")
    
    # Get list of categorical variables
    s = (X_train.dtypes == 'object')
    object_cols = list(s[s].index)
    print(f"Categorical variables: {object_cols}")

    # Apply one-hot encoder to each column with categorical data
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
    OH_cols_val = pd.DataFrame(OH_encoder.transform(X_val[object_cols]))

    # One-hot encoding removed index, so now we put it back
    OH_cols_train.index = X_train.index
    OH_cols_val.index = X_val.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_val = X_val.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_val = pd.concat([num_X_val, OH_cols_val], axis=1)

    return OH_X_train, OH_X_val


def train_model(X_train, y_train):
    """Fit a random-forest regressor on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    print("Training the model...")

    # Hyper-parameters of the forest: 500 trees, split quality measured by
    # absolute error, and a fixed random_state.
    forest_settings = {
        'n_estimators': 500,
        'criterion': 'absolute_error',
        'random_state': 0,
    }
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X_train, y_train)

    return forest


def accuracy(model, X_val, y_val):
    """Return the mean absolute error of ``model`` on the validation set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_val : array-like
        Validation features passed to ``model.predict``.
    y_val : array-like
        True target values for ``X_val``.

    Returns
    -------
    float
        Mean absolute error between ``y_val`` and the model's predictions.
    """
    print("Testing the model accuracy...")

    validation_predictions = model.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred) in that order. MAE itself is
    # symmetric, but keeping the documented order avoids surprises if
    # sample weights or a different metric are introduced later.
    validation_mae = mean_absolute_error(y_val, validation_predictions)

    return validation_mae


def main(data_path='../data/train.csv'):
    """Run the training pipeline: load, split, fit, and evaluate.

    Parameters
    ----------
    data_path : str or file-like, optional
        CSV to train on; it must contain ``ADDRESS`` (used as the index) and
        ``SELL_PRICE`` (the target). The default uses forward slashes so the
        relative path resolves on Windows and POSIX systems alike.

    Returns
    -------
    tuple
        ``(model, X_val)``: the fitted model and the validation features.
    """
    # load data, keeping only the numeric columns
    data = pd.read_csv(data_path, index_col='ADDRESS')
    data = data.select_dtypes(include=[np.number])

    # split train/test
    X_train, X_val, y_train, y_val = create_train_test_data(data)

    # Categorical encoding is intentionally not applied here: the
    # select_dtypes call above already dropped every non-numeric column,
    # so there is nothing left for encode_cat_data to encode.

    # train and test
    model = train_model(X_train, y_train)
    mae_test = accuracy(model, X_val, y_val)

    print(f"Mean Absolute Error (MAE): ${mae_test:,.0f}")

    return model, X_val


# Run the training pipeline when the cell is executed. `model` and `X_val`
# are bound at notebook scope; the prediction cells further down call
# `model.predict`, so this cell has to run before them.
if __name__ == '__main__':
    model, X_val = main()


Splitting the data...
Dataset split into 417 training data points and 47 validation points.
Training the model...
Testing the model accuracy...
Mean Absolute Error (MAE): $158,467


### PREDICTION MODEL

##### from population...

In [21]:
# Load the rows to score, keeping only numeric columns (the same filter
# main() applies to the training data). Forward slashes keep the relative
# path valid on both Windows and POSIX systems.
predict_data = pd.read_csv('../data/predict.csv', index_col='ADDRESS')
predict_data = predict_data.select_dtypes(include=[np.number])

# make predictions with the model fitted in the training cell
predictions = model.predict(predict_data)

# join predictions back into the dataframe as a new 'predicts' column
predict_data['predicts'] = predictions

# write the features together with their predicted prices to predictions.csv
predict_data.to_csv('../data/predictions.csv')

##### from sample...

In [None]:
# Load the rows to score, keeping only numeric columns (the same filter
# main() applies to the training data). Forward slashes keep the relative
# path valid on both Windows and POSIX systems.
predict_data = pd.read_csv('../data/predict.csv', index_col='ADDRESS')
predict_data = predict_data.select_dtypes(include=[np.number])

# Strip stray whitespace so an accidental leading/trailing space does not
# turn a valid address into a failed lookup.
address_to_predict = input("Please enter an address to predict:").strip()

if address_to_predict in predict_data.index:
    # Double brackets keep the selection a DataFrame, which is the 2-D shape
    # model.predict expects; the first prediction is reported.
    sample = predict_data.loc[[address_to_predict]]
    sample_prediction = model.predict(sample)
    sample_prediction = sample_prediction[0]

    print(f"The predicted price of {address_to_predict} is ${sample_prediction:,.0f}")
else:
    # Report a readable message instead of surfacing a raw KeyError.
    print(f"Address {address_to_predict!r} was not found in the prediction data.")