In [1]:
import os
import json
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score

In [2]:
os.chdir('..')

In [3]:
test = pd.read_csv('ENG_DATA/SELECTED/12-10_all_test.csv', index_col = [0])
train = pd.read_csv('ENG_DATA/SELECTED/12-10_all_train.csv', index_col = [0])

In [8]:
test_2 = pd.read_csv('ENG_DATA/SELECTED/12-10_hedon_test.csv', index_col = [0])
train_2 = pd.read_csv('ENG_DATA/SELECTED/12-10_hedon_train.csv', index_col = [0])

In [None]:
test_1 = pd.read_csv('ENG_DATA/SELECTED/12-10_1he_test.csv', index_col = [0])
train_1 = pd.read_csv('ENG_DATA/SELECTED/12-10_1he_train.csv', index_col = [0])

In [4]:
x_test = test.drop(columns = 'sale_price')
y_test = test["sale_price"]
x_train = train.drop(columns = 'sale_price')
y_train = train["sale_price"]

In [9]:
x_test_2 = test_2.drop(columns = 'sale_price')
y_test_2 = test_2["sale_price"]
x_train_2 = train_2.drop(columns = 'sale_price')
y_train_2 = train_2["sale_price"]

In [None]:
x_test_1 = test_1.drop(columns = 'sale_price')
y_test_1 = test_1["sale_price"]
x_train_1 = train_1.drop(columns = 'sale_price')
y_train_1 = train_1["sale_price"]

In [7]:
test.shape[0] + train.shape[0]

23720

In [10]:
def createEncoder(x_train_, x_test_, type = "L"): 
    cat_columns = list(x_train_.select_dtypes(exclude = 'number').columns)
   # Label encode categories
    le=LabelEncoder()

    # create a dataframe to store label encoded values
    x_train_le = x_train_.copy()
    x_test_le = x_test_.copy()

    for i in cat_columns:
        le.fit(x_train_[i])
        x_train_le[i] = le.transform(x_train_[i]).copy()

    for i in cat_columns:
        le.fit(x_test_[i])
        x_test_le[i] = le.transform(x_test_[i]).copy() 
    return x_train_le, x_test_le, le

In [None]:
x_train_le, x_test_le, le = createEncoder(x_train, x_test, type = "L")

In [11]:
x_train_le2, x_test_le2, le2 = createEncoder(x_train_2, x_test_2, type = "L")

In [12]:
x_train_le2.columns

Index(['longitude', 'latitude', 'elevation', 'full_address', 'city',
       'street_name', 'zip', 'area', 'district_no', 'subdist_no',
       'on_market_date', 'cdom', 'sale_date', 'rooms', 'baths', 'beds',
       'lot_acres', 'year_built', 'zoning', 'drive_side', 'parking',
       'park_leased', 'num_parking', 'shopping', 'transportation', 'type',
       'views', 'days_on_market', 'on_market_year', 'on_market_month',
       'sale_year', 'sale_month', 'neighborhood'],
      dtype='object')

In [84]:
scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
x_train_test = scaler.fit_transform(x_train_le)

In [85]:
x_train_test

array([[0.52561328, 0.78808156, 0.16272808, ..., 0.02235294, 0.04347826,
        0.04925449],
       [0.63953824, 0.21022774, 0.27180498, ..., 0.00676471, 0.02717391,
        0.01589292],
       [0.51565657, 0.29034262, 0.36814216, ..., 0.005     , 0.02898551,
        0.0178753 ],
       ...,
       [0.0748557 , 0.66196122, 0.14621908, ..., 0.03117647, 0.08514493,
        0.09239241],
       [0.32597403, 0.13128412, 0.35060517, ..., 0.01852941, 0.0307971 ,
        0.04271433],
       [0.55533911, 0.8081663 , 0.21319741, ..., 0.01058824, 0.02355072,
        0.04611996]])

In [13]:
models = {}

In [15]:
def runModel(name, model_type, x_train_, y_train_, x_test_, y_test_):
    clf_ = None
#     x_train_ = MinMaxScaler().fit_transform(x_train_)
#     x_test_ = MinMaxScaler().fit_transform(x_test_)
    if (model_type == "F"):
        clf_ = RandomForestRegressor(n_estimators=20, max_depth=50,random_state=0).fit(x_train_, y_train_)
    elif (model_type == "L"):
        clf_ = LinearRegression().fit(x_train_, y_train_)
    else: 
        raise ValueError("Model Type not implemented in function")
    y_pred_ = clf_.predict(x_test_)
    metrics = {
        "model_type": model_type,
        "r2": r2_score(y_pred_, y_test_), 
        "mean_absolute_error": mean_absolute_error(y_pred_, y_test_) / np.mean(y_test_), 
        "median_absolute_error" :median_absolute_error(y_pred_, y_test_) / np.mean(y_test_),
        "mean_abs_perc_error" : np.mean(np.abs((y_pred_ - y_test_) / y_test_)) * 100
    }
    models[f"{name}-{model_type}"] = metrics
    del(clf_) 

In [16]:
runModel("hedon", "F", x_train_le2, y_train_2, x_test_le2, y_test_2)

In [17]:
models

{'hedon-F': {'model_type': 'F',
  'r2': 0.7828017316957556,
  'mean_absolute_error': 0.17634649794528875,
  'median_absolute_error': 0.08242477528156973,
  'mean_abs_perc_error': 16.29035583501948}}

In [12]:
runModel("all", "L", x_train_le, y_train, x_test_le, y_test)
runModel("all_no_list_price", "L", x_train_le.drop(columns = "orig_list_price"), y_train, x_test_le.drop(columns = "orig_list_price"), y_test)
runModel("all", "F", x_train_le, y_train, x_test_le, y_test)
runModel("all_no_list_price", "F", x_train_le.drop(columns = "orig_list_price"), y_train, x_test_le.drop(columns = "orig_list_price"), y_test)
runModel("all_he", "L", x_train_1, y_train_1, x_test_1, y_test_1)
runModel("all_he", "F", x_train_1, y_train_1, x_test_1, y_test_1)

In [13]:
models["12-10-all-automl"] = {'r2': 0.9669572425163331,
  'mean_absolute_error': 0.0741511405465754,
  'median_absolute_error': 0.039920145619098436,
  'mean_abs_perc_error': 7.6138036271989025}

In [14]:
with open('RESULTS/12-10-models-results.json', 'w') as fp:
    json.dump(models, fp)

In [15]:
models

{'all-L': {'model_type': 'L',
  'r2': 0.4638260671940839,
  'mean_absolute_error': 0.29628996698515814,
  'median_absolute_error': 0.2017725256608768,
  'mean_abs_perc_error': 36.24977465029022},
 'all_no_list_price-L': {'model_type': 'L',
  'r2': 0.4626893708917976,
  'mean_absolute_error': 0.2960207645987858,
  'median_absolute_error': 0.20202444441265802,
  'mean_abs_perc_error': 36.17702464258172},
 'all-F': {'model_type': 'F',
  'r2': 0.9459234300069457,
  'mean_absolute_error': 0.07678229052327126,
  'median_absolute_error': 0.04135130254327362,
  'mean_abs_perc_error': 7.714333023121968},
 'all_no_list_price-F': {'model_type': 'F',
  'r2': 0.7598732296079245,
  'mean_absolute_error': 0.17559300587910265,
  'median_absolute_error': 0.08228763594097863,
  'mean_abs_perc_error': 16.021185075401693},
 'all_he-L': {'model_type': 'L',
  'r2': -0.0003139792605919567,
  'mean_absolute_error': 10.599971014751954,
  'median_absolute_error': 0.17046573568782927,
  'mean_abs_perc_error': 31

In [19]:
df = pd.read_csv('ENG_DATA/CLEANED/12-8_Cleaned_df.csv', index_col = [0])

In [23]:
# Merge two Dataframes on index of both the dataframes - ADD Back original list price
x_train_le = x_train_le.merge(df[["orig_list_price"]], left_index=True, right_index=True)
x_test_le = x_test_le.merge(df[["orig_list_price"]], left_index=True, right_index=True)

In [21]:
df[["orig_list_price", "sale_price"]].cov()

Unnamed: 0,orig_list_price,sale_price
orig_list_price,337170700000000.0,1607321000000.0
sale_price,1607321000000.0,1685941000000.0


In [22]:
df["orig_list_price"].mean() - df["sale_price"].mean()

419367.2587268129

In [27]:
np.mean(np.abs((df["orig_list_price"] - df["sale_price"]))) / df["orig_list_price"].mean()

0.3552311444369088