In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameters

RANDOM_CONTROL = 42 # For reproducibility of notebook
TRAIN_SIZE = 0.8

# Random Forest: Fill in based on GridSearch results
RF_NUM_ESTIMATORS = 100
RF_MAX_DEPTH = 50
RF_MAX_FEATURES = 1
RF_MIN_SPLIT = 2
RF_MIN_LEAF = 1
RF_BOOTSTRAP = True
RF_CRITERION = "squared_error"

# Gradient Boosting: Fill in based on GridSearch results
GB_NUM_ESTIMATORS = 100
GB_MAX_DEPTH = 50
GB_CRITERION = "squared_error"

# AdaBoost: Fill in based on GridSearch results
AB_NUM_ESTIMATORS = 100
AB_MAX_DEPTH = 50
AB_LEARNING_RATE = 0.1

# Neural Net: Fill in based on test iterations
NN_NUM_EPOCHS = 10
NN_BATCH_SIZE = 32
NN_LEARNING_RATE = 0.1

In [3]:
# Read training data
df = pd.read_csv('data/train.csv') 

df.head(3)

df.shape

(20254, 21)

# EDA

(Generate plots here; preprocessing steps follow below)

In [4]:
# Trivial modifications here(DO NOT NORMALIZE/IMPUTE YET)

# Drop listing id; nominal identifier with no meaning
df.drop('listing_id', axis=1, inplace=True)

# Drop elevation; all the values are 0, spurious attribute
df.drop('elevation', axis=1, inplace=True)

# Drop url; nominal identifier with no meaning; useful for manual lookups or scraping
df.drop('property_details_url', axis=1, inplace=True)

# Drop invalid data with negative or zero-valued price
df = df[df.price > 0]

# BELOW CODE IN THIS SECTION IS ONLY MEANT TO GET THE SKELETON WORKING; RE-EVALUATE EACH ATTRIBUTE ONE BY ONE
df.drop('title', axis=1, inplace=True)
df.drop('address', axis=1, inplace=True)
df.drop('property_name', axis=1, inplace=True)
df.drop('property_type', axis=1, inplace=True)
df.drop('tenure', axis=1, inplace=True)
df.drop('built_year', axis=1, inplace=True)
df.drop('num_beds', axis=1, inplace=True)
df.drop('num_baths', axis=1, inplace=True)
df.drop('floor_level', axis=1, inplace=True)
df.drop('furnishing', axis=1, inplace=True)
df.drop('available_unit_types', axis=1, inplace=True)
df.drop('total_num_units', axis=1, inplace=True)
df.drop('lat', axis=1, inplace=True)
df.drop('lng', axis=1, inplace=True)
df.drop('subzone', axis=1, inplace=True)
df.drop('planning_area', axis=1, inplace=True)

In [5]:
df.head()
df.shape

(20153, 2)

In [6]:
# Utility functions



In [7]:
# Split data into train and validation set

X_housing = df.loc[:, df.columns != 'price']
y_housing = df['price']

X_train, X_val, y_train, y_val = train_test_split(X_housing, y_housing, train_size=TRAIN_SIZE, random_state=RANDOM_CONTROL, shuffle=True) 

# print(X_train.shape)
# print(y_train.shape)


In [8]:
# Perform further pre-processing steps only on train set

**Models**

(Add outline of steps here later)

- Linear Regression
- Random Forest
- Gradient Boosting
- AdaBoost
- Neural Net

# Linear Regression

In [9]:
# Train

linreg = LinearRegression().fit(X_train, y_train)

# Validate

y_hat_linreg = linreg.predict(X_val)
rmse_linreg = mean_squared_error(y_val, y_hat_linreg, squared=False)

print('Linear Regression Validation rmse: {:.3}'.format(rmse_linreg))

Linear Regression Validation rmse: 5.85e+06


# Random Forest

In [10]:
# Train

val_split_indices = [-1 if x in X_train.index else 0 for x in X_housing.index]
ps = PredefinedSplit(test_fold=val_split_indices)

estimator = RandomForestRegressor()
params = {'n_estimators': [25, 50, 100],
          'max_depth': [5, 10, 25, 50],
          'min_samples_split': [2],
          'min_samples_leaf': [1],
          'criterion': ["squared_error"],
          'max_features': [1],
          'bootstrap': [True, False],
          'random_state': [RANDOM_CONTROL]}
model_rf = GridSearchCV(estimator=estimator,
                     param_grid=params,
                     cv=ps)
model_rf.fit(X_housing, y_housing)

print('Random Forest Best Parameters: {}'.format(model_rf.best_params_))

# Validate

y_hat_rf = model_rf.predict(X_val)
rmse_rf = mean_squared_error(y_val, y_hat_rf, squared=False)

print('Random Forest Validation rmse: {:.3}'.format(rmse_rf))

Random Forest Best Parameters: {'bootstrap': True, 'criterion': 'squared_error', 'max_depth': 25, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Random Forest Validation rmse: 2.98e+06


# Gradient Boosting

In [11]:
# Train

estimator = GradientBoostingRegressor()
params = {'n_estimators': [25, 50, 100],
          'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
          'max_depth': [5, 10, 25, 50],
          'min_samples_split': [2],
          'min_samples_leaf': [1],
          'criterion': ["squared_error", "friedman_mse"],
          'max_features': [1],
          'random_state': [RANDOM_CONTROL]}
model_gb = GridSearchCV(estimator=estimator,
                     param_grid=params,
                     cv=ps)
model_gb.fit(X_housing, y_housing)

print('Random Forest Best Parameters: {}'.format(model_gb.best_params_))

# Validate

y_hat_gb = model_gb.predict(X_val)
rmse_gb = mean_squared_error(y_val, y_hat_gb, squared=False)

print('Gradient Boosting Validation rmse: {:.3}'.format(rmse_gb))

Random Forest Best Parameters: {'criterion': 'squared_error', 'learning_rate': 0.1, 'max_depth': 5, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25, 'random_state': 42}
Gradient Boosting Validation rmse: 3.51e+06


# AdaBoost

In [12]:
base_estimator = DecisionTreeRegressor()
estimator = AdaBoostRegressor(base_estimator=base_estimator)
params = {'base_estimator__max_depth': [5, 10, 25, 50],
          'base_estimator__splitter': ['best', 'random'],
          'n_estimators': [25, 50, 100],
          'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
          'random_state': [RANDOM_CONTROL]}
model_ab = GridSearchCV(estimator=estimator,
                     param_grid=params,
                     cv=ps)
model_ab.fit(X_housing, y_housing)

print('AdaBoost Best Parameters: {}'.format(model_ab.best_params_))

# Validate

y_hat_ab = model_ab.predict(X_val)
rmse_ab = mean_squared_error(y_val, y_hat_ab, squared=False)

print('AdaBoost Validation rmse: {:.3}'.format(rmse_ab))

AdaBoost Best Parameters: {'base_estimator__max_depth': 50, 'base_estimator__splitter': 'best', 'learning_rate': 1, 'n_estimators': 25, 'random_state': 42}
AdaBoost Validation rmse: 2.97e+06


# Neural Net

In [13]:
X_housing = X_housing.to_numpy()
y_housing = y_housing.to_numpy()
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_val = X_val.to_numpy()
y_val = y_val.to_numpy()

#print(X_train.shape)
#print(y_train.shape)
train_dataloader = DataLoader([ [X_train[i], y_train[i]] for i in range(len(X_train)) ], batch_size=NN_BATCH_SIZE, shuffle=True, num_workers=4)

In [14]:
# Define MLP architecture
class Model(nn.Module):
    
    def __init__(self, device='cpu'):
        super(Model, self).__init__()
        self.device = device
        
        # Modify accordingly
        self.fc1 = nn.Linear(1, 4)
        self.fc2 = nn.Linear(4, 16)
        self.fc3 = nn.Linear(16, 64)
        self.fc4 = nn.Linear(64, 4)
        self.fc5 = nn.Linear(4, 1)
        self.relu = nn.ReLU()
        
        self.dense = nn.Sequential(self.fc1, self.relu, self.fc2, self.relu, self.fc3, self.relu, 
                                   self.fc4, self.relu, self.fc5)
        
    def forward(self, x):
        pred = self.dense(x)
        return pred
    
# Train
device = 'cpu'
model = Model()
optimizer = optim.Adam(model.parameters(), NN_LEARNING_RATE)
criterion = nn.MSELoss()
model.to(device)
for epoch in range(NN_NUM_EPOCHS):
    running_loss = 0
    for idx, (x_features, y_labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        x_features = x_features.to(device, dtype=torch.float)
        y_labels = y_labels.to(device, dtype=torch.float)
        prediction = model(x_features)
        loss = torch.sqrt(criterion(prediction, y_labels)) # Standardize RMSE loss
        loss.backward()
        optimizer.step()
        running_loss += loss
        if (idx+1) %100 == 0: 
            running_loss = format(running_loss/100, '.4f')
            print(f"Epoch [{epoch+1} Batches processed | {idx}] Loss: {running_loss}")
            running_loss = 0
print("Finished Training.")


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1 Batches processed | 99] Loss: 74088216.0000
Epoch [1 Batches processed | 199] Loss: 3812634.0000
Epoch [1 Batches processed | 299] Loss: 4383063.5000
Epoch [1 Batches processed | 399] Loss: 13035016.0000
Epoch [1 Batches processed | 499] Loss: 3967107.7500


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [2 Batches processed | 99] Loss: 4189813.0000
Epoch [2 Batches processed | 199] Loss: 3987484.7500
Epoch [2 Batches processed | 299] Loss: 4095743.7500
Epoch [2 Batches processed | 399] Loss: 82362232.0000
Epoch [2 Batches processed | 499] Loss: 4018017.5000
Epoch [3 Batches processed | 99] Loss: 3835343.0000
Epoch [3 Batches processed | 199] Loss: 82014840.0000
Epoch [3 Batches processed | 299] Loss: 3885355.5000
Epoch [3 Batches processed | 399] Loss: 4321278.5000
Epoch [3 Batches processed | 499] Loss: 4217107.0000
Epoch [4 Batches processed | 99] Loss: 4044569.2500
Epoch [4 Batches processed | 199] Loss: 73373376.0000
Epoch [4 Batches processed | 299] Loss: 4110977.2500
Epoch [4 Batches processed | 399] Loss: 4052472.2500
Epoch [4 Batches processed | 499] Loss: 13030081.0000
Epoch [5 Batches processed | 99] Loss: 4105910.7500
Epoch [5 Batches processed | 199] Loss: 73418200.0000
Epoch [5 Batches processed | 299] Loss: 3910133.7500
Epoch [5 Batches processed | 399] Loss: 42464

In [None]:
# Validate
X_features = torch.from_numpy(X_val)
y_labels = torch.from_numpy(y_val)
X_features = X_features.to(device, dtype=torch.float)
y_labels = y_labels.to(device, dtype=torch.float)
prediction = model(X_features)
rmse_nn = torch.sqrt(criterion(prediction, y_labels))

print('Neural Net Validation rmse: {:.3}'.format(rmse_nn))

**Wrap-up**

(Do this only before submitting to Kaggle; train over the entire set here after the hyperparameters are identified; then perform testing and submit results)

In [None]:
df = pd.read_csv('data/test.csv') 
# Pre-process similar to above. Need to refactor into a function.

# Linear Regression
linreg = LinearRegression().fit(X_housing, y_housing)
y_hat_linreg = linreg.predict(X_val)

# Random Forest

# Gradient Boosting

# AdaBoost

# Neural Net

**Do not execute**

(Add misc. code here that was not utilized)