# Tile2Vec embedded features for house pricing regression

In [1]:
def add_modules_to_path(modules):
    """Append every entry of ``modules`` to ``sys.path`` unless it is already listed.

    Relies on ``sys`` being imported at notebook level before the first call.
    """
    for module_path in modules:
        if module_path in sys.path:
            # Already importable from this location; avoid duplicate entries.
            continue
        sys.path.append(module_path)

In [2]:
import numpy as np
import os
import torch
from time import time
from torch.autograd import Variable
import sys
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.decomposition import PCA
import pandas as pd
from tqdm.notebook import tqdm

# Put the directory four levels above the current working directory on
# sys.path (presumably the repository root, so the `coord2vec` imports below
# resolve — confirm). The path is relative to the kernel's cwd.
modules = [os.path.abspath('../../../..')]
add_modules_to_path(modules)


from coord2vec.Noam_Adir.Adir.tile2vec.pre_trained_model.tilenet import make_tilenet
from coord2vec.Noam_Adir.Adir.tile2vec.pre_trained_model.resnet import ResNet18
from coord2vec.Noam_Adir.Adir.tile2vec.tiles_data.data import get_fast_the_tiles, show_some_tiles_images
# from coord2vec.Noam_Adir.Adir.tile2vec.tile2vec_utils import *
# from coord2vec.Noam_Adir.pipeline.base_pipeline import *
from coord2vec.Noam_Adir.manhattan.pipeline import init_pipeline, fit_and_score_models_on_datasets


# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
# Enables the %tensorboard magic used further down.
%load_ext tensorboard
# Directory the SummaryWriter writes to and TensorBoard reads from.
# NOTE(review): hard-coded absolute path; adjust for your machine.
logs_dir = '/mnt/adir_logs/tile2vec'

## Loading pre-trained model and data

In [3]:
# Setting up model
cuda = torch.cuda.is_available()
# tilenet = make_tilenet(in_channels=in_channels, z_dim=z_dim)
# Use old model for now
tilenet = ResNet18(in_channels=4, z_dim=512, tile_size=100)
if cuda:
    # Prefer GPU 1 when the machine has it; fall back to GPU 0 so that
    # single-GPU machines do not fail with an invalid device ordinal.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    tilenet.cuda(gpu_index)

In [4]:
# Load parameters
# The path is relative to the kernel's current working directory.
model_fn = 'pre_trained_model/naip_trained.ckpt'
# map_location='cpu' lets a checkpoint saved on a GPU be read on any machine;
# load_state_dict then copies the weights onto whatever device the model is on.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load(model_fn, map_location='cpu')
tilenet.load_state_dict(checkpoint)
# Switch to evaluation mode; as the last expression this also displays the model.
tilenet.eval()

ResNet(
  (conv1): Conv2d(4, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_

In [5]:
# Load the tiles and keep at most the first n_tiles of them.
n_tiles = 16000
tiles = get_fast_the_tiles()[:n_tiles]

In [6]:
# Append an all-zero extra channel to every tile, because the model above is
# built with in_channels=4 (the zeros stand in for the infrared band).
# infrared = np.random.randint(0, 256, size=(n_tiles, 224, 224, 1))
# Shape the extra channel from `tiles` itself rather than from hard-coded
# (n_tiles, 224, 224): the slice tiles[:n_tiles] may hold fewer than n_tiles
# tiles, and the tile size is then not assumed.
infrared = np.zeros(tiles.shape[:-1] + (1,))
tiles_with_infrared = np.concatenate([tiles, infrared], axis=3)
# Sanity check: channel values of the top-left pixel of the first tile.
tiles_with_infrared[0, 0, 0]

array([ 94., 108., 117.,   0.])

## Show some tiles with or without histogram equalization and with red mark

In [None]:
# Preview the tiles three times with the same nrows=2, ncols=3 layout:
# unmodified, with histogram equalization, and with a red mark.
# What exactly is drawn is decided inside show_some_tiles_images.
print('Natural tiles:')
show_some_tiles_images(tiles, nrows=2, ncols=3, n_tiles=n_tiles)
print('Now with histogram equalization:')
show_some_tiles_images(tiles, nrows=2, ncols=3, n_tiles=n_tiles, with_hist_eq=True)
print('Now with red mark:')
show_some_tiles_images(tiles, nrows=2, ncols=3, n_tiles=n_tiles, with_red_mark=True)

## Embed Manhattan tiles

In [None]:
# # Embed tiles
# embeddings = np.zeros((n_tiles, 2048))
# for idx in range(n_tiles):
#     tile = tiles_with_infrared[idx]
#     # Rearrange to PyTorch order
#     tile = np.moveaxis(tile, -1, 0)
#     tile = np.expand_dims(tile, axis=0)
#     # Scale to [0, 1]
#     tile = tile / 255
#     # Embed tile
#     tile = torch.from_numpy(tile).float()
#     tile = Variable(tile)
#     if cuda: tile = tile.cuda(1)
#     z = tilenet.encode(tile)
#     if cuda: z = z.cpu()
#     z = z.data.numpy()
#     embeddings[idx] = z

# Embed tiles in mini-batches and stack the per-batch results row-wise.
batch_size = 25
# Run on whatever device the model already lives on, so that the inputs and
# the weights can never end up on different devices.
device = next(tilenet.parameters()).device
z_lst = []
# Inference only: without no_grad every batch would keep its autograd graph.
with torch.no_grad():
    for i in tqdm(range(0, tiles_with_infrared.shape[0], batch_size), desc='Embedding tiles', unit='batch'):
        tile_batch = tiles_with_infrared[i: i + batch_size]
        # Channels-last -> channels-first, then divide by 255
        # (scales to [0, 1] assuming 8-bit pixel values — TODO confirm).
        tiles_torch = torch.from_numpy(np.moveaxis(tile_batch, -1, 1) / 255).float().to(device)
        z = tilenet.encode(tiles_torch)
        z_lst.append(z.cpu().numpy())
embeddings = np.vstack(z_lst)
embeddings.shape



HBox(children=(FloatProgress(value=0.0, description='Embedding tiles', max=640.0, style=ProgressStyle(descript…

In [None]:
## Show embedding on tensorboard

In [None]:
# run it only if you want to delete the logs_dir
# WARNING: destructive and run with sudo. {logs_dir} is interpolated from the
# Python variable; if it is ever empty the command expands to `rm -r /*`.
# This cell also runs on "Restart & Run All".
!sudo rm -r {logs_dir}/*

In [None]:
# The 5-component PCA projection is used here only to detect duplicated
# embeddings; the projector below receives the full embeddings.
pca = PCA(n_components=5)
emb_pca = principalComponents = pca.fit_transform(embeddings)
emb_pca_unique, indexes_unique = np.unique(emb_pca, axis=0, return_index=True)
n_vis = 100  # number of images that will be visualized
# Sample without replacement so the same tile is never shown twice, and cap
# the sample size at the number of distinct embeddings available.
n_vis = min(n_vis, len(indexes_unique))
vis_indexes = np.random.choice(indexes_unique, n_vis, replace=False)
# Thumbnails for the projector: channels-first, divided by 255.
label_im = np.moveaxis(tiles[vis_indexes].astype(int) / 255, -1, 1)
# The context manager closes the writer even if add_embedding raises.
# NOTE: `comment` has no effect when an explicit log_dir is given.
with SummaryWriter(log_dir=logs_dir, comment='embedding_ermongroup') as writer:
    writer.add_embedding(torch.from_numpy(embeddings[vis_indexes]), label_img=torch.from_numpy(label_im), tag='emb_ermongroup')

In [None]:
!kill 86567
%tensorboard --logdir {logs_dir} --port=8115 --host 0.0.0.0

## Train many regressors

In [None]:
# Regressors to compare; the list is handed to init_pipeline here and to
# fit_and_score_models_on_datasets later on.
models = [
    svm.SVR(verbose=False),
    neighbors.KNeighborsRegressor(n_neighbors=10),
    LinearRegression(),
    tree.DecisionTreeRegressor(),
    GradientBoostingRegressor(verbose=False),
    AdaBoostRegressor(),
    RandomForestRegressor(verbose=False),
    CatBoostRegressor(verbose=False),
]
pipe_dict = init_pipeline(models)
task_handler = pipe_dict.pop('task_handler')
# Keep only the first n_tiles entries of every remaining pipeline item.
pipe_dict = {name: item[:n_tiles] for name, item in pipe_dict.items()}
# Publish every pipeline item as a notebook-level variable.
# NOTE(review): names used below that are defined nowhere else in this
# notebook (features_without_geo, unique_coords_idx, price, ...) presumably
# come from here — confirm against init_pipeline.
locals().update(pipe_dict)
features_without_geo_unique_coords = features_without_geo.values[unique_coords_idx]
emb_unique_coords = embeddings[unique_coords_idx]

### Concat data to embedding

In [None]:
# Put the tile embeddings in front of each tabular feature matrix (column-wise).
features_without_geo_unique_coords_with_emb = np.concatenate(
    (emb_unique_coords, features_without_geo_unique_coords),
    axis=1,
)
all_features_unique_coords_with_emb = np.concatenate(
    (emb_unique_coords, all_features_unique_coords.values),
    axis=1,
)

### PCA

In [None]:
# Reduce the embeddings to 5 principal components, then put those components
# in front of each tabular feature matrix (column-wise).
pca = PCA(n_components=5)
emb_pca = principalComponents = pca.fit_transform(emb_unique_coords)
features_without_geo_unique_coords_with_emb_pca = np.concatenate(
    (emb_pca, features_without_geo_unique_coords),
    axis=1,
)
all_features_unique_coords_with_emb_pca = np.concatenate(
    (emb_pca, all_features_unique_coords.values),
    axis=1,
)

### Create data_dict and fit and score all the models on each dataset in data_dict

In [None]:
# Target values for the rows selected by unique_coords_idx.
price_unique_coords = price[unique_coords_idx]
# Maps dataset name -> (features, target).
data_dict = {
    'all_features_unique_coords': (all_features_unique_coords, price_unique_coords),
    'only_geo_features_unique_coords': (only_geo_features_unique_coords, price_unique_coords),
    'features_without_geo_unique_coords': (features_without_geo.values[unique_coords_idx], price_unique_coords),
    'features_without_geo_unique_coords_with_emb': (features_without_geo_unique_coords_with_emb, price_unique_coords),
    'all_features_unique_coords_with_emb': (all_features_unique_coords_with_emb, price_unique_coords),
    'features_without_geo_unique_coords_with_emb_pca': (features_without_geo_unique_coords_with_emb_pca, price_unique_coords),
    'all_features_unique_coords_with_emb_pca': (all_features_unique_coords_with_emb_pca, price_unique_coords),
}

print('start fitting:')
results_df = fit_and_score_models_on_datasets(models, data_dict)
# Label the axes: rows are models, columns are datasets.
results_df.index.name = 'model'
results_df.columns.name = 'dataset'
# Print the single lowest score together with its (model, dataset) labels.
best_model, best_dataset = results_df.stack().idxmin()
print(results_df.loc[[best_model], [best_dataset]])
results_df

### Find the best-performing model per dataset

In [None]:
# Per dataset (column): the row label of the lowest score and the score itself.
best_model_df_min = results_df.min(axis='index')
best_model_df_argmin = results_df.idxmin(axis='index')
pd.DataFrame({
    'best performence model': best_model_df_argmin,
    'mae': best_model_df_min,
})

### Find the best-performing dataset per model

In [None]:
# Per model (row): the column label of the lowest score and the score itself.
best_dataset_df_min = results_df.min(axis='columns')
best_dataset_df_argmin = results_df.idxmin(axis='columns')
pd.DataFrame({
    'best performence dataset': best_dataset_df_argmin,
    'mae': best_dataset_df_min,
})

### Full results table, transposed (datasets as rows, models as columns)

In [None]:
# Show the full score table transposed: one row per dataset, one column per model.
results_df.T

## Predict some samples

In [None]:
# NOTE(review): X_no_emb, train_test_split, my_z_score_norm and
# train_models_from_splitted_data are neither defined nor imported in the
# visible part of this notebook — presumably they came from the star-imports
# that are commented out in the import cell. Confirm before running on a
# fresh kernel.
X = X_no_emb
y = price

def train_catboost(X, y, seed=None):
    """Train a CatBoostRegressor on a random train/test split of (X, y).

    Args:
        X: feature matrix, indexed positionally by an integer index array.
        y: target values, indexed the same way; ``y.shape[0]`` gives the
            number of samples.
        seed: optional seed for NumPy's global RNG. ``None`` (the default)
            reseeds from fresh entropy, so every call produces a different
            split; pass an int for a reproducible shuffle.

    Returns:
        (model, X_normalizer, X_test, y_test) where ``X_test`` is the
        un-normalized test part and ``X_normalizer`` is the scaler object
        returned by ``my_z_score_norm``.
    """
    n_samples = y.shape[0]
    indexes = np.arange(n_samples)
    np.random.seed(seed)
    np.random.shuffle(indexes)
    train_ind, test_ind = train_test_split(indexes)
    X_train, X_test = X[train_ind], X[test_ind]
    y_train, y_test = y[train_ind], y[test_ind]
    X_norm_train, X_norm_test, X_normalizer = my_z_score_norm(X_train, X_test, return_scalers=True)
    args_tuple = ([CatBoostRegressor(verbose=False)], X_norm_train, y_train, X_norm_test, y_test)
    # The helper's result is indexed [0][0] to get the single trained model.
    model = train_models_from_splitted_data(*args_tuple)[0][0]
    return model, X_normalizer, X_test, y_test

model, X_normalizer, X_test, y_test = train_catboost(X, y)

In [None]:
# Number of random test samples passed to get_example_of_predictions below.
n = 11
def get_example_of_predictions(model, X_normalizer, X_test, y_test, n):
    """Compare predictions with the ground truth on ``n`` random test samples.

    Args:
        model: fitted estimator exposing ``predict``.
        X_normalizer: fitted scaler exposing ``transform``; applied to the
            sampled rows before prediction.
        X_test: un-normalized test features, indexable by an integer array.
        y_test: test targets (pandas Series or array-like), aligned
            positionally with ``X_test``.
        n: number of samples to draw (with replacement).

    Returns:
        DataFrame with integer columns 'Prediction', 'Truth' and 'Diff'
        (absolute difference), one row per drawn sample.

    Raises:
        ValueError: if ``X_test`` is empty.
    """
    n_test = len(X_test)
    if n_test == 0:
        raise ValueError('X_test is empty; there is nothing to sample predictions from')
    # Draw indices from the real size of the test set instead of a fixed bound.
    ind = np.random.randint(0, n_test, size=n)
    y_pred = np.asarray(model.predict(X_normalizer.transform(X_test[ind]))).astype(int)
    # np.asarray accepts both a pandas Series and a plain array.
    gt = np.asarray(y_test)[ind].astype(int)
    return pd.DataFrame({'Prediction': y_pred, 'Truth': gt, 'Diff': np.abs(y_pred - gt)})
    
# Build the prediction-vs-truth table for n random test samples and display it.
df = get_example_of_predictions(model, X_normalizer, X_test, y_test, n)
df

## garbage

In [None]:
# Scratch definition: rebinds data_dict to a smaller set of datasets
# (no embedding-based ones), each as a (features, target) pair.
data_dict = dict(
    all_features_unique_coords=(all_features_unique_coords, price[unique_coords_idx]),
    only_geo_features_unique_coords=(only_geo_features_unique_coords, price[unique_coords_idx]),
    features_without_geo_unique_coords=(features_without_geo.values[unique_coords_idx], price[unique_coords_idx]),
    all_features=(all_features, price),
    features_without_geo=(features_without_geo, price),
)