<a href="https://colab.research.google.com/github/jonaden94/deepl-ecker-21-22/blob/main/feature_extraction_pictures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Preparation

In [1]:
#@title remove repos from disc
%cd /content
!rm -r airbnb_price

/content


In [2]:
#@title Clone repo
# Clone the project repository; without a target argument git creates the
# directory "airbnb_price" inside the current working directory.
!git clone https://github.com/dernameistegal/airbnb_price.git

Cloning into 'airbnb_price'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 51 (delta 17), reused 24 (delta 2), pack-reused 0[K
Unpacking objects: 100% (51/51), done.


In [3]:
#@title add paths to library search path
import sys

# Append both repository directories to the module search path in one call,
# so modules located in them can be imported by the following cells.
sys.path.extend([
    "/content/airbnb_price/custom_functions",
    "/content/airbnb_price/feature_extraction",
])

In [121]:
#@title Imports and drive
import os
import torch
import torchvision
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# own modules
import general_utils as gu
import feature_extraction_utils as fu



from google.colab import drive

# Mount Google Drive under /content/drive/ (the data paths used below live there).
# NOTE(review): force_remount=True presumably re-mounts even if a mount already exists — confirm.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [5]:
#@title define device

# device: chosen by the project helper gu.get_device(); passed to the
# feature-extraction call further down
device = gu.get_device()
# number of CPUs as reported by os.cpu_count() (printed for information only)
num_cpus = os.cpu_count()
print(num_cpus, 'CPUs available')

cuda available: True ; cudnn available: True ; num devices: 1
Using device Tesla K80
2 CPUs available


# 1. Data Cleaning

In [111]:
#@title function to detect one-channel images

from tqdm import tqdm
import numpy as np


def detect_one_channel_pics(data_dir):
    """Return the names of the files in ``data_dir`` that are not 3-axis images.

    Every entry of ``data_dir`` is loaded with ``np.load`` and transposed with
    the axis order ``(2, 0, 1)``. A file is reported when either step raises,
    which is the case for arrays that do not have exactly three axes (e.g.
    single-channel pictures stored as 2-D arrays) and for files that cannot be
    loaded at all.

    Parameters
    ----------
    data_dir : str
        Directory that contains the ``.npy`` picture files.

    Returns
    -------
    list of str
        File names (not full paths) for which loading or transposing failed,
        in the order returned by ``os.listdir``.
    """
    exceptions_file_names = []

    for file_name in tqdm(os.listdir(data_dir)):
        try:
            np.transpose(np.load(os.path.join(data_dir, file_name)), (2, 0, 1))
        except Exception:
            # `Exception` instead of a bare `except` so that interrupting the
            # (long-running) scan with KeyboardInterrupt still stops it
            exceptions_file_names.append(file_name)

    return exceptions_file_names

In [117]:
#@title calculate file_names where pictures only have one channel and get corresponding picture ids
hostpics_dir = "/content/drive/MyDrive/Colab/airbnb/data/hostpics"

exceptions_file_names_npy = detect_one_channel_pics(hostpics_dir)

# get pure ids that correspond to these pictures: drop the "hostpic" marker
# and the ".npy" extension in a single pass over the file names
exceptions_file_names = [
    npy_name.replace("hostpic", "").replace(".npy", "")
    for npy_name in exceptions_file_names_npy
]

100%|██████████| 11375/11375 [00:30<00:00, 375.61it/s]


In [120]:
#@title check host_names of listings with generic picture to verify that this is the picture when host did not provide own picture
# instantiate listings_meta (has to happen before it is queried further down)
listings_meta_path = "/content/drive/MyDrive/Colab/airbnb/data/data1/listings.csv.gz"
listings_meta = pd.read_csv(listings_meta_path)

# see which of these pictures are generic pictures: the first flagged picture
# serves as reference, every flagged picture that is element-wise identical to
# it is treated as generic
generic = []

if exceptions_file_names_npy:
    # the reference never changes, so it is loaded once instead of once per iteration
    reference = np.load(os.path.join(hostpics_dir, exceptions_file_names_npy[0]))

    for file_name_npy, picture_id in zip(exceptions_file_names_npy, exceptions_file_names):
        temp = np.load(os.path.join(hostpics_dir, file_name_npy))
        if np.array_equal(reference, temp):
            generic.append(picture_id)

# get boolean mask of listings whose id corresponds to a generic picture;
# the ids in `generic` are strings (taken from file names), so the id column is
# compared as strings as well instead of relying on implicit casting
ind_generic = listings_meta["id"].astype(str).isin(generic)
ind_generic = list(ind_generic)

listings_meta.loc[ind_generic, "host_name"]


1451           Paul
2205            Eli
2552         Olivia
3163        Ugljesa
3952          Klara
            ...    
11265      Johannes
11301        Rafael
11310       Michael
11323    Ulziibuyan
11344        Daniel
Name: host_name, Length: 69, dtype: object

In [151]:
#@title convert one channel images to grey scale and overwrite original images for compatibility with neural net
for file_name in exceptions_file_names_npy:
    file_path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/" + file_name
    x = np.load(file_path)

    # Only 2-D (single-channel) arrays are converted. The files are overwritten
    # in place, so without this guard a second run of the cell would expand the
    # already converted 3-D arrays once more.
    if x.ndim != 2:
        continue

    # add axis and repeat 3 times for 3 channels
    x = np.repeat(x[..., np.newaxis], 3, axis=2)

    # get max pixel value of image
    max_pixel_value = np.max(x)

    # scale pixel values to rgb range; an all-zero image has nothing to scale
    # and would otherwise cause a division by zero
    if max_pixel_value > 0:
        x = np.round(x * (255/max_pixel_value))
    x = x.astype(int)

    np.save(file_path, x)


In [1]:
# NOTE(review): looks like a leftover scratch cell — `x` is assigned again by the
# feature-extraction cell below and this value is not read in between.
x = 5

# 2. Feature Extraction

In [None]:
#@title make dataset

# NOTE(review): `means` and `stds` are not defined in any visible cell of this
# notebook; they have to exist before this cell can run on a fresh kernel
# (TODO confirm the intended values and the meaning of the trailing 10).

# initialize dataset and dataloader
dataset = fu.Dataset(hostpics_dir, means, stds, 10)
# one picture per batch, in the fixed order given by the dataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

# get pretrained model
vgg = torchvision.models.vgg19(pretrained=True)
# keep only the modules 0..30 of vgg.features as feature extractor
feature_extractor = vgg.features[0:31]

# compute features for later training
x = fu.compute_train_features(device, dataloader, feature_extractor)



























RuntimeError: ignored

In [None]:
# imported here because no visible cell of the notebook imports importlib
import importlib

model_name = "pointnet2_2" # model to be trained
# the model definition is a module named after the model; import it by name
model = importlib.import_module(model_name)
# log directory of this model
destination_path = "/content/Pointnet_Pointnet2_pytorch/log/part_seg/" + model_name

In [None]:
# Use pretrained model? If yes run this chunk
source_path = "/content/drive/MyDrive/Colab/tree_learning/trained_models/PartSeg1"  # pretrained model to be used (check for compatibility with model to be trained)
source_path = source_path + "/."
!rm -r $destination_path
!mkdir $destination_path
!cp -a $source_path $destination_path

In [None]:
# generate train and val split to be used in dataloaders
# NOTE(review): [0.7, 0.3] presumably are the train / val shares — verify against gu.gen_split
gu.gen_split(percentages=[0.7, 0.3])

In [None]:
# train model: runs train_partseg.py as a shell command; model_name is passed
# both as the model to train (--model) and as the log directory (--log_dir)
!python train_partseg.py --model $model_name --log_dir $model_name --npoint 8192 --epoch 30 --step_size 30 --batch_size 8 --weight 1

In [None]:
# Save trained model? If yes run this chunk
source_path = "/content/Pointnet_Pointnet2_pytorch/log/part_seg/"
source_path = source_path + model_name
source_path = source_path + "/."
destination_path = "/content/drive/MyDrive/Colab/tree_learning/trained_models/test/test/" # destination path to save trained model and corresponding files

!mkdir -p $destination_path
!cp -a $source_path $destination_path

In [None]:
#@title plot performance
# plot loss and accuracy against epochs
# NOTE(review): `pu` is not imported in any visible cell of this notebook.

performance_dir = "/content/Pointnet_Pointnet2_pytorch/log/part_seg/" + model_name + "/performance/"

# (text passed as first and second argument of pu.plot, file with the recorded values)
performance_plots = [
    ("Loss", "loss.npy"),
    ("Accuracy", "accs.npy"),
    ("weighted Accuracy", "w_accs.npy"),
    ("tree Accuracy", "tree_accs.npy"),
    ("no tree Accuracy", "no_tree_accs.npy"),
    ("miou", "mious.npy"),
]

# one plot per recorded metric, in the order listed above
for metric_label, metric_file in performance_plots:
    pu.plot(metric_label, metric_label, performance_dir + metric_file, yscale='linear')

# 3. Analysis

In [None]:
#@title definition of visual analysis function
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt


def explore2(points, prediction, label, mode="pointcloud", colormode="binary"):
    """Inspect predictions against labels as a 3-D scatter plot or as printed metrics.

    Parameters
    ----------
    points : array-like
        Point coordinates; stacked column-wise with ``label`` and ``prediction``
        into a frame with the columns x, y, z, label, prediction.
    prediction : numpy.ndarray
        Per-point prediction. Compared against 0 and 1 for
        ``colormode="binary"`` and against the 0.5 threshold in histogram
        mode. The array passed in is never modified.
    label : numpy.ndarray
        Per-point ground truth, compared against 0 and 1.
    mode : str
        ``"pointcloud"`` shows a plotly 3-D scatter plot; ``"histogram"``
        prints confusion shares, IoU and accuracy values and draws one
        histogram of the predictions and one of the labels.
    colormode : str
        Only used in pointcloud mode. ``"binary"`` colours the points by
        confusion category (4 = true positive, 2 = false negative,
        3 = false positive, 1 = true negative); any other value colours them
        by the prediction as given.

    Returns
    -------
    None
    """
    # explore results with pointcloud
    if mode == "pointcloud":
        if colormode == "binary":
            # Encode the confusion category on a copy: writing into the array
            # that was passed in would silently change the caller's data and
            # make a second call with the same array behave differently.
            raw_prediction = np.array(prediction)
            prediction = raw_prediction.copy()
            prediction[np.logical_and(label == 1, raw_prediction == 1)] = 4  # true positive
            prediction[np.logical_and(label == 1, raw_prediction == 0)] = 2  # false negative
            prediction[np.logical_and(label == 0, raw_prediction == 1)] = 3  # false positive
            prediction[np.logical_and(label == 0, raw_prediction == 0)] = 1  # true negative

        df = pd.DataFrame(data=np.column_stack((points, label, prediction)),
                          columns=["x", "y", "z", "label", "prediction"])
        # create vector of sizes (two sizes for tree and non-tree points),
        # derived from the label column
        size = df["label"] * 0.02 + 1

        # NOTE(review): opacity=0 is passed on unchanged — confirm that the
        # markers render as intended.
        fig = px.scatter_3d(df, x='x', y='y', z='z',
                            color='prediction',
                            symbol='label', size=size, opacity=0, size_max=5)

        # tight layout
        fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

        fig.show()

    # explore results with histogram
    if mode == "histogram":
        # A prediction of exactly 0.5 counts as positive only. Using ">=" and
        # "<=" side by side would count such a point in two categories.
        predicted_positive = prediction >= 0.5
        predicted_negative = prediction < 0.5

        tp = np.sum(np.logical_and(label == 1, predicted_positive))
        fn = np.sum(np.logical_and(label == 1, predicted_negative))
        fp = np.sum(np.logical_and(label == 0, predicted_positive))
        tn = np.sum(np.logical_and(label == 0, predicted_negative))
        print(f"{tp / len(label)*100:.2f} are true positive")  # true positive
        print(f"{fn / len(label)*100:.2f} are false negative")  # false negative
        print(f"{fp / len(label)*100:.2f} are false positive")  # false positive
        print(f"{tn / len(label)*100:.2f} are true negative")  # true negative

        # intersection over union and accuracy, separately for both classes
        iou_tree = tp / (tp + fp + fn)
        iou_not_tree = tn / (tn + fn + fp)
        acc_tree, acc_not_tree = tp / (tp + fn), tn / (tn + fp)
        # unweighted mean of the two per-class accuracies
        w_acc = (acc_tree + acc_not_tree) / 2
        print(f"the iou_tree is {iou_tree:.4f} the iou_not_tree is {iou_not_tree:.4f}")
        print(f"the miou is {(iou_tree + iou_not_tree) / 2:.4f}")
        print(f"the acc_tree is {acc_tree:.4f} the acc_not_tree is {acc_not_tree:.4f}")
        print(f"the weighted accuracy is {w_acc:.4f}")

        # left: distribution of the predictions, right: distribution of the labels
        fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)
        axs[0].hist(prediction)
        axs[1].hist(label)
        fig.show()

In [None]:
# index of the dataset item that is predicted and analysed in the cells below
treenumber = 15
# passed as `npoints` to the dataset in the prediction cell below
npoints = 2048
# split file handed to the dataset as `splitpath`
split_path = "/content/Pointnet_Pointnet2_pytorch/data/" + "valsplit.npy"
# to select a specific tree, run the following line
#gu.gen_split(paths = [split_path], shuffle=False, percentages=[1])

In [None]:
#@title generate predictions for chosen tree from trained model (saved in result)

# load learned model
def inplace_relu(m):
    """Set ``m.inplace = True`` when the class name of ``m`` contains 'ReLU'.

    Meant to be handed to ``Module.apply`` so that it is called for every
    sub-module; objects of any other class are left untouched.
    """
    if 'ReLU' in m.__class__.__name__:
        m.inplace = True

# build the network defined by the imported model module and move it to `device`
classifier = model.get_model(2, normal_channel=False).to(device)
classifier.apply(inplace_relu)

model_path = "/content/Pointnet_Pointnet2_pytorch/log/part_seg/" + model_name + "/checkpoints/best_model.pth"
# map_location: restore the checkpoint onto the device in use, so a checkpoint
# that was saved on a GPU can also be loaded on a runtime without one
checkpoint = torch.load(model_path, map_location=device)
classifier.load_state_dict(checkpoint['model_state_dict'])

# instantiate dataset (choose from trainsplit, valsplit or nosplit)
root = "/content/Pointnet_Pointnet2_pytorch/data/"

# NOTE(review): `t` and `dset` are not imported in any visible cell of this notebook.
testtransform = t.Compose([t.Normalize()])
TRAIN_DATASET = dset.PartNormalDataset(root=root,
                                  npoints=npoints,
                                  transform=testtransform,
                                  splitpath=split_path,
                                  normal_channel=False, mode="eval")

# predict targets for arbitrary tree number
points, label, target, _, upoints = TRAIN_DATASET[treenumber]
points, label, target = torch.tensor(points), torch.tensor(label), torch.tensor(target)
# add a leading batch axis of size 1 to points and target
points, target = torch.unsqueeze(points, 0), torch.unsqueeze(target, 0)
points, label, target = points.float().to(device), label.long().to(device), target.long().to(device)
# swap axes 1 and 2 before the tensor is passed to the classifier
points = points.transpose(2, 1)


def to_categorical(y, num_classes):
    """One-hot encode the class indices in tensor ``y``.

    Rows of a ``num_classes`` x ``num_classes`` identity matrix are selected
    by the values of ``y``. The result is moved to CUDA when ``y`` itself is
    a CUDA tensor, otherwise it stays on the CPU.
    """
    identity = torch.eye(num_classes)
    class_indices = y.cpu().data.numpy()
    one_hot = identity[class_indices,]
    return one_hot.cuda() if y.is_cuda else one_hot


# forward pass without gradient tracking, network in evaluation mode
with torch.no_grad():
    classifier.eval()
    result = classifier(points, to_categorical(label, 1))[0]


# predicted class per point = index of the largest score along axis 1
preds = torch.argmax(result[0], axis=1)
# drop the batch axis again and keep the first three columns of the points
points = points[0].T
target = target[0]
points = points[:, :3]
# move everything to numpy for the analysis cells
points = points.detach().cpu().numpy()
preds = preds.detach().cpu().numpy()
target = target.detach().cpu().numpy()
# softmax over the same axis the argmax above uses; the dimension is stated
# explicitly because relying on the implicit choice is deprecated
m = torch.nn.Softmax(dim=1)
# probability of class 1 for every point
pred_probabilities = m(result[0])[:,1].detach().cpu().numpy()


In [None]:
#@title Numerical Analysis for the chosen tree
correct = np.sum(preds == target)
# divide by the number of points that were actually compared, so the value
# stays correct even when it differs from the configured `npoints`
print("Accuracy", correct / preds.size)
# confusion values in numbers and histograms of predicted probabilities and true labels
explore2(upoints, pred_probabilities, target, mode="histogram")

In [None]:
# plot binary predictions (yellow = true positive, red = false positive, blue = true negative, purple = false negative)
# a copy is passed so that `preds` keeps its 0/1 values whatever explore2 does
# with its argument, and the cell can be run repeatedly
explore2(upoints, preds.copy(), target, mode="pointcloud", colormode="binary")

In [None]:
# non-binary predictions
# colormode must differ from the default "binary": in binary mode predictions
# equal to 0 or 1 are replaced by confusion categories, which would recode
# probabilities that are exactly 0.0 or 1.0
explore2(upoints, pred_probabilities, np.array(target), mode="pointcloud", colormode="probability")