# Dashboard experiments
---

A scratch notebook of modelling and data-visualization experiments used to prototype the dashboard.

## Setup

In [None]:
from glob import glob
from functools import partial
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from fastai.vision.all import (
    load_learner, 
    Normalize, 
    imagenet_stats, 
    DataBlock, 
    ImageBlock,
    MultiCategoryBlock,
    RandomSplitter,
    ColReader,
    Resize,
    aug_transforms,
    MixUp,
    cnn_learner,
    resnet50
)
from fastai.metrics import accuracy_multi, FBetaMulti
import torch
from torchviz import make_dot
from skimage.io import imread
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
import colorlover as cl
from google.cloud import storage, bigquery
import io
import os

In [None]:
import sys
sys.path.append("../data/")
import data_utils

In [None]:
# Directory holding the exported fastai learner (read by `load_learner` below).
MODEL_PATH = "../modeling/"
# Root of the local datasets. File names are built by plain string concatenation
# (e.g. f"{DATA_PATH}{img_path}{sample_name}.jpg"), so the trailing "/" matters.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_PATH = "/Users/andrecnf/Documents/datasets/fsdl/"
# Sub-directory (relative to DATA_PATH) containing the training JPEGs.
img_path = "planet/planet/train-jpg/"

In [None]:
perf_colors = cl.scales['8']['div']['RdYlGn']

## Modelling

In [None]:
model = load_learner(f"{MODEL_PATH}resnet50-128.pkl")

In [None]:
model.model

In [None]:
# Switch the network to evaluation mode. `eval()` returns the module itself, so
# the trailing `;` keeps the notebook from printing the whole architecture again.
#
# The bare `torch.no_grad()` statement that followed was removed: called as a
# plain statement it only builds a context manager and throws it away, so it
# never disabled gradient tracking. Autograd is deliberately left enabled because
# the `make_dot` rendering further down needs the graph behind `pred_logits`.
model.model.eval();

In [None]:
len(model.dls.splits[0])

In [None]:
len(model.dls.splits[1])

In [None]:
model.dls[0]

In [None]:
model.predict?

In [None]:
# img = imread(f"{DATA_PATH}widsdatathon2019/leaderboard_holdout_data/img_000012018.jpg")
sample_name = "train_0"
img = imread(f"{DATA_PATH}{img_path}{sample_name}.jpg")

In [None]:
img.shape

In [None]:
model.predict(img)

In [None]:
# file_paths = glob(f"{DATA_PATH}widsdatathon2019/leaderboard_holdout_data/*.jpg")
file_paths = sorted(glob(f"{DATA_PATH}{img_path}*.jpg"))
file_paths[:5]

In [None]:
# [file_path.split("/")[-1] for file_path in file_paths]

In [None]:
# Load the first `n_samples` images into a single array with a leading sample
# axis (assumes every image has the same shape, as the old concatenation did).
# Collecting the images and stacking once avoids re-copying the whole array on
# every iteration, which is what `np.concatenate` inside the loop was doing; the
# placeholder array and the manual counter are no longer needed either.
n_samples = 100
imgs = np.stack([imread(file_paths[i]) for i in tqdm(range(n_samples))])

In [None]:
# Run the learner one image at a time and keep element [1] of each tuple
# returned by `model.predict`.
pred = [model.predict(imgs[idx])[1] for idx in tqdm(range(n_samples))]

In [None]:
pred

In [None]:
# def load_and_preprocess_image(img_path):
#     img = imread(file_paths[i])
#     img = np.expand_dims(img, axis=0)
#     return img

In [None]:
# from joblib import Parallel, delayed
# list_of_images = Parallel(n_jobs=3)(delayed(load_and_preprocess_image)(img_path) for img_path in tqdm(file_paths))
# list_of_images

In [None]:
imgs.shape

In [None]:
# model.predict(imgs)  # Can't directly use predict on multiple images

In [None]:
# model.data.add_test(imgs)

In [None]:
# Apply the ImageNet normalisation so the images match what was used in training.
# NOTE(review): this call does not seem to change `imgs` at all. Presumably that
# is because `imgs` is still a plain NumPy array here and fastai's `Normalize`
# only acts on its own tensor image types -- TODO confirm. If so, the raw pixel
# values are what reach `model.model(imgs)` below, which may not match the
# preprocessing the network was trained with.
imgs = Normalize.from_stats(*imagenet_stats)(imgs)
imgs

In [None]:
# NumPy array -> float torch tensor with axis 3 moved to position 1
# (assumes channels-last input, i.e. (N, H, W, C) -> (N, C, H, W)).
imgs = torch.from_numpy(imgs).permute((0, 3, 1, 2)).float()

In [None]:
pred_logits = model.model(imgs)
pred_logits

In [None]:
fig = make_dot(pred_logits.mean(), params=dict(model.model.named_parameters()))
fig

In [None]:
type(fig)

In [None]:
fig.save?

In [None]:
pred_proba = torch.sigmoid(pred_logits)
pred_proba

In [None]:
pred = torch.round(pred_proba)
pred

In [None]:
labels_df = pd.read_csv(f"{DATA_PATH}planet/planet/train_classes.csv")
labels_df

In [None]:
# Order the rows by image name (rebinding the name rather than sorting in place).
labels_df = labels_df.sort_values("image_name")
labels_df

In [None]:
labels_df = data_utils.encode_tags(labels_df, drop_tags_col=True)
labels_df

In [None]:
# labels_df.to_csv(f"{DATA_PATH}planet/planet/train_classes_ohe.csv", index=False)

In [None]:
labels = labels_df.iloc[:n_samples, 1:].values
labels = torch.from_numpy(labels)
labels

In [None]:
pred == labels

In [None]:
acc = float(accuracy_multi(inp=pred_logits, targ=labels, thresh=0.2))
acc

In [None]:
# F2 score (sample-averaged) over the same samples as the accuracy above.
# `FBetaMulti` applies the sigmoid and the `thresh` cut-off itself, just like
# `accuracy_multi`, so it has to be fed the raw logits. Passing the already
# rounded 0/1 predictions would put every value above the threshold once the
# sigmoid is applied, i.e. every tag would count as predicted.
fbeta = FBetaMulti(beta=2, average="samples", thresh=0.2)(preds=pred_logits, targs=labels)
fbeta

In [None]:
labels_df = pd.read_csv(f"{DATA_PATH}planet/planet/train_classes.csv")
labels_df

In [None]:
def get_data(size=224,bs=64,data_df=labels_df):
    """Build fastai dataloaders for the multi-label image dataset.

    Args:
        size: Value handed to `Resize` as the per-item transform.
        bs: Batch size forwarded to `DataBlock.dataloaders`.
        data_df: DataFrame read by position: column 0 is turned into an image
            path (DATA_PATH + img_path + value + ".jpg"), column 1 is split on
            spaces into labels. The default is whatever `labels_df` is bound to
            at the moment this cell is executed.

    Returns:
        The object returned by `DataBlock.dataloaders`.
    """
    read_image = ColReader(0, pref=f"{DATA_PATH}{img_path}", suff=".jpg")
    read_tags = ColReader(1, label_delim=" ")
    augmentations = aug_transforms(flip_vert=True, max_lighting=0.1, max_zoom=1.05, max_warp=0.)
    batch_pipeline = [*augmentations, Normalize.from_stats(*imagenet_stats)]
    dblock = DataBlock(
        blocks=(ImageBlock, MultiCategoryBlock),
        splitter=RandomSplitter(seed=42),  # fixed seed -> reproducible split
        get_x=read_image,
        get_y=read_tags,
        item_tfms=Resize(size),
        batch_tfms=batch_pipeline,
    )
    return dblock.dataloaders(data_df, bs=bs)

In [None]:
dls = get_data(128, 256)

In [None]:
dls.show_batch()

In [None]:
len(dls.splits[1])

In [None]:
dls.splits[1] == model.dls.splits[1]

In [None]:
metrics = [partial(accuracy_multi, thresh=0.2), FBetaMulti(beta=2, average='samples', thresh=0.2)]
cbs = [MixUp]
learn = cnn_learner(dls, resnet50, metrics=metrics, cbs=cbs).to_fp16()

In [None]:
learn.model = model.model

In [None]:
# `get_preds` returns (predictions, targets): the second item is the ground
# truth, not the predicted labels. Keep it under its own name and derive the
# hard 0/1 predictions from the probabilities, using the same rounding as the
# manual forward pass earlier in the notebook.
pred_proba, targs = learn.get_preds(ds_idx=0)
pred = torch.round(pred_proba)

## Data visualization

In [None]:
# Turn the accuracy into a percentage for display (the gauge axis is 0-100).
# NOTE: this rebinds `acc` in place, so running the cell twice scales it twice.
acc *= 100

In [None]:
# Pick the colour-scale entry for the accuracy (a percentage at this point).
# The 0-100 range is split into len(perf_colors) equal bands and the index is
# clamped to the scale. Previously the last colour was only reachable at exactly
# 100 %, the first band was twice as wide as the others, and a value above 100
# indexed past the end of the scale.
n_colors = len(perf_colors)
color = perf_colors[min(max(int(acc / 100 * n_colors), 0), n_colors - 1)]

In [None]:
color

In [None]:
acc

In [None]:
# Gauge for the accuracy percentage; the bar fills the full gauge thickness and
# is drawn in the previously selected `color`.
gauge_bar = {"thickness": 1, "color": color}
gauge_cfg = {"axis": {"range": [0, 100]}, "bar": gauge_bar}
accuracy_indicator = go.Indicator(
    mode="gauge+number",
    value=acc,
    domain={"x": [0, 1], "y": [0, 1]},
    gauge=gauge_cfg,
    title={"text": "Accuracy"},
)
fig = go.Figure(accuracy_indicator)
fig.update_layout(margin={"l": 25, "r": 40, "b": 0, "t": 0, "pad": 0}, height=380)
fig.show()

In [None]:
fig = px.histogram(labels)

In [None]:
fig.data[0]

In [None]:
# Rename each trace from its positional index to the tag name at the same
# position in `data_utils.TAGS`, both in the legend and in the hover text.
for idx, tag_name in enumerate(data_utils.TAGS):
    trace = fig.data[idx]
    trace.name = tag_name
    trace.hovertemplate = trace.hovertemplate.replace(str(idx), tag_name)
fig

In [None]:
fig = px.histogram(pred)

In [None]:
# Rename each trace from its positional index to the tag name at the same
# position in `data_utils.TAGS` (legend and hover text), then drop the margins.
for idx, tag_name in enumerate(data_utils.TAGS):
    trace = fig.data[idx]
    trace.name = tag_name
    trace.hovertemplate = trace.hovertemplate.replace(str(idx), tag_name)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0, "pad": 0})
fig

In [None]:
fig = go.Figure(go.Indicator(
    mode="number",
    value=len(labels_df),
    title=dict(text="Samples")
))
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0, pad=0), height=150)
fig.show()

In [None]:
imgs.shape

In [None]:
# One row per index of axis 1 (the channel axis after the earlier permute):
# bring that axis to the front, then flatten everything else into a single row.
# The final cast gives float64, NumPy's default float dtype.
n_channels = imgs.shape[1]
imgs_flat = imgs.transpose(0, 1).reshape(n_channels, -1).numpy().astype(np.float64)

In [None]:
imgs_flat.shape

In [None]:
# px.histogram(imgs_flat)  # This would take too long to run; it's better to calculate the data manually

In [None]:
pixel_min = int(np.min(imgs_flat))
pixel_min

In [None]:
pixel_max = int(np.max(imgs_flat))
pixel_max

In [None]:
# Integer bin edges, one unit wide. `range` excludes its stop value, so stopping
# at `pixel_max` left the top edge at pixel_max - 1 and `np.histogram` silently
# ignored every pixel above it, including the maximum itself. Running the edges
# up to pixel_max + 1 gives the largest value its own final bin.
bins = list(range(pixel_min, pixel_max + 2))

In [None]:
# Histogram counts per row of `imgs_flat` (one row per channel), stored as floats.
y = np.array(
    [np.histogram(channel_values, bins)[0] for channel_values in imgs_flat],
    dtype=float,
)

In [None]:
# Midpoint of every pair of consecutive bin edges.
bin_centers = [(left + right) / 2 for left, right in zip(bins[:-1], bins[1:])]

In [None]:
y.shape

In [None]:
len(bin_centers)

In [None]:
# Tabulate the per-channel counts, indexed by bin centre.
# NOTE(review): rows 0/1/2 of `y` are taken to be red/green/blue -- confirm.
pixels_df = pd.DataFrame(
    {"pixel_value": bin_centers, "blue": y[2], "red": y[0], "green": y[1]}
).set_index("pixel_value")
pixels_df

In [None]:
fig = px.line(pixels_df, title="Distribution of pixel values per channel")
fig.update_layout(
    yaxis_title="count",
    margin=dict(l=0, r=0, b=0, t=50, pad=0), 
    height=300
)
fig

## Data loading from cloud

In [None]:
# Point the Google client libraries at a service-account key file, but only when
# the environment does not already provide one, so credentials configured outside
# the notebook are not overridden by this machine-specific path.
# NOTE(review): the fallback is an absolute local path; adjust per machine.
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/andrecnf/fsdl-305310-c35340ed449c.json"

In [None]:
storage_client = storage.Client()

In [None]:
bucket_name = "planet_amazon"
bucket = storage_client.bucket(bucket_name)

In [None]:
# Download the labels CSV from the bucket and parse it straight from memory.
# `download_as_bytes` replaces the deprecated `download_as_string` alias and
# matches the image download in the next cell. Each intermediate value also gets
# its own name instead of rebinding `blob` to four different types in a row.
labels_blob = bucket.get_blob("train_classes.csv")
labels_csv = labels_blob.download_as_bytes().decode("utf-8")
pd.read_csv(io.StringIO(labels_csv))

In [None]:
# Fetch one training image from the bucket and decode it from an in-memory buffer.
blob = bucket.get_blob("train-jpg/train_0.jpg")
blob = io.BytesIO(blob.download_as_bytes())
imread(blob)

In [None]:
# Names of the bucket objects that contain ".jpg", reduced to the part after
# the last "/" and before the first ".".
jpg_names = (
    f.name for f in storage_client.list_blobs("wids_oil_palm") if ".jpg" in f.name
)
files_list = [name.split("/")[-1].split(".")[0] for name in jpg_names]
files_list

In [None]:
pd.read_gbq("SELECT * FROM `fsdl-305310.deforestation_data.planet_labels`")

In [None]:
from datetime import datetime

# A single made-up feedback record: who/when/which image, the user's feedback,
# and one 0/1 flag per model output tag (all zero here).
user_id = 42
image_id = 50
test_data = {
    "user_id": user_id,
    "ts": datetime.now(),
    "image_id": image_id,
    "user_feedback_positive": 1,
    "user_comment": "Good job 👍",
    "output_agriculture": 0,
    "output_artisinal_mine": 0,
    "output_bare_ground": 0,
    "output_blooming": 0,
    "output_blow_down": 0,
    "output_clear": 0,
    "output_cloudy": 0,
    "output_conventional_mine": 0,
    "output_cultivation": 0,
    "output_habitation": 0,
    "output_haze": 0,
    "output_partly_cloudy": 0,
    "output_primary": 0,
    "output_road": 0,
    "output_selective_logging": 0,
    "output_slash_burn": 0,
    "output_water": 0,
}
# Series -> one-column frame -> transposed, giving one row with a column per key.
test_df = pd.Series(test_data).to_frame().T
test_df

In [None]:
test_df.dtypes

In [None]:
# Cast the columns to concrete dtypes ahead of the schema generation and upload
# in the following cells: integer ids, string comment, boolean feedback and
# model outputs. `ts` is left as it is.
bool_columns = [
    "user_feedback_positive",
    "output_agriculture",
    "output_artisinal_mine",
    "output_bare_ground",
    "output_blooming",
    "output_blow_down",
    "output_clear",
    "output_cloudy",
    "output_conventional_mine",
    "output_cultivation",
    "output_habitation",
    "output_haze",
    "output_partly_cloudy",
    "output_primary",
    "output_road",
    "output_selective_logging",
    "output_slash_burn",
    "output_water",
]
column_dtypes = {"user_id": int, "image_id": int, "user_comment": str}
column_dtypes.update({column: bool for column in bool_columns})
test_df = test_df.astype(column_dtypes)

In [None]:
test_df.dtypes

In [None]:
from pandas_gbq import schema
schema.generate_bq_schema(test_df)

In [None]:
test_df.to_gbq("user_data.playground_uploads", if_exists="append")

In [None]:
sample_name = "train_0"
img = imread(f"{DATA_PATH}{img_path}{sample_name}.jpg")

In [None]:
# Encode `img` as JPEG in memory, then upload the bytes to the bucket as "1.jpg".
bucket_name = "playground_images"
bucket = storage_client.bucket(bucket_name)
jpeg_buffer = io.BytesIO()
pil_img = Image.fromarray(img)
pil_img.save(jpeg_buffer, "jpeg")
pil_img.close()
blob = bucket.blob("1.jpg")
blob.upload_from_string(jpeg_buffer.getvalue(), content_type="image/jpeg")

In [None]:
bq_client = bigquery.Client()

In [None]:
# Delete the playground row(s) belonging to one user/image pair.
# The ids are bound as named query parameters instead of being formatted into
# the SQL text, and `.result()` blocks until the DML job has finished so that a
# failure is raised in this cell rather than going unnoticed.
user_id = 42
image_id = 50
dml_statement = (
    "DELETE user_data.playground_uploads "
    "WHERE (user_id = @user_id AND image_id = @image_id)"
)
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("user_id", "INT64", user_id),
        bigquery.ScalarQueryParameter("image_id", "INT64", image_id),
    ]
)
query_job = bq_client.query(dml_statement, job_config=job_config)
query_job.result()