In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import base64
import io
import json
import requests

In [4]:
csv_file = "/home/jupyter/dataset/subset_1_3/export_data-seagen_subset_1_3-2021-01-08T10:51:42.591419Z/image_classification_1.csv"
save_path_preds = "/home/jupyter/dataset/subset_1_3/subset_1_3_preds_edge_v1_2.csv"
model_url = "http://localhost:8501/v1/models/default:predict"

df = pd.read_csv(csv_file, header=None)
df.rename(columns={0:"split", 1:"path", 2:"gt_label"}, inplace=True)
df['strong_positive'] = 0.0 
df['negative']= 0.0
df['dim_positive'] = 0.0
df['no_tissue'] = 0.0
df['out_of_focus'] = 0.0
df['necrotic'] = 0.0
df['HE'] = 0.0
print(df["gt_label"].value_counts())
df

out_of_focus       2043
no_tissue          1091
negative            836
strong_positive     829
dim_positive        417
HE                  241
Necrotic            110
Name: gt_label, dtype: int64


Unnamed: 0,split,path,gt_label,strong_positive,negative,dim_positive,no_tissue,out_of_focus,necrotic,HE
0,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,out_of_focus,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,strong_positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,out_of_focus,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,no_tissue,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5562,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,strong_positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5563,TEST,gs://seagen-quantiphi/subset_1_3/export_data-s...,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5564,TEST,gs://seagen-quantiphi/subset_1_3/export_data-s...,out_of_focus,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5565,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,out_of_focus,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
BATCH_SIZE = 32
#df = df[df["split"] != 'TRAIN']
equal_parts = int(df.shape[0]/BATCH_SIZE) + 1
df_arr = np.array_split(df, equal_parts)
len(df_arr)

174

In [37]:
for ds in tqdm(df_arr):
    instances = {
            'instances': []}
    for indx in ds.index:
        img_path = "/home/jupyter/dataset/" + ds["path"][indx][22:]
        image_key = ds["path"][indx]
        with io.open(img_path, 'rb') as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
        example = {
            'image_bytes': {'b64': str(encoded_image)},
            'key': image_key
        }
        instances['instances'].append(example)
    
    url = model_url
    response = requests.post(url, data=json.dumps(instances))
    res = response.json()
    for pred in res["predictions"]:
        key = pred['key']
        for i,label in enumerate(pred['labels']):
            ds.loc[ds.path == key, label] = pred['scores'][i]
            
df = pd.concat(df_arr)
df.to_csv(save_path_preds, index=False)

100%|██████████| 174/174 [07:25<00:00,  2.56s/it]


In [5]:
thresh = 0.75
loi = "out_of_focus"
df = pd.read_csv(save_path_preds)
df["max_conf"] = df[["strong_positive", "negative", "dim_positive", "no_tissue", "out_of_focus", "necrotic", "HE"]].max(axis=1)
df["preds"] = df[["strong_positive", "negative", "dim_positive", "no_tissue", "out_of_focus", "necrotic", "HE"]].idxmax(axis=1)

In [6]:
print("Total number of", loi, "images:", df[df["gt_label"] == loi].shape[0])
print("Auto classified as", loi, "with", thresh, "threshold:", df[(df["gt_label"] == loi) & (df[loi]>=thresh)].shape[0])
print("Auto classified as any class with", thresh, "confidence:", df[(df["gt_label"] == loi) & (df["max_conf"]>=thresh)].shape[0])

Total number of out_of_focus images: 2043
Auto classified as out_of_focus with 0.75 threshold: 1052
Auto classified as any class with 0.75 confidence: 1416


In [8]:
i = 0
for indx in df[df["gt_label"] == loi].index:
    i += 1
    if df["max_conf"][indx] >= thresh:
        df["gt_label"][indx] = df["preds"][indx]
    else:
        df["gt_label"][indx] = ""
    
print(i)

2043


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [10]:
print("Unlabeled count:", df[df["gt_label"] == ""].shape[0])

Unlabeled count: 627


In [11]:
df = df[["split", "path", "gt_label", ]]
df

Unnamed: 0,split,path,gt_label
0,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,no_tissue
1,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,negative
2,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,strong_positive
3,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,out_of_focus
4,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,no_tissue
...,...,...,...
5562,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,strong_positive
5563,TEST,gs://seagen-quantiphi/subset_1_3/export_data-s...,negative
5564,TEST,gs://seagen-quantiphi/subset_1_3/export_data-s...,
5565,TRAIN,gs://seagen-quantiphi/subset_1_3/export_data-s...,


In [12]:
df.to_csv("/home/jupyter/dataset/subset_1_3/automl_subset_1_3_auto_annotated_oof.csv", index=False, header=None)