In [None]:
import pandas as pd
import random
import missingno as msno
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the predictions table from a CSV located in the current working directory.
df = pd.read_csv('./dataset_predictions_complete.csv')

In [None]:
# (number of rows, number of columns) of the loaded table
df.shape

In [None]:
# Per-column summary statistics produced by pandas' `describe`
df.describe()

In [None]:
# Peek at the first rows to check the column layout
df.head()

In [None]:
# Visualise where values are missing using missingno's nullity matrix
msno.matrix(df)

# Predictors distribution

In [None]:
from pycocotools.coco import COCO
import skimage.io as io
from utils import get_local_dic, get_pairs
import matplotlib.patches as patches
import json

In [None]:
# Load the annotation file through the pycocotools COCO API; the resulting
# `example_coco` object is what later cells use to look up images, categories
# and annotations when plotting the outliers.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
json_path = '/root/data/small_pen_data_collection/body_parts_detection_merged.json'
example_coco = COCO(json_path)

In [None]:
# Show the object's repr to confirm the annotation file was loaded
example_coco

In [None]:
# List the top-level keys of the raw annotation JSON.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open(json_path) as annotation_file:
    annotation_keys = json.load(annotation_file).keys()
annotation_keys

In [None]:
# First annotation id in the dataset.
# `getAnnIds` is a method: it has to be called (no filters -> all annotation ids)
# before indexing; subscripting the bound method itself raises TypeError.
example_coco.getAnnIds()[0]

In [None]:
# Print the annotation file's "info" section (pycocotools `COCO.info`)
example_coco.info()

In [None]:
# local_dic = get_local_dic()

In [None]:
# Number of annotation categories known to the COCO object.
ncats = len(example_coco.cats)
# Count of unordered category pairs, n * (n - 1) / 2, plus one. The result is
# used as the exclusive upper bound when iterating over columns 1 .. distpairs - 1.
distpairs = ncats * (ncats - 1) // 2 + 1

In [None]:
# For every column 1 .. distpairs - 1: draw its histogram, print basic
# statistics, and record which images hold a value more than three standard
# deviations above the column mean.
outliers_dic = {}  # image id -> names of the columns in which that image is an outlier
column_names = list(df.columns)
image_ids = np.array(df.iloc[:, -1])  # the last column is used as the image id
hist_bins = np.arange(0, 1.1, 0.05)  # fixed bins; values past the last edge are not drawn

for col_ind in range(1, distpairs):
    print(col_ind)
    col_name = column_names[col_ind]
    col_values = np.array(df.iloc[:, col_ind])

    # NaNs are dropped before plotting
    plt.hist(col_values[~np.isnan(col_values)], bins=hist_bins)
    # NOTE(review): 37 is hard-coded; it presumably matches `distpairs` for nine
    # categories, in which case the else branch never runs — confirm.
    if col_ind < 37:
        # the first two characters of the column name are read as category ids
        part0 = example_coco.cats[int(col_name[0])]['name']
        part1 = example_coco.cats[int(col_name[1])]['name']
        plt.title('Distance from {} to {}'.format(part0, part1))
    else:
        plt.title('Ground truth')
    plt.show()

    # get some stats (NaN-aware)
    mean = np.nanmean(col_values)
    std = np.nanstd(col_values)
    thresh = mean + 3*std

    # Boolean mask of the outliers; a comparison with NaN is False, so missing
    # values are never flagged.
    is_outlier = col_values > thresh
    # Count the mask itself: counting the non-zero *values* above the threshold
    # would miss an outlier whose value is exactly 0.
    outliers = np.count_nonzero(is_outlier)
    for oi in image_ids[is_outlier]:
        outliers_dic.setdefault(oi, []).append(col_name)

    print('Mean: {}'.format(mean))
    print('Std: {}'.format(std))
    print('Outliers number: {}'.format(outliers))
    print('#'*100)


In [None]:
# Display the mapping: image id -> list of column names flagged as outliers
outliers_dic

In [None]:
# print(len(list(outliers_dic.keys())))
# ct = 0
# for (k, v) in outliers_dic.items():
#     if len(v) > 3:
#         ct+=1
# print(ct)

In [None]:
# experience_181001010005 = df[df.iloc[:, -2] == 4480].iloc[:, -1]
# experience_181001010005_distance = list(df[df.iloc[:, -2] == 4480].iloc[:, 2])

In [None]:
# Build the image pairing used further down to look up an image's companion.
# NOTE(review): `get_pairs` comes from the local `utils` module; the loop that
# consumes it treats the result as a dict whose values may hold 'left' and
# 'right' image ids — confirm against the helper.
pairs = get_pairs(example_coco)

In [None]:
def _show_annotated_image(image_id, category_ids):
    """Display one image with its annotations overlaid.

    Looks the image up in `example_coco`, reads it from the record's
    'local_path', draws the annotations restricted to `category_ids` and
    writes each annotation's category name next to it.

    Parameters
    ----------
    image_id : id understood by `example_coco.loadImgs`
    category_ids : list of category ids used to filter the annotations
    """
    image_data = example_coco.loadImgs([image_id])[0]
    image = io.imread(image_data['local_path'])

    f, ax = plt.subplots(1, figsize=(20, 20))
    ax.imshow(image)

    annotation_ids = example_coco.getAnnIds(imgIds=[image_data['id']], catIds=category_ids, iscrowd=None)
    annotations = example_coco.loadAnns(annotation_ids)
    example_coco.showAnns(annotations)
    for ann in annotations:
        bbox = ann['bbox']
        c = ann['category_id']
        # NOTE(review): standard COCO boxes are [x, y, width, height] while
        # `ax.text` takes (x, y); the index order below is kept as it was —
        # confirm it matches this dataset's bbox convention.
        ax.text(bbox[1], bbox[0]-10, example_coco.cats[c]['name'], fontsize=16, color='r')

    plt.axis('off')
    plt.show()


# NOTE(review): hard-coded category ids; `example_coco.getCatIds()` would return
# every category of the dataset — confirm which is intended.
category_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for image_id in outliers_dic:
    print('Image id: {}'.format(image_id))
    _show_annotated_image(image_id, category_ids)

    # Report each outlier column once, even if it was recorded several times.
    for od in set(outliers_dic[image_id]):
        # the first two characters of the column name are read as category ids
        part0 = example_coco.cats[int(od[0])]['name']
        part1 = example_coco.cats[int(od[1])]['name']
        print('The distance between the {} and the {} is an outlier'.format(part0, part1))

    # Look up the companion image of the pair. The id is reset on every
    # iteration so that a missing pair can neither raise NameError nor silently
    # reuse the companion found for a previous image.
    # NOTE(review): the match is on the pair's 'right' entry and the value taken
    # is its 'left' entry, although the variable is called `right_id` — confirm
    # the naming.
    right_id = None
    for pair in pairs.values():
        if 'right' in pair and pair['right'] == image_id:
            right_id = pair['left']
            break

    if right_id is None:
        print('No paired image found for {}'.format(image_id))
    else:
        print(right_id)
        _show_annotated_image(right_id, category_ids)
    print('#'*100)

# Let's try some models

In [None]:
# we can remove any values above 1 and turn it into a nan. 
# only valid because we have some prior on the fish length
# we can remove any values above 1 and turn it into a nan.
# only valid because we have some prior on the fish length
# The mask is built from columns 1 .. -3 only, i.e. the first column and the
# last two columns are excluded from the comparison; columns absent from the
# mask are expected to stay untouched (pandas aligns the mask to df) — TODO confirm.
# NOTE: this mutates `df` in place, so cells run afterwards (or re-run) see the
# cleaned values rather than the raw ones.
df[df.iloc[:, 1:-2] > 1] = np.nan

In [None]:
# number of rows with at least one missing value...
# Vectorised: flag every row that contains any null, then count the flags.
# This avoids a Python-level lambda per row and also yields 0 on an empty table.
df.isnull().any(axis=1).sum()

In [None]:
# Scatter every column 1 .. distpairs - 1 against the second-to-last column.
# This cell deliberately does not reset `outliers_dic`: nothing here uses it,
# and clearing it would throw away the outlier mapping that the image-display
# cell depends on.
gt_values = np.array(df.iloc[:, -2])  # second-to-last column, plotted on the y axis
column_names = list(df.columns)

for col_ind in range(1, distpairs):
    col_name = column_names[col_ind]
    col_values = np.array(df.iloc[:, col_ind])

    plt.scatter(col_values, gt_values)
    plt.xlabel('Length (m)')
    plt.ylabel('Weight (g)')
    print(col_ind)
    # NOTE(review): 37 is hard-coded; it presumably matches `distpairs` for nine
    # categories, in which case the else branch never runs — confirm.
    if col_ind < 37:
        # the first two characters of the column name are read as category ids
        part0 = example_coco.cats[int(col_name[0])]['name']
        part1 = example_coco.cats[int(col_name[1])]['name']
        plt.title('Distance from {} to {}'.format(part0, part1))
    else:
        plt.title(col_name)
    plt.show()