In [42]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [43]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Импорт данных

In [44]:
votes = pd.read_csv("train_data.csv")

In [45]:
answers = pd.read_csv("train_answers.csv")

## Фильтрация выбросов по интерквартильному размаху

In [46]:
def iqr(qr1, qr3):
    return (qr3 - qr1) * 1.5

In [48]:
q1_quantiles = votes.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.25], interpolation='nearest').reset_index().drop(columns=['level_1'])
q3_quantiles = votes.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.75], interpolation='nearest').reset_index().drop(columns=['level_1'])
q_merged = q1_quantiles.merge(q3_quantiles, on='itemId', suffixes=('_q1', '_q3'))
q_merged['Xmin_iqr'] = iqr(q_merged['Xmin_q1'], q_merged['Xmin_q3'])
q_merged['Ymin_iqr'] = iqr(q_merged['Ymin_q1'], q_merged['Ymin_q3'])
q_merged['Xmax_iqr'] = iqr(q_merged['Xmax_q1'], q_merged['Xmax_q3'])
q_merged['Ymax_iqr'] = iqr(q_merged['Ymax_q1'], q_merged['Ymax_q3'])
q_merged = q_merged.drop(columns=['Xmin_q1', 'Ymin_q1', 'Xmax_q1', 'Ymax_q1', 'Xmin_q3', 'Ymin_q3', 'Xmax_q3', 'Ymax_q3'])

In [50]:
q2_quantiles = votes.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].median().reset_index()
q_filter = q_merged.merge(q2_quantiles, on='itemId')
q_filter['Xmin_low'] = q_filter['Xmin'] - q_filter['Xmin_iqr']
q_filter['Ymin_low'] = q_filter['Ymin'] - q_filter['Ymin_iqr']
q_filter['Xmax_low'] = q_filter['Xmax'] - q_filter['Xmax_iqr']
q_filter['Ymax_low'] = q_filter['Ymax'] - q_filter['Ymax_iqr']
q_filter['Xmin_high'] = q_filter['Xmin'] + q_filter['Xmin_iqr']
q_filter['Ymin_high'] = q_filter['Ymin'] + q_filter['Ymin_iqr']
q_filter['Xmax_high'] = q_filter['Xmax'] + q_filter['Xmax_iqr']
q_filter['Ymax_high'] = q_filter['Ymax'] + q_filter['Ymax_iqr']
q_filter = q_filter.drop(columns=['Xmin', 'Ymin', 'Xmax', 'Ymax', 'Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'])

In [51]:
filter_func = lambda df: (
    (df['Xmin'] >= df['Xmin_low']) & (df['Xmin'] <= df['Xmin_high']) &
    (df['Ymin'] >= df['Ymin_low']) & (df['Ymin'] <= df['Ymin_high']) &
    (df['Xmax'] >= df['Xmax_low']) & (df['Xmax'] <= df['Xmax_high']) &
    (df['Ymax'] >= df['Ymax_low']) & (df['Ymax'] <= df['Ymax_high'])
)
votes_filtered = votes.merge(q_filter, on='itemId').loc[lambda df: filter_func(df), :].drop(columns=['Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 'Xmin_high', 'Ymin_high', 'Xmax_high', 'Ymax_high'])

## Расчёт метрики Intersection over Union (IoU)

In [52]:
quorum = votes.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [53]:
data = quorum.merge(answers, on=["itemId"])

In [54]:
data["iou"] = data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
      'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [55]:
data["iou"].mean()

0.5058725791429202

### История изменений
* 0.5058725791429202 – начальное значение
* 0.5129929061634199 – фильтрация по интерквартильному размаху