In [56]:
import numpy as np
import pandas as pdinline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%config IPCompleter.greedy=True
%config InlineBackend.figure_format = 'jpeg'
%matplotlib inline
pd.options.display.max_columns = 100

In [2]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Импорт данных

In [88]:
votes = pd.read_csv("train_data.csv")

In [89]:
answers = pd.read_csv("train_answers.csv")

## IoU пользователя

In [90]:
def calc_user_iou(votez, answerz):
    df_full = votes.merge(answers, on=["itemId"])
    df_full["iou"] = df_full[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
          'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)
    medians = df_full[['userId', 'itemId', 'iou']].groupby('userId')[['iou']].median().reset_index()
    medians.columns = [['userId', 'user_iou']]
    return medians


df_user_iou = calc_user_iou(votes, answers)

## Фильтрация выбросов по интерквартильному размаху

In [43]:
def iqr(qr1, qr3):
    return (qr3 - qr1) * 1.5

In [93]:
def calc_iqrs(votez):
    q1_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.25]).reset_index().drop(columns=['level_1'])
    q3_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.75]).reset_index().drop(columns=['level_1'])
    q_merged = q1_quantiles.merge(q3_quantiles, on='itemId', suffixes=('_q1', '_q3'))
    q_merged['Xmin_iqr'] = iqr(q_merged['Xmin_q1'], q_merged['Xmin_q3'])
    q_merged['Ymin_iqr'] = iqr(q_merged['Ymin_q1'], q_merged['Ymin_q3'])
    q_merged['Xmax_iqr'] = iqr(q_merged['Xmax_q1'], q_merged['Xmax_q3'])
    q_merged['Ymax_iqr'] = iqr(q_merged['Ymax_q1'], q_merged['Ymax_q3'])
    q_merged = q_merged.drop(columns=['Xmin_q1', 'Ymin_q1', 'Xmax_q1', 'Ymax_q1', 'Xmin_q3', 'Ymin_q3', 'Xmax_q3', 'Ymax_q3'])
    return q_merged


calc_iqrs(votes).columns

Index(['itemId', 'Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'], dtype='object')

In [94]:
def calc_low_high_iqrs(votez, iqrs):
    q2_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].median().reset_index()
    q2_quantiles.columns = q2_quantiles.columns.map(lambda x: str(x) + '_median' if x != 'itemId' else x)
    q_filter = iqrs.merge(q2_quantiles, on='itemId')
    q_filter['Xmin_low'] = q_filter['Xmin_median'] - q_filter['Xmin_iqr']
    q_filter['Ymin_low'] = q_filter['Ymin_median'] - q_filter['Ymin_iqr']
    q_filter['Xmax_low'] = q_filter['Xmax_median'] - q_filter['Xmax_iqr']
    q_filter['Ymax_low'] = q_filter['Ymax_median'] - q_filter['Ymax_iqr']
    q_filter['Xmin_high'] = q_filter['Xmin_median'] + q_filter['Xmin_iqr']
    q_filter['Ymin_high'] = q_filter['Ymin_median'] + q_filter['Ymin_iqr']
    q_filter['Xmax_high'] = q_filter['Xmax_median'] + q_filter['Xmax_iqr']
    q_filter['Ymax_high'] = q_filter['Ymax_median'] + q_filter['Ymax_iqr']
    q_filter = q_filter.drop(columns=['Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'])
    return q_filter


calc_low_high_iqrs(votes, calc_iqrs(votes)).columns

Index(['itemId', 'Xmin_median', 'Ymin_median', 'Xmax_median', 'Ymax_median',
       'Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 'Xmin_high',
       'Ymin_high', 'Xmax_high', 'Ymax_high'],
      dtype='object')

In [8]:
def filter_by_iqr(votez):
    # verify
    imgs_before = len(votez['itemId'].unique())
    # prepare
    q_merged = calc_iqrs(votez)
    q_filter = calc_low_high_iqrs(votez, q_merged)
    votez_filtered = votez.merge(q_filter, on='itemId')
    # fix Xmin
    filter_Xmin = lambda df: (df['Xmin'] <= df['Xmin_low']) | (df['Xmin'] >= df['Xmin_high'])
    votez_filtered.loc[filter_Xmin, 'Xmin'] = votez_filtered.loc[filter_Xmin, 'Xmin_median']
    # fix Ymin
    filter_Ymin = lambda df: (df['Ymin'] <= df['Ymin_low']) | (df['Ymin'] >= df['Ymin_high'])
    votez_filtered.loc[filter_Ymin, 'Ymin'] = votez_filtered.loc[filter_Ymin, 'Ymin_median']
    # fix Xmax
    filter_Xmax = lambda df: (df['Xmax'] <= df['Xmax_low']) | (df['Xmax'] >= df['Xmax_high'])
    votez_filtered.loc[filter_Xmax, 'Xmax'] = votez_filtered.loc[filter_Xmax, 'Xmax_median']
    # fix Ymax
    filter_Ymax = lambda df: (df['Ymax'] <= df['Ymax_low']) | (df['Ymax'] >= df['Ymax_high'])
    votez_filtered.loc[filter_Xmin, 'Ymax'] = votez_filtered.loc[filter_Xmin, 'Ymax_median']
    # drop unecessary columns
    votez_filtered = votez_filtered.drop(columns=[
        'Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 
        'Xmin_high', 'Ymin_high', 'Xmax_high', 'Ymax_high',
        'Xmin_median', 'Ymin_median', 'Xmax_median', 'Ymax_median'
    ])
    # verify
    imgs_after = len(votez_filtered['itemId'].unique())
    if (imgs_before != imgs_after):
        raise Exception(f'imgs_before: {imgs_before}, imgs_after: {imgs_after}')
    # return result
    return votez_filtered

## Расчёт метрики Intersection over Union (IoU)

In [44]:
quorum = filter_by_iqr(votes).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [45]:
data = quorum.merge(answers, on=["itemId"])

In [46]:
data["iou"] = data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
      'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [47]:
data["iou"].mean()

0.5120520537832914

### История изменений
* 0.5058725791429202 – начальное значение
* 0.5120520537832914 – фильтрация по интерквартильному размаху

# Экспорт тестовых данных

In [13]:
votes_test = pd.read_csv("test_data.csv")
votes_quorum = filter_by_iqr(votes_test).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index().round().astype(int)
votes_quorum.columns = ['itemId', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']
votes_quorum.to_csv('AIatsuk_predictions.csv', index=False, header=None)