In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Импорт данных

In [3]:
votes = pd.read_csv("train_data.csv")

In [4]:
answers = pd.read_csv("train_answers.csv")

## Фильтрация выбросов по интерквартильному размаху

In [56]:
def iqr(qr1, qr3):
    return (qr3 - qr1) * 1.5

In [64]:
def calc_iqrs(votez):
    q1_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.25]).reset_index().drop(columns=['level_1'])
    q3_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.75]).reset_index().drop(columns=['level_1'])
    q_merged = q1_quantiles.merge(q3_quantiles, on='itemId', suffixes=('_q1', '_q3'))
    q_merged['Xmin_iqr'] = iqr(q_merged['Xmin_q1'], q_merged['Xmin_q3'])
    q_merged['Ymin_iqr'] = iqr(q_merged['Ymin_q1'], q_merged['Ymin_q3'])
    q_merged['Xmax_iqr'] = iqr(q_merged['Xmax_q1'], q_merged['Xmax_q3'])
    q_merged['Ymax_iqr'] = iqr(q_merged['Ymax_q1'], q_merged['Ymax_q3'])
    q_merged = q_merged.drop(columns=['Xmin_q1', 'Ymin_q1', 'Xmax_q1', 'Ymax_q1', 'Xmin_q3', 'Ymin_q3', 'Xmax_q3', 'Ymax_q3'])
    return q_merged

In [69]:
def calc_low_high_iqrs(votez, iqrs):
    q2_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].median().reset_index()
    q_filter = iqrs.merge(q2_quantiles, on='itemId')
    q_filter['Xmin_low'] = q_filter['Xmin'] - q_filter['Xmin_iqr']
    q_filter['Ymin_low'] = q_filter['Ymin'] - q_filter['Ymin_iqr']
    q_filter['Xmax_low'] = q_filter['Xmax'] - q_filter['Xmax_iqr']
    q_filter['Ymax_low'] = q_filter['Ymax'] - q_filter['Ymax_iqr']
    q_filter['Xmin_high'] = q_filter['Xmin'] + q_filter['Xmin_iqr']
    q_filter['Ymin_high'] = q_filter['Ymin'] + q_filter['Ymin_iqr']
    q_filter['Xmax_high'] = q_filter['Xmax'] + q_filter['Xmax_iqr']
    q_filter['Ymax_high'] = q_filter['Ymax'] + q_filter['Ymax_iqr']
    q_filter = q_filter.drop(columns=['Xmin', 'Ymin', 'Xmax', 'Ymax', 'Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'])
    return q_filter

In [91]:
def filter_by_iqr(votez):
    print('изображений до фильтрации:', len(votez['itemId'].unique()))
    q_merged = calc_iqrs(votez)
    q_filter = calc_low_high_iqrs(votez, q_merged)
    filter_func = lambda df: (
        (df['Xmin'] >= df['Xmin_low']) & (df['Xmin'] <= df['Xmin_high']) &
        (df['Ymin'] >= df['Ymin_low']) & (df['Ymin'] <= df['Ymin_high']) &
        (df['Xmax'] >= df['Xmax_low']) & (df['Xmax'] <= df['Xmax_high']) &
        (df['Ymax'] >= df['Ymax_low']) & (df['Ymax'] <= df['Ymax_high'])
    )
    # изменить логику. Если значение выходит за рамки интерквартильного интервала, то присваивать ему среднее значение
    # сейчас часть данных отсеилось за счёт выбросов хотя бы в одном из 4 входных свойств
    votes_filtered = votez.merge(q_filter, on='itemId')#.loc[lambda df: filter_func(df), :].drop(columns=['Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 'Xmin_high', 'Ymin_high', 'Xmax_high', 'Ymax_high'])
    print('изображений после фильтрации:', len(votes_filtered['itemId'].unique()))
    return votes_filtered

In [95]:
# set(votes['itemId'].unique()) - set(filter_by_iqr(votes)['itemId'].unique())
filter_by_iqr(votes) \
.loc[lambda df: df['itemId'].isin({678, 7761, 8398, 8774, 10521, 15247, 16237, 21836, 22723, 30034, 34214, 34339}), :] \
.sort_values('itemId')

изображений до фильтрации: 943
изображений после фильтрации: 943


Unnamed: 0,userId,itemId,Xmin,Ymin,Xmax,Ymax,Xmin_low,Ymin_low,Xmax_low,Ymax_low,Xmin_high,Ymin_high,Xmax_high,Ymax_high
1388,366,678,63,593,213,738,39.0,580.75,195.0,717.75,87.0,591.25,225.0,758.25
1387,141,678,32,586,210,739,39.0,580.75,195.0,717.75,87.0,591.25,225.0,758.25
1386,903,678,64,586,193,712,39.0,580.75,195.0,717.75,87.0,591.25,225.0,758.25
2587,1508,7761,22,452,31,475,21.25,451.25,30.75,475.25,22.75,452.75,53.25,476.75
2588,1389,7761,23,452,42,476,21.25,451.25,30.75,475.25,22.75,452.75,53.25,476.75
2586,1442,7761,22,453,46,476,21.25,451.25,30.75,475.25,22.75,452.75,53.25,476.75
12,29,8398,53,1020,61,1027,52.25,1019.25,61.0,1027.25,53.75,1020.75,61.0,1028.75
14,1050,8398,53,1019,61,1028,52.25,1019.25,61.0,1027.25,53.75,1020.75,61.0,1028.75
13,421,8398,54,1020,61,1028,52.25,1019.25,61.0,1027.25,53.75,1020.75,61.0,1028.75
17,311,8774,288,967,296,975,144.0,903.25,380.75,1043.5,432.0,1030.75,1129.25,1556.5


## Расчёт метрики Intersection over Union (IoU)

In [23]:
quorum = votes.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [24]:
data = quorum.merge(answers, on=["itemId"])

In [25]:
data["iou"] = data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
      'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [26]:
data["iou"].mean()

0.5058725791429202

### История изменений
* 0.5058725791429202 – начальное значение
* 0.5129929061634199 – фильтрация по интерквартильному размаху

# Экспорт тестовых данных

In [82]:
votes_test = pd.read_csv("test_data.csv")
votes_quorum = filter_by_iqr(votes_test).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
# votes_quorum.columns = ['itemId', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']
votes_quorum.round().astype(int).to_csv('AIatsuk_predictions.csv', index=False)

изображений до фильтрации: 630
изображений после фильтрации: 622
