In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%config IPCompleter.greedy=True
%config InlineBackend.figure_format = 'jpeg'
%matplotlib inline
pd.options.display.max_columns = 100

In [2]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Импорт данных

In [3]:
votes = pd.read_csv("train_data.csv")

In [4]:
answers = pd.read_csv("train_answers.csv")

# Коррекция данных

## Коррекция выбросов по интерквартильному размаху

In [5]:
def iqr(qr1, qr3):
    return (qr3 - qr1) * 1.5

In [6]:
def calc_iqrs(votez):
    q1_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.25]).reset_index().drop(columns=['level_1'])
    q3_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.75]).reset_index().drop(columns=['level_1'])
    q_merged = q1_quantiles.merge(q3_quantiles, on='itemId', suffixes=('_q1', '_q3'))
    q_merged['Xmin_iqr'] = iqr(q_merged['Xmin_q1'], q_merged['Xmin_q3'])
    q_merged['Ymin_iqr'] = iqr(q_merged['Ymin_q1'], q_merged['Ymin_q3'])
    q_merged['Xmax_iqr'] = iqr(q_merged['Xmax_q1'], q_merged['Xmax_q3'])
    q_merged['Ymax_iqr'] = iqr(q_merged['Ymax_q1'], q_merged['Ymax_q3'])
    q_merged = q_merged.drop(columns=['Xmin_q1', 'Ymin_q1', 'Xmax_q1', 'Ymax_q1', 'Xmin_q3', 'Ymin_q3', 'Xmax_q3', 'Ymax_q3'])
    return q_merged

In [7]:
def calc_low_high_iqrs(votez, iqrs):
    q2_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].median().reset_index()
    q2_quantiles.columns = q2_quantiles.columns.map(lambda x: str(x) + '_median' if x != 'itemId' else x)
    q_filter = iqrs.merge(q2_quantiles, on='itemId')
    q_filter['Xmin_low'] = q_filter['Xmin_median'] - q_filter['Xmin_iqr']
    q_filter['Ymin_low'] = q_filter['Ymin_median'] - q_filter['Ymin_iqr']
    q_filter['Xmax_low'] = q_filter['Xmax_median'] - q_filter['Xmax_iqr']
    q_filter['Ymax_low'] = q_filter['Ymax_median'] - q_filter['Ymax_iqr']
    q_filter['Xmin_high'] = q_filter['Xmin_median'] + q_filter['Xmin_iqr']
    q_filter['Ymin_high'] = q_filter['Ymin_median'] + q_filter['Ymin_iqr']
    q_filter['Xmax_high'] = q_filter['Xmax_median'] + q_filter['Xmax_iqr']
    q_filter['Ymax_high'] = q_filter['Ymax_median'] + q_filter['Ymax_iqr']
    q_filter = q_filter.drop(columns=['Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'])
    return q_filter

In [8]:
def correct_by_iqr(votez):
    # verify
    imgs_before = len(votez['itemId'].unique())
    # prepare
    q_merged = calc_iqrs(votez)
    q_filter = calc_low_high_iqrs(votez, q_merged)
    votez_filtered = votez.merge(q_filter, on='itemId')
    # fix Xmin
    filter_Xmin = lambda df: (df['Xmin'] <= df['Xmin_low']) | (df['Xmin'] >= df['Xmin_high'])
    votez_filtered.loc[filter_Xmin, 'Xmin'] = votez_filtered.loc[filter_Xmin, 'Xmin_median']
    # fix Ymin
    filter_Ymin = lambda df: (df['Ymin'] <= df['Ymin_low']) | (df['Ymin'] >= df['Ymin_high'])
    votez_filtered.loc[filter_Ymin, 'Ymin'] = votez_filtered.loc[filter_Ymin, 'Ymin_median']
    # fix Xmax
    filter_Xmax = lambda df: (df['Xmax'] <= df['Xmax_low']) | (df['Xmax'] >= df['Xmax_high'])
    votez_filtered.loc[filter_Xmax, 'Xmax'] = votez_filtered.loc[filter_Xmax, 'Xmax_median']
    # fix Ymax
    filter_Ymax = lambda df: (df['Ymax'] <= df['Ymax_low']) | (df['Ymax'] >= df['Ymax_high'])
    votez_filtered.loc[filter_Xmin, 'Ymax'] = votez_filtered.loc[filter_Xmin, 'Ymax_median']
    # drop unecessary columns
    votez_filtered = votez_filtered.drop(columns=[
        'Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 
        'Xmin_high', 'Ymin_high', 'Xmax_high', 'Ymax_high',
        'Xmin_median', 'Ymin_median', 'Xmax_median', 'Ymax_median'
    ])
    # verify
    imgs_after = len(votez_filtered['itemId'].unique())
    if (imgs_before != imgs_after):
        raise Exception(f'imgs_before: {imgs_before}, imgs_after: {imgs_after}')
    # return result
    return votez_filtered

## Коррекция выбросов по среднему IoU пользователя

In [9]:
def calc_iou(votez, answerz):
    df_full = votez.merge(answerz, on=["itemId"])
    df_full["iou"] = df_full[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
          'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)
    medians = df_full.groupby('userId')[['iou']].mean().reset_index()
    return medians


users_iou = calc_iou(votes, answers)

In [10]:
def correct_by_iou(votez, userz_iou):
    means = votez.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
    merged_iou = votez.merge(userz_iou, on='userId', how='left')
    merged_iou[['iou']] = merged_iou[['iou']].fillna(0)
    merged_iou_means = merged_iou.merge(means, on='itemId', suffixes=['', '_mean'])
    fixed = merged_iou_means.copy()
    # fix Xmin
    fixed['Xmin'] = merged_iou_means['Xmin']*(merged_iou_means['iou']) + merged_iou_means['Xmin_mean']*(1 - merged_iou_means['iou'])
    # fix Ymin
    fixed['Ymin'] = merged_iou_means['Ymin']*(merged_iou_means['iou']) + merged_iou_means['Ymin_mean']*(1 - merged_iou_means['iou'])
    # fix Xmax
    fixed['Xmax'] = merged_iou_means['Xmax']*(merged_iou_means['iou']) + merged_iou_means['Xmax_mean']*(1 - merged_iou_means['iou'])
    # fix Ymax
    fixed['Ymax'] = merged_iou_means['Ymax']*(merged_iou_means['iou']) + merged_iou_means['Ymax_mean']*(1 - merged_iou_means['iou'])
    # drop temp data
    fixed = fixed.drop(columns=['iou','Xmin_mean', 'Ymin_mean', 'Xmax_mean', 'Ymax_mean'])
    return fixed

## Коррекция corner diff per user (CDU)

In [11]:
def calc_cdu(votez, answerz):
    df_full = votez.merge(answerz, on=["itemId"])
    for col in ['Xmin','Ymin', 'Xmax', 'Ymax']:
        df_full[col + '_diff'] = df_full[col + '_true'] / df_full[col]
    diffs = df_full.groupby('userId')[['Xmin_diff','Ymin_diff', 'Xmax_diff', 'Ymax_diff']].mean().reset_index()
    return diffs


users_cdu = calc_cdu(votes, answers)

In [12]:
def correct_by_cdu(votez, userz_cdu):
    means = votez.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
    merged_diff = votez.merge(userz_cdu, on='userId', how='left')
    merged_diff_means = merged_diff.merge(means, on='itemId', suffixes=['', '_mean'])
    fixed = merged_diff_means.copy()
    for col in ['Xmin','Ymin', 'Xmax', 'Ymax']:
        merged_diff_means[[col + '_diff']] = merged_diff_means[[col + '_diff']].fillna(0)
        fixed[col] = merged_diff_means[col]*merged_diff_means[col + '_diff']
        fixed.drop(columns=[col + '_mean', col + '_diff'], inplace=True)
    return fixed

# Расчёт метрики Intersection over Union (IoU)

In [13]:
def prepare_data(votez):
    prepared = correct_by_cdu(votez.copy(), users_cdu)
    prepared = correct_by_iqr(prepared.copy())
    prepared = correct_by_iou(prepared.copy(), users_iou)
    return prepared

In [14]:
quorum = prepare_data(votes).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [15]:
data = quorum.merge(answers, on=["itemId"])

In [16]:
data["iou"] = data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
      'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [17]:
data["iou"].mean()

0.5694073669046027

In [18]:
# todo: использовать train_test_split, чтобы избежать переобучения

## История изменений
| IoU              | How               |
|------------------|-------------------|
|0.5694073669046027| CDU -> IQR -> IOU |
|0.5504101684974193| CDU -> IOU        |
|0.5474648382831486| IOU -> CDU        |
|0.5473417381685897| CDU               |
|0.5169390514779687| IQR -> IOU        |
|0.5138072133073681| IOU               |
|0.5120520537832914| IQR               |
|0.5058725791429202|                   |

# Экспорт тестовых данных

In [19]:
votes_test = pd.read_csv("test_data.csv")
votes_quorum = prepare_data(votes_test).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index().round().astype(int)
votes_quorum.columns = ['itemId', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']
votes_quorum.to_csv('AIatsuk_predictions.csv', index=False, header=None)