In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%config IPCompleter.greedy=True
%config InlineBackend.figure_format = 'jpeg'
%matplotlib inline
pd.options.display.max_columns = 100

In [2]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Импорт данных

In [3]:
votes = pd.read_csv("train_data.csv")

In [4]:
answers = pd.read_csv("train_answers.csv")

# Коррекция данных

## Коррекция выбросов по интерквартильному размаху

In [5]:
def iqr(qr1, qr3):
    return (qr3 - qr1) * 1.5

In [6]:
def calc_iqrs(votez):
    q1_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.25]).reset_index().drop(columns=['level_1'])
    q3_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].quantile(q=[0.75]).reset_index().drop(columns=['level_1'])
    q_merged = q1_quantiles.merge(q3_quantiles, on='itemId', suffixes=('_q1', '_q3'))
    q_merged['Xmin_iqr'] = iqr(q_merged['Xmin_q1'], q_merged['Xmin_q3'])
    q_merged['Ymin_iqr'] = iqr(q_merged['Ymin_q1'], q_merged['Ymin_q3'])
    q_merged['Xmax_iqr'] = iqr(q_merged['Xmax_q1'], q_merged['Xmax_q3'])
    q_merged['Ymax_iqr'] = iqr(q_merged['Ymax_q1'], q_merged['Ymax_q3'])
    q_merged = q_merged.drop(columns=['Xmin_q1', 'Ymin_q1', 'Xmax_q1', 'Ymax_q1', 'Xmin_q3', 'Ymin_q3', 'Xmax_q3', 'Ymax_q3'])
    return q_merged

In [7]:
def calc_low_high_iqrs(votez, iqrs):
    q2_quantiles = votez.groupby('itemId')[['Xmin', 'Ymin', 'Xmax', 'Ymax']].median().reset_index()
    q2_quantiles.columns = q2_quantiles.columns.map(lambda x: str(x) + '_median' if x != 'itemId' else x)
    q_filter = iqrs.merge(q2_quantiles, on='itemId')
    q_filter['Xmin_low'] = q_filter['Xmin_median'] - q_filter['Xmin_iqr']
    q_filter['Ymin_low'] = q_filter['Ymin_median'] - q_filter['Ymin_iqr']
    q_filter['Xmax_low'] = q_filter['Xmax_median'] - q_filter['Xmax_iqr']
    q_filter['Ymax_low'] = q_filter['Ymax_median'] - q_filter['Ymax_iqr']
    q_filter['Xmin_high'] = q_filter['Xmin_median'] + q_filter['Xmin_iqr']
    q_filter['Ymin_high'] = q_filter['Ymin_median'] + q_filter['Ymin_iqr']
    q_filter['Xmax_high'] = q_filter['Xmax_median'] + q_filter['Xmax_iqr']
    q_filter['Ymax_high'] = q_filter['Ymax_median'] + q_filter['Ymax_iqr']
    q_filter = q_filter.drop(columns=['Xmin_iqr', 'Ymin_iqr', 'Xmax_iqr', 'Ymax_iqr'])
    return q_filter

In [8]:
def correct_by_iqr(votez):
    # verify
    imgs_before = len(votez['itemId'].unique())
    # prepare
    q_merged = calc_iqrs(votez)
    q_filter = calc_low_high_iqrs(votez, q_merged)
    votez_filtered = votez.merge(q_filter, on='itemId')
    # fix Xmin
    filter_Xmin = lambda df: (df['Xmin'] <= df['Xmin_low']) | (df['Xmin'] >= df['Xmin_high'])
    votez_filtered.loc[filter_Xmin, 'Xmin'] = votez_filtered.loc[filter_Xmin, 'Xmin_median']
    # fix Ymin
    filter_Ymin = lambda df: (df['Ymin'] <= df['Ymin_low']) | (df['Ymin'] >= df['Ymin_high'])
    votez_filtered.loc[filter_Ymin, 'Ymin'] = votez_filtered.loc[filter_Ymin, 'Ymin_median']
    # fix Xmax
    filter_Xmax = lambda df: (df['Xmax'] <= df['Xmax_low']) | (df['Xmax'] >= df['Xmax_high'])
    votez_filtered.loc[filter_Xmax, 'Xmax'] = votez_filtered.loc[filter_Xmax, 'Xmax_median']
    # fix Ymax
    filter_Ymax = lambda df: (df['Ymax'] <= df['Ymax_low']) | (df['Ymax'] >= df['Ymax_high'])
    votez_filtered.loc[filter_Xmin, 'Ymax'] = votez_filtered.loc[filter_Xmin, 'Ymax_median']
    # drop unecessary columns
    votez_filtered = votez_filtered.drop(columns=[
        'Xmin_low', 'Ymin_low', 'Xmax_low', 'Ymax_low', 
        'Xmin_high', 'Ymin_high', 'Xmax_high', 'Ymax_high',
        'Xmin_median', 'Ymin_median', 'Xmax_median', 'Ymax_median'
    ])
    # verify
    imgs_after = len(votez_filtered['itemId'].unique())
    if (imgs_before != imgs_after):
        raise Exception(f'imgs_before: {imgs_before}, imgs_after: {imgs_after}')
    # return result
    return votez_filtered

## Коррекция выбросов по среднему IoU пользователя

In [9]:
def calc_iou(votez, answerz):
    df_full = votez.merge(answerz, on=["itemId"])
    df_full["iou"] = df_full[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
          'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)
    medians = df_full.groupby('userId')[['iou']].mean().reset_index()
    return medians

In [10]:
def correct_by_iou(votez, userz_iou):
    # оставим только наибольшие iou в каждом itemId
    votes_with_user_iou = votez.merge(userz_iou, on='userId', how='left')
    votes_with_user_iou['iou'] = votes_with_user_iou['iou'].fillna(0)
    max_iou_per_item = votes_with_user_iou.groupby('itemId')[['iou']].max().reset_index()  # находим наибольшие user_iou на каждый item
    votes_with_user_iou_and_max_item_iou =  votes_with_user_iou.merge(max_iou_per_item, on='itemId', suffixes=['', '_max'])
    votes_with_max__item_iou = votes_with_user_iou_and_max_item_iou.loc[lambda df: np.abs(df['iou_max'] - df['iou']) < 0.1, :]  # оставляем только строки с наибольшим iou
    votes_with_max__item_iou.drop(columns=['iou', 'iou_max'], inplace=True)
    # находим средние
    means = votes_with_max__item_iou.groupby('itemId')[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
    merged_iou_means = votes_with_user_iou.merge(means, on='itemId', suffixes=['', '_mean'])
    fixed = merged_iou_means.copy()
    # fix values
    for col in ['Xmin','Ymin', 'Xmax', 'Ymax']:
        fixed[col] = merged_iou_means[col]*merged_iou_means['iou'] + merged_iou_means[col + '_mean']*(1 - merged_iou_means['iou'])
    # drop temp data
    fixed = fixed.drop(columns=['iou', 'Xmin_mean', 'Ymin_mean', 'Xmax_mean', 'Ymax_mean'])
    return fixed

## Коррекция corner diff per user (CDU)

In [11]:
def calc_cdu(votez, answerz):
    df_full = votez.merge(answerz, on=["itemId"])
    for col in ['Xmin','Ymin', 'Xmax', 'Ymax']:
        df_full[col + '_diff'] = df_full[col + '_true'] / df_full[col]
        df_full.loc[lambda df: np.abs(df[col + '_diff'] - 1) > 0.5, col + '_diff'] = 1
    diffs = df_full.groupby('userId')[['Xmin_diff','Ymin_diff', 'Xmax_diff', 'Ymax_diff']].median().reset_index()
    return diffs

In [12]:
def correct_by_cdu(votez, userz_cdu):
    means = votez.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
    merged_diff = votez.merge(userz_cdu, on='userId', how='left')
    merged_diff_means = merged_diff.merge(means, on='itemId', suffixes=['', '_mean'])
    fixed = merged_diff_means.copy()
    for col in ['Xmin','Ymin', 'Xmax', 'Ymax']:
        merged_diff_means[[col + '_diff']] = merged_diff_means[[col + '_diff']].fillna(0)
        fixed[col] = merged_diff_means[col]*merged_diff_means[col + '_diff']
        fixed.drop(columns=[col + '_mean', col + '_diff'], inplace=True)
    return fixed

# Расчёт метрики Intersection over Union (IoU)

In [13]:
def fit_models(votez, answerz):
    userz_iou = calc_iou(votez, answerz)
    userz_cdu = calc_cdu(votez, answerz)
    return userz_iou, userz_cdu


def transform_data(votez, userz_iou_model, userz_cdu_model):
    #prepared = correct_by_cdu(votez.copy(), userz_cdu_model)
    prepared = correct_by_iqr(votez.copy())
    prepared = correct_by_iou(prepared.copy(), userz_iou_model)
    return prepared

In [14]:
# data split on train and test datasets
train_answers, test_answers = train_test_split(answers, test_size=0.25, random_state=42)
train_votes = votes.merge(train_answers[["itemId"]], on='itemId')
test_votes = votes.merge(test_answers[["itemId"]], on='itemId')
users_iou_model, users_cdu_model = fit_models(train_votes, train_answers)

In [15]:
train_transformed = transform_data(train_votes, users_iou_model, users_cdu_model)
test_transformed = transform_data(test_votes, users_iou_model, users_cdu_model)

In [16]:
train_quorum = train_transformed.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
test_quorum = test_transformed.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [17]:
train_data = train_quorum.merge(train_answers, on=["itemId"])
test_data = test_quorum.merge(test_answers, on=["itemId"])

In [18]:
train_data["iou"] = train_data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)
test_data["iou"] = test_data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [19]:
print(str(train_data["iou"].mean()) + '|' + str(test_data["iou"].mean()))

0.5402426668546622|0.5165365841721786


## История изменений
| Train            | Test             | How               |
|------------------|------------------|-------------------|
|0.5644950152740668|0.3339338082780439| CDU               |
|0.5402426668546622|0.5165365841721786| IQR -> IOU        |
|0.5537928983528820|0.5097876622021996| IOU               |
|0.5124386740459799|0.5108938312166782| IQR               |
|0.5077808282151023|0.5001559177275294|                   |

# Экспорт тестовых данных

In [20]:
votes_test = pd.read_csv("test_data.csv")
users_iou_model, users_cdu_model = fit_models(votes, answers)
votes_quorum = transform_data(votes_test, users_iou_model, users_cdu_model).groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()  #.round().astype(int)
votes_quorum.columns = ['itemId', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']
votes_quorum.to_csv('AIatsuk_predictions.csv', index=False, header=None)