In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%config IPCompleter.greedy=True
%config InlineBackend.figure_format = 'jpeg'
%matplotlib inline
pd.options.display.max_columns = 100

In [2]:
def area(box):
    """Return the signed area of a box given as (xmin, ymin, xmax, ymax).

    No validation is performed: a box whose max corner lies before its min
    corner along exactly one axis yields a negative value.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def intersection_over_union(boxes):
    """Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    Parameters
    ----------
    boxes : sequence of 8 numbers (pandas Series, list, tuple or ndarray)
        The first four values are box A as (xmin, ymin, xmax, ymax), the last
        four are box B in the same layout.

    Returns
    -------
    float
        IoU in the range [0, 1]. Returns 0.0 when either box is degenerate or
        inverted (non-positive width or height).

    Raises
    ------
    AssertionError
        If ``boxes`` does not contain exactly 8 values.
    """
    assert len(boxes) == 8, "expected 8 values: box A followed by box B"

    # np.asarray accepts a Series row (as passed by DataFrame.apply) as well as
    # plain Python sequences, so callers are not forced to build a Series.
    ax_min, ay_min, ax_max, ay_max, bx_min, by_min, bx_max, by_max = np.asarray(boxes, dtype=float)

    a_width, a_height = ax_max - ax_min, ay_max - ay_min
    b_width, b_height = bx_max - bx_min, by_max - by_min

    # Guard on each side length rather than on the product: an inverted box has
    # a negative (or misleadingly positive) area, which previously could cancel
    # the other box's area and turn the union into 0 (division by zero -> NaN).
    if a_width <= 0 or a_height <= 0 or b_width <= 0 or b_height <= 0:
        return 0.0

    inter_width = max(0.0, min(ax_max, bx_max) - max(ax_min, bx_min))
    inter_height = max(0.0, min(ay_max, by_max) - max(ay_min, by_min))
    inter_area = inter_width * inter_height

    # Both areas are strictly positive here and the intersection cannot exceed
    # either of them, so the union is strictly positive.
    union_area = a_width * a_height + b_width * b_height - inter_area
    return float(inter_area / union_area)

## Импорт данных

In [10]:
# Names of the four bounding-box corner columns; read as a notebook-level
# global by the outlier-correction helpers defined further down.
cols = ['Xmin', 'Ymin', 'Xmax', 'Ymax']
# Raw votes (one bounding box per row) for the training items.
votes = pd.read_csv("train_data.csv")

In [4]:
# Reference boxes for the training items. Later cells join this frame to the
# votes on `itemId` and read its `Xmin_true`, `Ymin_true`, `Xmax_true` and
# `Ymax_true` columns.
answers = pd.read_csv("train_answers.csv")

# Коррекция данных

## Коррекция выбросов по интерквартильному размаху

In [5]:
def iqr(qr1, qr3):
    """Return the interquartile range ``qr3 - qr1`` scaled by 1.5.

    Works element-wise for anything that supports subtraction and
    multiplication by a float (scalars, pandas Series, numpy arrays).
    """
    spread = qr3 - qr1
    return spread * 1.5

In [17]:
def correct_by_iqr(votez):
    """Replace per-item coordinate outliers with the item's median.

    For every ``itemId`` and every column in the notebook-level ``cols`` list,
    a vote whose value lies at or beyond ``median - 1.5 * IQR`` or
    ``median + 1.5 * IQR`` is overwritten with that item's median for the column.

    Parameters
    ----------
    votez : pandas.DataFrame
        Votes with an ``itemId`` column and the columns listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``votez`` and corrected values.

    Raises
    ------
    Exception
        If the number of distinct items changes during the correction.
    """

    def _quantile_per_item(frame, level):
        # One row per itemId holding the requested quantile of each corner column.
        by_item = frame.groupby('itemId')[cols].quantile(q=[level])
        return by_item.reset_index().drop(columns=['level_1'])

    def _spread_per_item(frame):
        # Scaled IQR (``<col>_iqr``) of every corner column for each item.
        lower = _quantile_per_item(frame, 0.25)
        upper = _quantile_per_item(frame, 0.75)
        spread = lower.merge(upper, on='itemId', suffixes=('_q1', '_q3'))
        for name in cols:
            spread[name + '_iqr'] = iqr(spread[name + '_q1'], spread[name + '_q3'])
            spread = spread.drop(columns=[name + '_q1', name + '_q3'])
        return spread

    def _bounds_per_item(frame, spread):
        # Median plus the low/high acceptance bounds of every corner column.
        centre = frame.groupby('itemId')[cols].median().reset_index()
        centre = centre.rename(columns={name: str(name) + '_median' for name in cols})
        bounds = spread.merge(centre, on='itemId')
        for name in cols:
            bounds[name + '_low'] = bounds[name + '_median'] - bounds[name + '_iqr']
            bounds[name + '_high'] = bounds[name + '_median'] + bounds[name + '_iqr']
            bounds = bounds.drop(columns=[name + '_iqr'])
        return bounds

    # Remember how many items we started with so the merges can be sanity-checked.
    imgs_before = len(votez['itemId'].unique())

    bounds = _bounds_per_item(votez, _spread_per_item(votez))
    corrected = votez.merge(bounds, on='itemId')

    for name in cols:
        low, high, median = name + '_low', name + '_high', name + '_median'
        # Bounds are inclusive: a value sitting exactly on a bound is replaced too.
        is_outlier = (corrected[name] <= corrected[low]) | (corrected[name] >= corrected[high])
        corrected.loc[is_outlier, name] = corrected.loc[is_outlier, median]
        corrected = corrected.drop(columns=[low, high, median])

    imgs_after = len(corrected['itemId'].unique())
    if imgs_before != imgs_after:
        raise Exception(f'imgs_before: {imgs_before}, imgs_after: {imgs_after}')
    return corrected

## Коррекция выбросов по медианному IoU пользователя

In [49]:
def calc_iou(votez, answerz):
    """Compute each user's median IoU against the reference boxes.

    Parameters
    ----------
    votez : pandas.DataFrame
        Votes with ``userId``, ``itemId`` and the four corner columns.
    answerz : pandas.DataFrame
        Reference boxes with ``itemId`` and the four ``*_true`` corner columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``userId`` with the median of that user's per-vote IoU in
        the ``iou`` column. Only votes whose item appears in ``answerz`` count.
    """
    box_pair_cols = ['Xmin', 'Ymin', 'Xmax', 'Ymax', 'Xmin_true', 'Ymin_true', 'Xmax_true', 'Ymax_true']
    scored = votez.merge(answerz, on='itemId')
    # Row-wise IoU between the voted box and the reference box.
    scored['iou'] = scored[box_pair_cols].apply(intersection_over_union, axis=1)
    per_user = scored.groupby('userId')[['iou']].median()
    return per_user.reset_index()

In [92]:
def correct_by_iou(votez, userz_iou, threshold=0.1):
    """Pull every vote towards the consensus of the most reliable voters.

    For each item, the votes of users whose IoU score is within ``threshold``
    of the best score on that item are averaged. Every vote is then blended
    with that average: ``vote * iou + average * (1 - iou)``, where ``iou`` is
    the voting user's score.

    Parameters
    ----------
    votez : pandas.DataFrame
        Votes with ``userId``, ``itemId``, ``Xmin``, ``Ymin``, ``Xmax``, ``Ymax``.
    userz_iou : pandas.DataFrame
        Per-user reliability with columns ``userId`` and ``iou``.
    threshold : float, default 0.1
        Maximum distance from the best per-item score for a vote to take part
        in the consensus average.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``votez`` and blended coordinates.
        ``votez`` itself is not modified.
    """
    box_cols = ['Xmin', 'Ymin', 'Xmax', 'Ymax']

    # Users without a fitted score get 0, i.e. their vote is fully replaced
    # by the consensus box in the blend below.
    votes_with_user_iou = votez.merge(userz_iou, on='userId', how='left')
    votes_with_user_iou['iou'] = votes_with_user_iou['iou'].fillna(0)

    # Best user score available on each item.
    max_iou_per_item = votes_with_user_iou.groupby('itemId')[['iou']].max().reset_index()
    with_item_max = votes_with_user_iou.merge(max_iou_per_item, on='itemId', suffixes=('', '_max'))

    # Keep only the votes cast by (near-)best users. Selecting with a boolean
    # mask and never dropping in place avoids mutating a slice of another frame.
    is_trusted = (with_item_max['iou_max'] - with_item_max['iou']).abs() < threshold
    trusted_votes = with_item_max.loc[is_trusted]

    # Consensus box of the trusted votes, per item.
    means = trusted_votes.groupby('itemId')[box_cols].mean().reset_index()
    merged_iou_means = votes_with_user_iou.merge(means, on='itemId', suffixes=('', '_mean'))

    weight = merged_iou_means['iou']
    fixed = merged_iou_means.drop(columns=['iou'] + [col + '_mean' for col in box_cols])
    for col in box_cols:
        fixed[col] = merged_iou_means[col] * weight + merged_iou_means[col + '_mean'] * (1 - weight)
    return fixed

## Коррекция corner diff per user (CDU)

In [20]:
def calc_cdu(votez, answerz, userz_iou_model, threshold=0.3):
    """Fit the per-user corner correction factors (CDU model).

    For every vote the ratio ``<col>_true / <col>`` is computed for each of
    the four corner columns. Ratios outside ``[1 - threshold, 1 + threshold]``
    are treated as unreliable and reset to the neutral value 1. The model is
    the per-user median of those ratios.

    Parameters
    ----------
    votez : pandas.DataFrame
        Votes with ``userId``, ``itemId``, ``Xmin``, ``Ymin``, ``Xmax``, ``Ymax``.
    answerz : pandas.DataFrame
        Reference boxes with ``itemId`` and the four ``*_true`` columns.
    userz_iou_model : pandas.DataFrame
        Frame with a ``userId`` column. It is inner-joined on ``userId``, so
        only users present in it are kept; its other columns are not used.
    threshold : float, default 0.3
        Maximum accepted relative deviation of a ratio from 1.

    Returns
    -------
    pandas.DataFrame
        One row per ``userId`` with columns ``Xmin_diff``, ``Ymin_diff``,
        ``Xmax_diff`` and ``Ymax_diff`` holding the median ratios.
    """
    box_cols = ['Xmin', 'Ymin', 'Xmax', 'Ymax']
    diff_cols = [col + '_diff' for col in box_cols]

    df_full = votez.merge(answerz, on='itemId').merge(userz_iou_model, on='userId')
    for col, diff_col in zip(box_cols, diff_cols):
        ratio = df_full[col + '_true'] / df_full[col]
        # Strict comparisons: a ratio exactly on a bound is kept. NaN ratios
        # fail both comparisons, stay NaN and are skipped by the median.
        out_of_range = (ratio < 1 - threshold) | (ratio > 1 + threshold)
        df_full[diff_col] = ratio.mask(out_of_range, 1)

    # Return the per-user medians; previously they were computed and discarded
    # while the row-level working frame was returned instead.
    return df_full.groupby('userId')[diff_cols].median().reset_index()


#calc_cdu(votes, answers, calc_iou(votes, answers))

In [298]:
def correct_by_cdu(votez, userz_cdu):
    """Apply per-user corner correction factors to the votes.

    Parameters
    ----------
    votez : pandas.DataFrame
        Votes with ``userId``, ``Xmin``, ``Ymin``, ``Xmax``, ``Ymax``.
    userz_cdu : pandas.DataFrame
        One row per ``userId`` with multiplicative factors in the columns
        ``Xmin_diff``, ``Ymin_diff``, ``Xmax_diff`` and ``Ymax_diff``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``votez`` where every corner has been
        multiplied by the voting user's factor. Users without a factor are
        left unchanged (factor 1).
    """
    box_cols = ['Xmin', 'Ymin', 'Xmax', 'Ymax']
    merged_diff = votez.merge(userz_cdu, on='userId', how='left')
    fixed = merged_diff.drop(columns=[col + '_diff' for col in box_cols])
    for col in box_cols:
        # The factors are ratios with 1 as the neutral value, so they must be
        # multiplied in; adding them would shift every corner by about one unit.
        factor = merged_diff[col + '_diff'].fillna(1)
        fixed[col] = merged_diff[col] * factor
    return fixed

# Расчёт метрики Intersection over Union (IoU)

In [100]:
def fit_models(votez, answerz):
    """Fit both per-user models on labelled votes.

    Returns a ``(iou_model, cdu_model)`` tuple: the result of ``calc_iou`` and
    the result of ``calc_cdu``, the latter being fitted with the IoU model as
    its third argument.
    """
    iou_model = calc_iou(votez, answerz)
    return iou_model, calc_cdu(votez, answerz, iou_model)


def transform_data(votez, userz_iou_model, userz_cdu_model):
    """Run the correction pipeline (IQR, then IoU blending) on a votes frame.

    ``userz_cdu_model`` is accepted but not used: the CDU correction step is
    not part of the pipeline at the moment. Each stage receives a copy of its
    input.
    """
    iqr_corrected = correct_by_iqr(votez.copy())
    return correct_by_iou(iqr_corrected.copy(), userz_iou_model)

In [101]:
# Hold out 25% of the labelled items, then slice the votes to match each part.
train_answers, test_answers = train_test_split(answers, test_size=0.25, random_state=42)
train_votes, test_votes = (
    votes.merge(split_answers[["itemId"]], on='itemId')
    for split_answers in (train_answers, test_answers)
)
# The per-user models are fitted on the training part only.
users_iou_model, users_cdu_model = fit_models(train_votes, train_answers)

In [102]:
# Apply the same fitted models to both parts of the split.
train_transformed, test_transformed = (
    transform_data(split_votes, users_iou_model, users_cdu_model)
    for split_votes in (train_votes, test_votes)
)

In [103]:
# Collapse the corrected votes into one box per item by averaging each corner.
train_quorum, test_quorum = (
    transformed.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()
    for transformed in (train_transformed, test_transformed)
)

In [104]:
# Attach the reference box to every predicted box.
train_data, test_data = (
    quorum.merge(truth, on=["itemId"])
    for quorum, truth in ((train_quorum, train_answers), (test_quorum, test_answers))
)

In [105]:
# Score each predicted box against its reference box.
train_data["iou"], test_data["iou"] = (
    scored[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)
    for scored in (train_data, test_data)
)

In [106]:
# Mean IoU on the training part and on the held-out part, as "train|test".
print(train_data["iou"].mean(), test_data["iou"].mean(), sep='|')

0.5372553930485575|0.5178115156035272


## История изменений
| Train            | Test             | How               |
|------------------|------------------|-------------------|
|0.5430922962075330|0.5169763275452912| IQR -> CDU -> IOU |
|0.5372553930485575|0.5178115156035272| IQR -> IOU        |
|0.5519478651834866|0.5098729733114180| IOU               |
|0.5124386740459799|0.5108938312166782| IQR               |
|0.5077808282151023|0.5001559177275294|                   |

# Экспорт тестовых данных

In [107]:
# Votes for the items that have to be predicted.
votes_test = pd.read_csv("test_data.csv")
# Refit the per-user models on all labelled data. Note that this rebinds
# `users_iou_model` / `users_cdu_model`, which earlier cells fitted on the
# training split only.
users_iou_model, users_cdu_model = fit_models(votes, answers)
# Correct the votes, then average them into one predicted box per item.
votes_quorum = (
    transform_data(votes_test, users_iou_model, users_cdu_model)
    .groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']]
    .mean()
    .reset_index()
)
votes_quorum.columns = ['itemId', 'Xmin_true', 'Ymin_true', 'Xmax_true','Ymax_true']
# `header` is documented as bool or list of str, so pass False explicitly
# instead of relying on None being falsy. The file is written without a header row.
votes_quorum.to_csv('AIatsuk_predictions.csv', index=False, header=False)