In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw competition tables.
# Row labels 1024 and 2041 are removed from both the training features and the
# training targets (presumably so the two frames stay row-aligned -- confirm
# why these two rows are excluded).
train = pd.read_csv('../data/chugun_train.csv')
train = train.drop(index=[1024, 2041])

test = pd.read_csv('../data/chugun_test.csv')

target = pd.read_csv('../data/target_train.csv')
target = target.drop(index=[1024, 2041])

In [4]:
%load_ext autoreload

In [5]:
%autoreload 2

from features_chugun import process_data

In [6]:
def metric(answers, user_csv, c_tol=0.02, t_tol=20):
    """Return hit rates of predictions against the true values.

    A row is a hit on column ``'C'`` when the absolute error is strictly below
    ``c_tol`` and a hit on column ``'TST'`` when the absolute error is strictly
    below ``t_tol``.

    Parameters
    ----------
    answers : mapping of column name -> array-like (e.g. a DataFrame)
        Ground truth; must provide ``'C'`` and ``'TST'``.
    user_csv : mapping of column name -> array-like (e.g. a DataFrame)
        Predictions; must provide ``'C'`` and ``'TST'``. Values are combined
        with ``answers`` by NumPy broadcasting, so a single-row frame is
        scored as a constant prediction for every row.
    c_tol : float, default 0.02
        Exclusive absolute-error tolerance for ``'C'``.
    t_tol : float, default 20
        Exclusive absolute-error tolerance for ``'TST'``.

    Returns
    -------
    tuple
        ``(c_hit_rate, t_hit_rate, mean_hit_rate)`` where the last item is the
        average of the first two. All three are fractions of the number of
        rows in ``answers['C']``.
    """
    delta_c = np.abs(np.asarray(answers['C']) - np.asarray(user_csv['C']))
    hit_rate_c = np.int64(delta_c < c_tol)

    delta_t = np.abs(np.asarray(answers['TST']) - np.asarray(user_csv['TST']))
    hit_rate_t = np.int64(delta_t < t_tol)

    # Normalise by the number of ground-truth rows, not by the (possibly
    # single-row) prediction frame.
    N = np.size(answers['C'])

    return (
        np.sum(hit_rate_c) / N,
        np.sum(hit_rate_t) / N,
        np.sum(hit_rate_c + hit_rate_t) / 2 / N,
    )

In [7]:
# Build the feature table: take the first item returned by process_data and
# discard the 'total_seconds' and 'NPLV' columns.
# Note: rows 1024 and 2041 are already dropped from `target` when it is loaded,
# so no further alignment of the target is done here.
data = process_data(train, test)[0].drop(columns=['total_seconds', 'NPLV'])

In [8]:
from collections import deque

In [22]:
# Sliding-window search over the sorted C targets for the constant prediction
# that puts the most targets within `eps` of it.
#
# `dq` holds a run of consecutive sorted values whose spread is below 2 * eps.
# Any such run is covered by a single constant placed at its midpoint, so the
# midpoint (not the right-most value) is what gets recorded as `best_const`.
# The eviction test uses `>=` because `metric` counts a hit only when the
# error is strictly below the tolerance.
# NOTE: values sitting exactly on the boundary are subject to floating-point
# rounding; confirm the final hit count with `metric`.
dq = deque()
eps = 0.02
best_len = 0
best_const = 0
for x in target.C.sort_values().values:
    # Drop values from the left until the window fits inside a 2 * eps span.
    while len(dq) > 0 and x - dq[0] >= 2 * eps:
        dq.popleft()
    dq.append(x)
    if len(dq) > best_len:
        best_len = len(dq)
        best_const = (dq[0] + x) / 2

In [26]:
# Exhaustive search for the best constant C prediction: every observed target
# value is tried as the constant and the one with the highest C hit rate
# (first element returned by `metric`) is kept. Ties keep the smallest value
# because candidates are visited in ascending order and the comparison is
# strict. The TST value of 1590 is only a placeholder required by `metric`;
# it has no effect on the C hit rate.
eps = 0.02
best_c = 0
best_c_score = 0

for candidate in target['C'].sort_values().to_numpy():
    best_consts = pd.DataFrame({'TST': [1590], 'C': [candidate]})
    c_rate = metric(target, best_consts)[0]
    if c_rate > best_c_score:
        best_c, best_c_score = candidate, c_rate

In [38]:
# Display the constant C value selected by the exhaustive search.
best_c

0.051

In [34]:
# Exhaustive search for the best constant TST prediction: every observed
# target value is tried as the constant and the one with the highest TST hit
# rate (second element returned by `metric`) is kept. Ties keep the smallest
# value because candidates are visited in ascending order and the comparison
# is strict.
eps = 0.02
best_t = 0
best_t_score = 0

for x in target.TST.sort_values().values:
    # The C column is required by `metric` but does not influence the TST hit
    # rate; use the constant found by the C search rather than a hand-copied
    # literal so this cell cannot go stale when the data changes.
    best_consts = pd.DataFrame([{'TST': x, 'C': best_c}])
    c, t, tot = metric(target, best_consts)
    if t > best_t_score:
        best_t_score = t
        best_t = x

In [36]:
# Display the constant TST value selected by the exhaustive search.
best_t

1657

In [46]:
# Score the pair of constants found by the two searches on the training
# targets. The searched values are used directly (instead of literals copied
# from earlier outputs) so the reported score always matches the current data.
# Output is (C hit rate, TST hit rate, mean of the two).
best_consts = pd.DataFrame([{'TST': best_t, 'C': best_c}])
metric(target, best_consts)

(0.6569626394953906, 0.4949053857350801, 0.5759340126152354)

In [39]:
def make_submission(predictionC, predictionTST, name='constant'):
    """Write a submission file built from the sample submission template.

    Reads ``../data/sample_submission.csv``, overwrites (or adds) the ``'C'``
    and ``'TST'`` columns with the given predictions and saves the result to
    ``../data/submissions/{name}.csv`` without the index.

    Parameters
    ----------
    predictionC : scalar or array-like
        Values for the ``'C'`` column. A scalar is broadcast to every row.
    predictionTST : scalar or array-like
        Values for the ``'TST'`` column. A scalar is broadcast to every row.
    name : str, default 'constant'
        File name (without extension) of the submission to write.

    Returns
    -------
    None
    """
    # Local import: the notebook's import cell does not load `os`.
    import os

    submission = pd.read_csv('../data/sample_submission.csv')
    submission['C'] = predictionC
    submission['TST'] = predictionTST

    out_path = f'../data/submissions/{name}.csv'
    # to_csv does not create missing directories; make sure the target folder
    # exists so the call also works on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    submission.to_csv(out_path, index=False)

In [41]:
# Write a submission that uses the best constant for C and 0 for TST.
# NOTE(review): presumably a probe to isolate the C hit rate on the test set,
# since TST = 0 should never be a hit -- confirm.
make_submission(predictionC=best_c, predictionTST=0, name='not_best_consts')

In [43]:
# NOTE(review): both literals appear to be scores copied by hand from an
# external source (presumably the leaderboard) -- confirm their origin.
# The difference is doubled because the overall metric is the mean of the C
# and TST hit rates.
(0.5025641025641026 - 0.2743589743589744) * 2 # test t constant score

0.45641025641025634

In [44]:
# NOTE(review): the literal appears to be a score copied by hand (presumably
# the leaderboard result of the C-only constant submission) -- confirm.
# It is doubled because the overall metric is the mean of the C and TST hit
# rates.
0.2743589743589744 * 2 # test c constant score

0.5487179487179488

In [50]:
# Hybrid submission: start from the existing 'two_ridge_boxcox' submission,
# replace its TST column with the constant best_t, and save it under a new
# file name.
submission = pd.read_csv('../data/submissions/two_ridge_boxcox.csv').assign(TST=best_t)
submission.to_csv('../data/submissions/ridgec_constt_boxcox.csv', index=False)