In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import csv
import pandas as pd # not key to functionality of kernel
import glob
import timeit
import sys
import csv
import string
import torch
import string
import random
import mean_average_precision_calculator as map_calculator
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger
from bayes_opt.util import load_logs
from bayes_opt.event import Events
from absl import app
from absl import flags
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# Raise the csv module's per-field size limit as far as the platform allows:
# the 'Segments' column of a prediction file can hold a very long string.
maxInt = sys.maxsize

while True:
    # Shrink the candidate limit by a factor of 10 for as long as the C layer
    # rejects it with OverflowError (happens where a C long is 32 bits).
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Integer floor division: true division would go through a float and
        # lose precision on values this large.
        maxInt //= 10

In [2]:
"""Eval mAP@N metric from inference file."""

class Labels(object):
  """Holds the ground-truth segment labels and (de)serializes them.

  The ground truth is a mapping (segment_id, class_id) -> label_score.
  On disk every entry is one text line: ``segment_id,class_id,label_score``.
  """

  def __init__(self, labels):
    """Stores the mapping.

    Args:
      labels: dict mapping (segment_id, class_id) tuples to a label score.
    """
    self._labels = labels

  @property
  def labels(self):
    """Return the ground truth mapping. See class docstring for details."""
    return self._labels

  def to_file(self, file_name):
    """Materialize the GT mapping to file, one entry per line.

    Args:
      file_name: path of the file to write (opened through tf.gfile).
    """
    with tf.gfile.Open(file_name, "w") as fobj:
      for (seg_id, label), score in self._labels.items():
        fobj.write("%s,%s,%s\n" % (seg_id, label, score))

  @classmethod
  def from_file(cls, file_name):
    """Read the GT mapping from a file written by `to_file`.

    Blank lines are skipped. The line is split from the right so that only
    the last two fields are interpreted as class id and score.

    Args:
      file_name: path of the cached label file (opened through tf.gfile).

    Returns:
      A Labels object.

    Raises:
      ValueError: if a non-blank line does not have three fields, or the
        class id / score fields are not numeric.
    """
    labels = {}
    with tf.gfile.Open(file_name) as fobj:
      for line in fobj:
        line = line.strip()
        if not line:
          continue
        seg_id, label, score = line.rsplit(",", 2)
        labels[(seg_id, int(label))] = float(score)
    return cls(labels)


def read_labels(data_pattern, cache_path="", max_validate_files=300):
  """Read segment labels from TFRecords.

  If `cache_path` names an existing file the labels are loaded from it and no
  TFRecord is read. Otherwise the records are parsed eagerly and, when
  `cache_path` is set, the result is written there.

  When `data_pattern` contains the substring 'validate', the pattern is NOT
  globbed: the shard names are generated from the text before the first '*'
  plus a two-character suffix, and only the first `max_validate_files` of
  them are read.

  Args:
    data_pattern: the data pattern to the TFRecords.
    cache_path: the cache path for the label file.
    max_validate_files: cap on the number of generated validate shards;
      None reads every generated name.

  Returns:
    a Labels object.
  """
  if cache_path and tf.gfile.Exists(cache_path):
    tf.logging.info("Reading cached labels from %s..." % cache_path)
    return Labels.from_file(cache_path)
  tf.enable_eager_execution()

  if 'validate' in data_pattern:
    data_paths = _validate_shard_paths(data_pattern, max_validate_files)
  else:
    data_paths = tf.gfile.Glob(data_pattern)
    print(data_paths)

  ds = tf.data.TFRecordDataset(data_paths, num_parallel_reads=50)
  context_features = {
      "id": tf.FixedLenFeature([], tf.string),
      "segment_labels": tf.VarLenFeature(tf.int64),
      "segment_start_times": tf.VarLenFeature(tf.int64),
      "segment_scores": tf.VarLenFeature(tf.float32)
  }

  def _parse_se_func(sequence_example):
    return tf.parse_single_sequence_example(
        sequence_example, context_features=context_features)

  ds = ds.map(_parse_se_func)
  rated_labels = {}
  tf.logging.info("Reading labels from TFRecords...")
  last_batch = 0
  batch_size = 5000  # log progress roughly every 5000 collected labels
  for cxt_feature_val, _ in ds:
    video_id = cxt_feature_val["id"].numpy().decode("utf-8")
    segment_labels = cxt_feature_val["segment_labels"].values.numpy()
    segment_start_times = cxt_feature_val["segment_start_times"].values.numpy()
    segment_scores = cxt_feature_val["segment_scores"].values.numpy()
    for label, start_time, score in zip(segment_labels, segment_start_times,
                                        segment_scores):
      # Segment id format is "<video id>:<start time>".
      rated_labels[("%s:%d" % (video_id, start_time), label)] = score
    batch_id = len(rated_labels) // batch_size
    if batch_id != last_batch:
      tf.logging.info("%d examples processed.", len(rated_labels))
      last_batch = batch_id
  tf.logging.info("Finish reading labels from TFRecords...")
  labels_obj = Labels(rated_labels)
  if cache_path:
    tf.logging.info("Caching labels to %s..." % cache_path)
    labels_obj.to_file(cache_path)
  return labels_obj


def _validate_shard_paths(data_pattern, max_files):
  """Generates the validate shard paths that `read_labels` reads.

  Suffixes are every two-character combination over 0-9, a-z, A-Z, ordered
  with the FIRST character varying fastest ("00", "10", "20", ...).
  """
  alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
  prefix = data_pattern.split('*')[0]
  suffixes = [first + second for second in alphabet for first in alphabet]
  return [prefix + suffix + '.tfrecord' for suffix in suffixes[:max_files]]


def read_segment_predictions(file_path, labels, top_n=None):
  """Read segment predictions.

  Each non-blank line is ``<class id>,<space separated segment ids>``.
  Predictions whose (segment id, class id) pair is not in the ground truth
  are dropped.

  Args:
    file_path: the submission file path.
    labels: a Labels object containing the eval labels.
    top_n: the per-class class capping (applied before the ground-truth
      filter); None or 0 means no cap.

  Returns:
    a dict mapping each class id (int) to its list of kept segment ids.

  Raises:
    ValueError: if a line does not have exactly two comma-separated fields
      or its class id is not an integer.
  """
  cls_preds = {}  # A label_id to pred list mapping.
  with tf.gfile.Open(file_path) as fobj:
    tf.logging.info("Reading predictions from %s..." % file_path)
    for line in fobj:
      # Strip the line terminator: otherwise the last segment id keeps its
      # "\n" and can never match a ground-truth key.
      line = line.strip()
      if not line:
        continue
      label_field, pred_ids_val = line.split(",")
      label_id = int(label_field)
      pred_ids = pred_ids_val.split(" ")
      if top_n:
        pred_ids = pred_ids[:top_n]
      cls_preds[label_id] = [
          pred_id for pred_id in pred_ids
          if (pred_id, label_id) in labels.labels
      ]
      if len(cls_preds) % 50 == 0:
        tf.logging.info("Processed %d classes..." % len(cls_preds))
    tf.logging.info("Finish reading predictions.")
  return cls_preds

In [4]:
def prep_sort_files(sub_files):
    """Rewrite each prediction file in the class order of the sample submission.

    For every path in `sub_files` the header-less prediction CSV is read with
    the sample submission's column names, its 'Segments' values are copied
    onto the sample submission rows of the same 'Class', and the result is
    saved as ``<SUBDIR_SINGLE_MODEL>/sorted/<name>_sorted.csv``. Classes that
    are missing from the prediction keep the sample submission's value.

    Reads the module-level `SUBDIR_SINGLE_MODEL` and './sample_submission.csv'.

    Args:
        sub_files: iterable of prediction CSV paths ('/' or '\\' separated).
    """
    out_dir = os.path.join(SUBDIR_SINGLE_MODEL, 'sorted')
    os.makedirs(out_dir, exist_ok=True)

    # The template never changes, so read it once instead of once per file.
    template = pd.read_csv('./sample_submission.csv')
    column_names = template.columns

    for file in sub_files:
        print(file)
        # Fresh copy per file: rows are overwritten below.
        sample_sub = template.copy().set_index('Class')

        pred = pd.read_csv(file, names=column_names)
        print(pred.shape)
        print(pred.head())
        pred = pred.set_index('Class')
        for cls in sample_sub.index:
            if cls in pred.index:
                sample_sub.loc[cls, 'Segments'] = pred.loc[cls, 'Segments']

        # Accept both separators so the base name is found on any OS
        # (glob on Windows mixes '/' and '\\').
        stem = file.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        sample_sub.reset_index().to_csv(
            os.path.join(out_dir, stem + '_sorted.csv'), index=False)

def combine_tta(files, suffixes=('_oof_sorted.csv',
                                 '_oof_shift1_sorted.csv',
                                 '_oof_shift-1_sorted.csv')):
    """Blend the TTA variants of each model into one ranking file per model.

    For every model name the files ``SUBDIR_SINGLE_MODEL + name + suffix`` are
    read (missing ones are skipped) and blended with equal weight: a segment
    at 0-based rank i contributes ``w0 / (i + w0)`` and, per class, segments
    are re-ranked by summed contribution. The result is written to
    ``<SUBDIR_SINGLE_MODEL>/sorted_single/<name>_all<count>.csv`` where
    <count> is the number of variants actually found.

    Reads the module-level `SUBDIR_SINGLE_MODEL`, `Hlabel`, `Htarget`, `npt`.
    All variant files of a model must list the classes in the same rows after
    sorting by class label.

    Args:
        files: iterable of model names (file name without the TTA suffix).
        suffixes: file-name suffixes of the TTA variants to look for.
    """
    w0 = 3.72384689385059  # rank-decay constant used for the TTA blend
    # Loop-invariant: weight of each rank. Ranks >= npt are ignored.
    place_weights = [w0 / (i + w0) for i in range(npt)]

    out_dir = os.path.join(SUBDIR_SINGLE_MODEL, 'sorted_single')
    os.makedirs(out_dir, exist_ok=True)

    for file in files:
        print(file)
        sub = []
        for i, suffix in enumerate(suffixes):
            sub_file = SUBDIR_SINGLE_MODEL + file + suffix
            print("Reading {}: {}". format(i, sub_file))
            try:
                with open(sub_file, "r", newline='') as fobj:
                    rows = sorted(csv.DictReader(fobj),
                                  key=lambda d: str(d[Hlabel]))
            except OSError:
                # A missing TTA variant is expected; anything else (e.g. a
                # malformed file) should surface instead of being swallowed.
                print("Can't find  it")
                continue
            sub.append(rows)

        count = len(sub)
        if count == 0:
            print(f'No TTA file found for {file}; skipping')
            continue
        sub_weight = [1] * count
        w_sum = count

        start = timeit.default_timer()
        print('Start submission generation')
        print(f'W0: {w0}; Weight Percentage: {[i/w_sum*100 for i in sub_weight]}')

        result = {'Class': [], 'Segments': []}
        for p in range(len(sub[0])):
            target_weight = {}
            for ranking, weight in zip(sub, sub_weight):
                entry = ranking[p]
                for trgt, place_weight in zip(entry[Htarget].split(' '),
                                              place_weights):
                    target_weight[trgt] = (target_weight.get(trgt, 0)
                                           + place_weight * weight)
            tops_trgt = sorted(target_weight, key=target_weight.get,
                               reverse=True)[:npt]
            # Label taken from the last variant read, as before.
            result['Class'].append(entry[Hlabel])
            result['Segments'].append(" ".join(tops_trgt))

        result_df = pd.DataFrame.from_dict(result)
        print(result_df.shape)

        print(f'Finished submission oof generation! Spent {timeit.default_timer() - start}s')

        result_df.to_csv(os.path.join(out_dir, f'{file}_all{count}.csv'),
                         index=False)
        
def submission_generate(sub, w1, w2, w3, w4, w5, w6, w7, w8, w9):
    """Blend per-model rankings into one ranking per class.

    A segment at 0-based rank i in model s contributes
    ``w0 / (i + w0) * w_s``; per class, segments are ordered by summed
    contribution and the top `npt` are kept.

    Reads the module-level `w0`, `npt`, `Hlabel` and `Htarget`.

    Args:
        sub: list with one entry per model; each entry is a list of row dicts
            (keys `Hlabel` and `Htarget`), all in the same class order.
        w1..w9: blend weight of model 1..9. Only the first ``len(sub)``
            weights are used.

    Returns:
        DataFrame with columns 'Class' and 'Segments' (space-joined ids).

    Raises:
        ValueError: if `sub` holds more models than there are weights.
        IndexError: if `sub` is empty.
    """
    start = timeit.default_timer()
    print('Start submission generation')

    sub_weight = [w1, w2, w3, w4, w5, w6, w7, w8, w9]
    # Use the actual number of models instead of the notebook-global `lg`,
    # which may be stale after another cell re-assigned it.
    if len(sub) > len(sub_weight):
        raise ValueError('%d models given but only %d weights'
                         % (len(sub), len(sub_weight)))
    w_sum = w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9

    # Weight of each rank; ranks >= npt are ignored by the zip below.
    place_weights = [w0 / (i + w0) for i in range(npt)]

    print(f'W0: {w0}; Weight Percentage: {[i/w_sum*100 for i in sub_weight]}')

    result = {'Class': [], 'Segments': []}
    for p in range(len(sub[0])):
        target_weight = {}
        for ranking, weight in zip(sub, sub_weight):
            entry = ranking[p]
            for trgt, place_weight in zip(entry[Htarget].split(' '),
                                          place_weights):
                target_weight[trgt] = (target_weight.get(trgt, 0)
                                       + place_weight * weight)
        tops_trgt = sorted(target_weight, key=target_weight.get,
                           reverse=True)[:npt]
        # Label taken from the last model's row, as before.
        result['Class'].append(entry[Hlabel])
        result['Segments'].append(" ".join(tops_trgt))

    result_df = pd.DataFrame.from_dict(result)

    print(f'Finished submission oof generation! Spent {timeit.default_timer() - start}s')
    return result_df

# I. Bayesian Optimization for OOF Files

## 1.1 Prepare Sorted OOF_Files

In [None]:
# Folder that contains the raw out-of-fold (OOF) prediction files.
SUBDIR_SINGLE_MODEL = './oof/'
sub_files = glob.glob('{}*.csv'.format(SUBDIR_SINGLE_MODEL))

# Re-order every file by class id so that later ensembling can align rows.
prep_sort_files(sub_files)

## 1.2 Combine Different TTA Inference Files Into One File

In [None]:
SUBDIR_SINGLE_MODEL = './oof/sorted/'
sub_files = glob.glob(SUBDIR_SINGLE_MODEL+'*.csv')

# Model name = file name up to '_oof'. Both path separators are accepted so
# the base name is found on any OS (the original split on '\\' only worked
# on Windows). A set removes the duplicates produced by the TTA variants;
# sorting makes the processing order deterministic.
files = sorted({
    file.replace('\\', '/').rsplit('/', 1)[-1].split('_oof')[0]
    for file in sub_files
})

Hlabel = 'Class'       # CSV column holding the class id
Htarget = 'Segments'   # CSV column holding the space-separated ranking
npt = 100000 # number of places in target

combine_tta(files)

In [None]:
# Read data for bayesian optimization
SUBDIR_SINGLE_MODEL = './oof/sorted/sorted_single/'
# Sorted: the blend weights w1..w9 are matched to files by position, and the
# order returned by glob is not guaranteed.
sub_files = sorted(glob.glob(SUBDIR_SINGLE_MODEL+'*.csv'))

lg = len(sub_files)
sub = []
for i, file in enumerate(sub_files):
    ## input files ##
    print("Reading {}: {}". format(i, file))
    # `with` closes the handle; rows are sorted by class label (as string) so
    # that every model lists the classes in the same order.
    with open(file, "r", newline='') as fobj:
        sub.append(sorted(csv.DictReader(fobj), key=lambda d: str(d[Hlabel])))

# Use the same valid data pattern used for training. If train on the google cloud, you need to download them manually
valid_data_pattern = './inputs/data/frame/3/validate_data/validate*.tfrecord'

eval_labels = read_labels(valid_data_pattern)

# Number of positively rated segments per class, needed by the mAP calculator.
positive_counter = {}
for (_, label_id), score in eval_labels.labels.items():
    if score > 0:
        positive_counter[label_id] = positive_counter.get(label_id, 0) + 1

## 1.3 Run Bayesian Optimization

In [19]:
def run_ensemble_bayesian(w0, w1, w2, w3, w4, w5, w6, w7, w8, w9):
    """Objective for Bayesian optimization: blend the OOF rankings, return mAP.

    A segment at 0-based rank i in model s contributes
    ``w0 / (i + w0) * w_s``; per class, segments are ordered by summed
    contribution. The blended ranking is then scored against `eval_labels`
    with `map_calculator.MeanAveragePrecisionCalculator`.

    Reads the module-level `sub`, `npt`, `Hlabel`, `Htarget`, `eval_labels`
    and `positive_counter`.

    Args:
        w0: rank-decay constant (>= 0).
        w1..w9: blend weight of model 1..9. Only the first ``len(sub)``
            weights are used.

    Returns:
        Mean of ``peek_map_at_n()`` for the blended ranking.

    Raises:
        ValueError: if `sub` is empty or holds more models than weights.
    """
    start = timeit.default_timer()
    print('Start submission generation')

    sub_weight = [w1, w2, w3, w4, w5, w6, w7, w8, w9]
    if not sub:
        raise ValueError('no prediction files loaded in `sub`')
    if len(sub) > len(sub_weight):
        raise ValueError('%d models given but only %d weights'
                         % (len(sub), len(sub_weight)))

    # Rank 0 always weighs 1 (w0 / w0); written explicitly so that w0 == 0,
    # which lies on the search boundary, does not divide by zero.
    place_weights = [1.0 if i == 0 else w0 / (i + w0) for i in range(npt)]

    w_sum = sum(sub_weight)
    print(f'W0: {w0}; Weight Percentage: {[w / w_sum * 100 for w in sub_weight]}')

    result = {}
    ## output file ##
    for p in range(len(sub[0])):
        target_weight = {}
        for ranking, weight in zip(sub, sub_weight):
            entry = ranking[p]
            # zip stops at npt places, so longer rankings are truncated.
            for trgt, place_weight in zip(entry[Htarget].split(' '),
                                          place_weights):
                target_weight[trgt] = (target_weight.get(trgt, 0)
                                       + place_weight * weight)
        result[entry[Hlabel]] = sorted(target_weight, key=target_weight.get,
                                       reverse=True)[:npt]

    print(f'Finished submission generation! Spent {timeit.default_timer() - start}s')

    start = timeit.default_timer()
    print('Start evaluation')

    eval_top_n = 100000  # per-class cap applied before scoring
    seg_preds = {}
    for label_id, ranked in result.items():
        class_id = int(label_id)
        # Only segments that were actually rated take part in the metric.
        seg_preds[class_id] = [
            pred_id for pred_id in ranked[:eval_top_n]
            if (pred_id, class_id) in eval_labels.labels
        ]

    map_cal = map_calculator.MeanAveragePrecisionCalculator(len(seg_preds))
    seg_labels = []
    seg_scored_preds = []
    num_positives = []
    for label_id in sorted(seg_preds):
        class_preds = seg_preds[label_id]
        seg_labels.append(
            [eval_labels.labels[(pred, label_id)] for pred in class_preds])
        # Turn the ranking into descending pseudo-scores 1, (n-1)/n, ..., 1/n.
        n_preds = len(class_preds)
        seg_scored_preds.append(
            [float(x) / n_preds for x in range(n_preds, 0, -1)])
        num_positives.append(positive_counter.get(label_id, 0))
    map_cal.accumulate(seg_scored_preds, seg_labels, num_positives)
    map_at_n = np.mean(map_cal.peek_map_at_n())
    print(f'Finished evaluation with MAP {map_at_n}! Spent {timeit.default_timer() - start}s')
    print('')

    return map_at_n

In [21]:
# Search space of the optimizer: w0 (rank-decay constant) is searched in
# [0, 10], each of the nine model weights w1..w9 in [0, 200].
optimizer = BayesianOptimization(
    f=run_ensemble_bayesian,
    pbounds=dict(
        [("w0", [0, 10])]
        + [("w%d" % idx, [0, 200]) for idx in range(1, 10)]
    ),
    verbose=2,
    random_state=7,
)

In [None]:
# uncomment to load previous logs and continue tuning
#from bayes_opt.util import load_logs
#load_logs(optimizer, logs=["./logs.json"])
#print("New optimizer is now aware of {} points.".format(len(optimizer.space)))

# Log every optimization step to ./logs.json. The logger has to be subscribed
# BEFORE maximize() is called.
logger = JSONLogger(path="./logs.json")
# NOTE(review): `OPTMIZATION_STEP` (sic) looks like the spelling used by older
# bayes_opt releases - confirm against the installed version.
optimizer.subscribe(Events.OPTMIZATION_STEP, logger)
# 3 random probes, then 1000 guided iterations; each one evaluates
# run_ensemble_bayesian once.
optimizer.maximize(
    init_points=3,
    n_iter=1000,
)

## 1.4 Generate Final Ensembled OOF Files

In [None]:
# Best point found by the optimizer: {'target': ..., 'params': {...}}.
print(optimizer.max)
params = optimizer.max['params']

# Expose the tuned values as the globals w0..w9 (submission_generate reads
# w0 from module scope).
w0, w1, w2, w3, w4, w5, w6, w7, w8, w9 = (
    params['w%d' % idx] for idx in range(10)
)

result_df = submission_generate(sub, w1, w2, w3, w4, w5, w6, w7, w8, w9)
print(result_df.shape)

# The 'blend' sub-folder must already exist.
result_df.to_csv(SUBDIR_SINGLE_MODEL + '/blend/blend1_9_oof.csv', index=False)

# II. Final Submission File Generation

## 2.1 Prepare Sorted Submission Files

In [None]:
# Folder that contains the raw submission (test-set prediction) files.
SUBDIR_SINGLE_MODEL = './submission/'
sub_files = glob.glob('{}*.csv'.format(SUBDIR_SINGLE_MODEL))

# Re-order every file by class id so that later ensembling can align rows.
prep_sort_files(sub_files)

## 2.2 Combine Different TTA Inference Files Into One File

In [None]:
SUBDIR_SINGLE_MODEL = './submission/sorted/'
sub_files = glob.glob(SUBDIR_SINGLE_MODEL+'*.csv')

# Model name = file name up to '_finetune'. Both path separators are accepted
# so the base name is found on any OS (the original split on '\\' only worked
# on Windows). A set removes the duplicates produced by the TTA variants;
# sorting makes the processing order deterministic.
files = sorted({
    file.replace('\\', '/').rsplit('/', 1)[-1].split('_finetune')[0]
    for file in sub_files
})

Hlabel = 'Class'       # CSV column holding the class id
Htarget = 'Segments'   # CSV column holding the space-separated ranking
npt = 100000 # number of places in target

combine_tta(files)

## 2.3 Generate Final Ensembled Submission Files

In [7]:
# Read the per-model submission files that will be blended.
# The original cell first assigned './submission/sorted/sorted_single/' and
# immediately overwrote it; only the effective (machine-specific) path is kept.
SUBDIR_SINGLE_MODEL = 'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/'
# Sorted: the blend weights w1..w9 are matched to files by position, so the
# order must be deterministic and identical to the one used for the OOF files.
sub_files = sorted(glob.glob(SUBDIR_SINGLE_MODEL+'*.csv'))

Hlabel = 'Class'       # CSV column holding the class id
Htarget = 'Segments'   # CSV column holding the space-separated ranking
npt = 100000 # number of places in target

lg = len(sub_files)
sub = []
for i, file in enumerate(sub_files):
    ## input files ##
    print("Reading {}: {}". format(i, file))
    # `with` closes the handle; rows are sorted by class label (as string) so
    # that every model lists the classes in the same order.
    with open(file, "r", newline='') as fobj:
        sub.append(sorted(csv.DictReader(fobj), key=lambda d: str(d[Hlabel])))

Reading 0: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\BertCross_bhidden2_bhead8_milattn_predictions_all3.csv
Reading 1: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\BertCross_bhidden2_bhead8_milmean_predictions_all3.csv
Reading 2: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\Bert_bhidden2_bhead12_milfirst_predictions_all4.csv
Reading 3: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\Bert_bhidden2_bhead12_milmean_predictions_all4.csv
Reading 4: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\Bert_bhidden2_bhead12_predictions_all3.csv
Reading 5: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single\Bert_bhidden3_bhead12_predictions_all3.csv
Reading 6: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/s

In [9]:
# Tuned values copied from a previous optimization run; use
# `optimizer.max['params']` instead to take them from the live optimizer.
params = dict(
    w0=6.058430980648939,
    w1=30.31334691,
    w2=24.89993526,
    w3=99.90311048,
    w4=78.51473969,
    w5=14.28880107,
    w6=70.49505912,
    w7=101.7196348,
    w8=87.66822585,
    w9=1.108667853,
)
print(params)

# Expose the values as the globals w0..w9 (submission_generate reads w0 from
# module scope).
w0, w1, w2, w3, w4, w5, w6, w7, w8, w9 = (
    params['w%d' % idx] for idx in range(10)
)

result_df = submission_generate(sub, w1, w2, w3, w4, w5, w6, w7, w8, w9)
print(result_df.shape)

# The 'blend' sub-folder must already exist.
result_df.to_csv(SUBDIR_SINGLE_MODEL + '/blend/blend_1-9_final.csv', index=False)

{'w0': 6.058430980648939, 'w1': 30.31334691, 'w2': 24.89993526, 'w3': 99.90311048, 'w4': 78.51473969, 'w5': 14.28880107, 'w6': 70.49505912, 'w7': 101.7196348, 'w8': 87.66822585, 'w9': 1.108667853}
Start submission generation
W0: 6.058430980648939; Weight Percentage: [5.956506319304639, 4.892782778715159, 19.630742545819054, 15.427974499502199, 2.8077181355604353, 13.85212482061863, 19.98768559877112, 17.22661449519733, 0.2178508065114347]
Finished submission oof generation! Spent 781.2326618s
(1000, 2)


In [72]:
# Blend weights actually used by this cell. The original cell assigned
# `params` four times and only this last assignment was live; the overwritten
# variants are kept here for reference:
#   8-model variant: w1=14.2888010723968, w2=99.90311047695229,
#       w3=70.49505912223611, w4=78.51473969408502, w5=54.899935258593764,
#       w6=47.668225850046696+40, w8=1.1086678532452932 (no w7 entry)
#   tuned: w1=30.31334691, w2=24.89993526, w3=99.90311048, w4=78.51473969,
#       w5=14.28880107, w6=70.49505912, w7=101.7196348, w8=87.66822585,
#       w9=1.108667853
#   v3: w1=30.31334691, w2=24.89993526, w3=87.66822585, w4=99.90311048,
#       w5=14.28880107, w6=70.49505912, w7=78.51473969, w8=101.7196348,
#       w9=1.108667853
params = {
  'w0': 6.058430980648939,
  'w1': 60.31334691,
  'w2': 44.89993526,
  'w3': 87.66822585,
  'w4': 99.90311048,
  'w5': 34.28880107,
  'w6': 0,
  'w7': 78.51473969,
  'w8': 101.7196348,
  'w9': 1.108667853
}

w0 = params['w0']
w1 = params['w1']
w2 = params['w2']
w3 = params['w3']
w4 = params['w4']
w5 = params['w5']
w6 = params['w6']
w7 = params['w7']
w8 = params['w8']
w9 = params['w9']

start = timeit.default_timer()
print('Start submission generation')

# Weight of each 0-based rank; ranks >= npt are ignored by the zip below.
place_weights = [w0 / (i + w0) for i in range(npt)]

sub_weight = [w1, w2, w3, w4, w5, w6, w7, w8, w9]
if len(sub) > len(sub_weight):
    raise ValueError('%d models loaded but only %d weights'
                     % (len(sub), len(sub_weight)))
w_sum = sum(sub_weight)

print(f'W0: {w0}; Weight Percentage: {[i/w_sum*100 for i in sub_weight]}')

result_final = {'Class': [], 'Segments': []}
for p in range(len(sub[0])):
    target_weight = {}
    # Each model adds place weight * model weight for every segment it ranks.
    for ranking, weight in zip(sub, sub_weight):
        entry = ranking[p]
        for trgt, place_weight in zip(entry[Htarget].split(' '), place_weights):
            target_weight[trgt] = target_weight.get(trgt, 0) + place_weight * weight
    tops_trgt = sorted(target_weight, key=target_weight.get, reverse=True)[:npt]
    result_final['Class'].append(entry[Hlabel])
    result_final['Segments'].append(" ".join(tops_trgt))

result_df_final = pd.DataFrame.from_dict(result_final)
# Class ids come out of csv.DictReader as strings.
result_df_final['Class'] = result_df_final['Class'].astype(int)
print(result_df_final.shape)
print(result_df_final.head())

print(f'Finished submission generation! Spent {timeit.default_timer() - start}s')

Start submission generation
W0: 6.058430980648939; Weight Percentage: [11.862980731005676, 8.83132994770796, 17.24338852446555, 19.649857540823565, 6.744235019649597, 0.0, 15.44299714343935, 20.007148159062996, 0.2180629338453079]
(1000, 2)
   Class                                           Segments
0    100  AmKH:155 iAON:285 ZvNy:140 ari1:160 yeEH:150 7...
1   1000  pyZt:220 pyZt:180 pyZt:175 pyZt:170 pyZt:210 p...
2   1001  Odh2:140 TLOW:150 3y2q:50 eCpP:0 1bhQ:45 1bhQ:...
3   1002  rtuW:125 rtuW:130 rtuW:20 rtuW:120 rtuW:140 rt...
4   1005  lSZ6:5 37Nx:140 37Nx:135 lSZ6:0 uDuW:105 lSZ6:...
Finished submission generation! Spent 958.7436024000053s


In [73]:
# Persist the blended ranking. NOTE(review): machine-specific absolute path;
# the target folder has to exist already.
result_df_final.to_csv('G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend/blend1_8_v4.csv', index=False)

### Local and Cloud merge

In [29]:
# The two files to re-sort in the next cell. NOTE(review): presumably one
# locally trained and one cloud-trained model, going by the section title -
# confirm. Paths are machine-specific.
sub_files = [
    'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend\\MixNeXtVladModel_finetune_iter300_new.csv',
    'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend\\mixnextvlad_iter60_shift2.csv'
]

In [4]:
SUBDIR_SINGLE_MODEL = 'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend/'

# Same re-ordering as prep_sort_files, except that these inputs already carry
# a header row, so they are read without `names=`.
# The template never changes, so it is read once instead of once per file.
sample_template = pd.read_csv('../../../submissions/sample_submission.csv')

for file in sub_files:
    print(file)
    # Fresh copy per file: rows are overwritten below.
    sample_sub = sample_template.copy().set_index('Class')

    pred = pd.read_csv(file)
    print(pred.shape)
    print(pred.head())
    pred = pred.set_index('Class')
    for cls in sample_sub.index:
        if cls in pred.index:
            sample_sub.loc[cls, 'Segments'] = pred.loc[cls, 'Segments']

    # Accept both separators so the base name is found on any OS.
    stem = file.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    # The 'sorted' sub-folder must already exist.
    sample_sub.reset_index().to_csv(SUBDIR_SINGLE_MODEL+'sorted/'+stem+'_sorted.csv', index=False)

G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend\MixNeXtVladModel_finetune_iter300_new.csv
(1000, 2)
   Class                                           Segments
0      3  d5yW:280 LEa7:135 4LI3:15 d5yW:80 LEa7:75 d5yW...
1      7  xot7:80 pnAP:155 pnAP:150 sgFt:275 HhUk:170 LL...
2      8  GLc8:15 tN2Z:35 KG7g:255 Tefx:165 SDNb:165 iOe...
3     11  RBoT:110 zdov:165 RBoT:120 zdov:95 RBoT:100 RB...
4     12  Qr1y:175 VzcL:45 yr73:140 0leM:245 JJ9a:155 0l...
G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/blend\mixnextvlad_iter60_shift2.csv
(1000, 2)
   Class                                           Segments
0      3  LEa7:55 2tpp:20 d5yW:280 d5yW:75 F4ZL:185 mUYZ...
1      7  i74x:5 i74x:0 rQKZ:60 xUFD:125 uf3s:150 NCE6:1...
2      8  tN2Z:70 icIs:165 tN2Z:255 a5cE:155 tN2Z:150 Cr...
3     11  hdTT:100 gF7h:115 0Ycu:100 nutU:270 hdTT:95 rZ...
4     12  L7U7:30 uX9U:35 FXxf:90 yr73:230 yr73:280 L7

In [10]:
# Inputs of the final two-way merge, in the order that `sub_weight` is applied
# to them further down. Paths are machine-specific.
sub_files = [
    'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/paper/back\\MixNeXtVladModel_finetune_iter300_new.csv',
    'G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/paper/blend\\blend1-6_final.csv',
]

In [11]:
Hlabel = 'Class'       # CSV column holding the class id
Htarget = 'Segments'   # CSV column holding the space-separated ranking
npt = 100000 # number of places in target

lg = len(sub_files)
sub = []
for i, file in enumerate(sub_files):
    ## input files ##
    print("Reading {}: {}". format(i, file))
    # `with` closes the handle; rows are sorted by class label (as string) so
    # that both inputs list the classes in the same order.
    with open(file, "r", newline='') as fobj:
        sub.append(sorted(csv.DictReader(fobj), key=lambda d: str(d[Hlabel])))

Reading 0: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/paper/back\MixNeXtVladModel_finetune_iter300_new.csv
Reading 1: G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/paper/blend\blend1-6_final.csv


In [15]:
# Settings of the final two-way merge.
w0 = 3.72384689385059  # rank-decay constant: rank i weighs w0 / (i + w0)
w1_merge = 1           # weight of sub_files[0]
w2_merge = 9           # weight of sub_files[1]

In [16]:
start = timeit.default_timer()
print('Start submission generation')

# Weight of each 0-based rank; ranks >= npt are ignored by the zip below.
place_weights = [w0 / (i + w0) for i in range(npt)]

sub_weight = [w1_merge, w2_merge]
if len(sub) > len(sub_weight):
    raise ValueError('%d files loaded but only %d weights'
                     % (len(sub), len(sub_weight)))
w_sum = w1_merge + w2_merge
# Print real percentages (the label said "Percentage" but the raw weights
# were printed), consistent with the other blend cells.
print(f'W0: {w0}; Weight Percentage: {[w / w_sum * 100 for w in sub_weight]}')

merge_final = {'Class': [], 'Segments': []}
for p in range(len(sub[0])):
    target_weight = {}
    # Each input adds place weight * file weight for every segment it ranks.
    for ranking, weight in zip(sub, sub_weight):
        entry = ranking[p]
        for trgt, place_weight in zip(entry[Htarget].split(' '), place_weights):
            target_weight[trgt] = target_weight.get(trgt, 0) + place_weight * weight
    tops_trgt = sorted(target_weight, key=target_weight.get, reverse=True)[:npt]
    merge_final['Class'].append(entry[Hlabel])
    merge_final['Segments'].append(" ".join(tops_trgt))

merge_df_final = pd.DataFrame.from_dict(merge_final)
# Class ids come out of csv.DictReader as strings.
merge_df_final['Class'] = merge_df_final['Class'].astype(int)
print(merge_df_final.shape)
print(merge_df_final.head())

print(f'Finished submission generation! Spent {timeit.default_timer() - start}s')

Start submission generation
W0: 3.72384689385059; Weight Percentage: [1, 9]
(1000, 2)
   Class                                           Segments
0    100  AmKH:155 iAON:285 yeEH:275 ZvNy:140 yeEH:150 b...
1   1000  pyZt:170 pyZt:220 pyZt:180 pyZt:175 pyZt:210 p...
2   1001  1bhQ:45 1bhQ:80 1bhQ:145 TLOW:150 TLOW:235 1bh...
3   1002  rtuW:125 rtuW:130 rtuW:170 rtuW:120 rtuW:155 r...
4   1005  lSZ6:5 lSZ6:0 37Nx:140 37Nx:135 lSZ6:285 e2tL:...
Finished submission generation! Spent 209.96366780000017s


In [17]:
# Persist the merged ranking (plain string: the f-prefix had no placeholders).
# NOTE(review): machine-specific absolute path; the folder has to exist.
merge_df_final.to_csv('G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/final/sorted/sorted_single/paper/blend/blend1-7.csv', index=False)

In [52]:
# Quick look at an earlier ensemble file. NOTE(review): the displayed
# 'Unnamed: 0' column suggests the file was saved with its index - confirm.
ttt = pd.read_csv('G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/sorted/ensemble/ensemble_w1-w15.csv')
ttt.head()

Unnamed: 0,Class,Segments
0,100,ari1:160 AmKH:155 lIpW:265 ZvNy:140 iAON:285 b...
1,1000,pyZt:220 pyZt:175 pyZt:210 pyZt:180 pyZt:170 J...
2,1001,1bhQ:45 1bhQ:150 TLOW:200 TLOW:225 1bhQ:80 TLO...
3,1002,rtuW:125 rtuW:130 rtuW:155 rtuW:145 rtuW:140 r...
4,1005,37Nx:140 lSZ6:5 37Nx:135 pRH7:75 eDaV:110 lSZ6...


In [59]:
# Quick look at another earlier ensemble file (re-uses the name `ttt`).
ttt = pd.read_csv('G:/Competition/Kaggle-Youtube8M/submissions/local_CV/submission/sorted/ensemble/ensemble_w1-w8.csv')
ttt.head()

Unnamed: 0,Class,Segments
0,100,LdsT:265 ZvNy:140 bCnN:75 AmKH:155 LdsT:275 ar...
1,1000,pyZt:220 pyZt:180 pyZt:210 pyZt:170 pyZt:80 lK...
2,1001,TLOW:200 TLOW:235 1bhQ:45 eCpP:0 TLOW:215 TLOW...
3,1002,rtuW:125 XrI5:20 3F8V:50 rtuW:155 rtuW:130 rtu...
4,1005,eDaV:110 LAIT:25 37Nx:135 37Nx:140 e2tL:190 lS...
