In [1]:
import sys
sys.path.insert(0, r'../..')

In [8]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import imageio
import pickle
import os
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict, Counter

In [3]:
from lib.video import *
from lib.cross_val import *
from lib.utils import convert_time

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut, train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [5]:
MODEL_DIR = '../../output/features/image_32_16/'
Y_DATA = '../../data/train/events_data.csv'
TRAIN_FILES = np.array(['641579_3.mp4','643734_5.mp4','633012_5.mp4','631638_5.mp4','631646_5.mp4','631750_5.mp4'])
EVENTS_TYPE = ['удар по воротам', 'угловой', 'замена', 'желтая карточка', 'гол']
MIN_COMBO = 4

In [7]:
y_data = pd.read_csv(Y_DATA)
y_data['event_time'] = y_data['event_time'].apply(convert_time)

In [52]:
def gen_segments(X, y=None, win_len=10):
    sx, sy = [], []
    nX, ny = [], []
    for i in range(len(X)):
        sx.append(X[i])
        if y is not None:
            sy.append(y[i])
        if i % win_len == win_len - 1:
            nX.append(sx)
            if y is not None:
                ny.append(Counter(sy).most_common(1)[0][0])
            sx, sy = [], []
    if y is None:
        return nX
    return nX, ny

In [53]:
def sampling(Xv, yv, k=1.0, max_size=13000):
    nX, ny = [], []
    bad_pairs = []
    for X, y in zip(Xv, yv):
        if y in EVENTS_TYPE:
            nX.append(X)
            ny.append(y)
        else:
            bad_pairs.append((X, y))
    bad_count = len(bad_pairs)
    for i in np.random.choice(bad_count, size=int(len(nX) * k)):
        X, y = bad_pairs[i]
        nX.append(X)
        ny.append(y)
    nX, ny = np.array(nX), np.array(ny)
    if max_size is not None and len(nX) > max_size:
        ind = np.random.choice(len(nX), size=max_size)
        nX, ny = nX[ind], ny[ind]
    return list(nX), list(ny)

In [54]:
def replace_none(y):
    return ['NONE' if v is None else v for v in y]

In [55]:
all_results = []

In [60]:
def load_Xy(files, is_sampling=False):
    X, y = [], []
    for file in files:
        path = MODEL_DIR + file + '.pickle'
        with open(path, 'rb') as f:
            Xv, yv, X_time = pickle.load(f)
        yv = replace_none(yv)
        Xs, ys = gen_segments(Xv, yv)
        if is_sampling:
            Xs, ys = sampling(Xs, ys)
        X += Xs
        y += ys
    print(len(X))
    X = np.array(X).reshape((len(X), -1, 32*16*3))
    y = np.array(y)
    return X, y

def load_X(files):
    X, X_file_names, X_time = [], [], []
    for file in files:
        path = MODEL_DIR + file + '.pickle'
        with open(path, 'rb') as f:
            Xv, yv, X_time = pickle.load(f)
        Xs = gen_segments(Xv)
        X += Xs
        X_time += X_time
        X_file_names += [file] * len(Xv)
    X = np.array(X).reshape((len(X), -1))
    return X, X_file_names, X_time

class MetaModel():
    def __init__(self, model):
        self.model = model
        self.pred_buf = {}
        self.fit_set = set()
    
    def fit(self, files):
        files = frozenset(files)
        if files in self.fit_set:
            return
        self.fit_set.add(files)
        X, y = load_Xy(files)
        print('Fit with', len(X), 'samples')
        self.model.fit(X, y)
        
    def score(self, files):
        X, y = load_Xy(files, False)
        print('Score with', len(X), 'samples')
        y_pred = self.model.predict(X)
        return f1_score(y, y_pred, average='micro')
        return pd.DataFrame({
            'true': y,
            'pred': y_pred
        })
        
        # Разбить по отрезкам фиксированной длины. Сделать мета модель по отрезкам. Одновременно тоже самое для звуков. (как дрозовфилы)
        # Поставить нормальную метрику, без извращений с матчингом
    def predict(self, files):
        files = frozenset(files)
        X, X_file_names, X_time = load_X(files)
        print('Predict for', len(X), 'samples')
        if files in self.pred_buf:
            print('From pred_buf')
            y_pred = self.pred_buf[files]
        else:
            y_pred = self.model.predict(X)
            self.pred_buf[files] = y_pred
        result = []
        combo = 0
        combo_type = None
        last_event = defaultdict(lambda: -10000)
        for i in range(len(X)):
            if 0 <= X_time[i] <= 6200:
                continue
            if y_pred[i] in EVENTS_TYPE:
                if combo_type == y_pred[i]:
                    combo += 1
                else:
                    combo = 1
                combo_type = y_pred[i]
            else:
                combo = 0
            if combo == MIN_COMBO and last_event[combo_type] < X_time[i] - 59:
                result.append({
                    'file_name': X_file_names[i],
                    'event_type': combo_type,
                    'event_time': X_time[i]
                })
                last_event[combo_type] = X_time[i]
        result = pd.DataFrame(result)
        all_results.append(result)
        return result

In [61]:
def only_one_fold(cv):
    def fun(X):
        return [next(cv.split(X))]
    class s: pass
    obj = s
    s.split = fun
    return obj

In [62]:
ridge = MetaModel(RidgeClassifier(alpha=10, normalize=True))

In [63]:
ridge.fit(TRAIN_FILES[:2])
ans = ridge.score(TRAIN_FILES[-2:])
ans

1575
Fit with 1575 samples


ValueError: Found array with dim 3. Estimator expected <= 2.

In [36]:
print(cross_val_score(ridge, y_data, TRAIN_FILES, LeaveOneOut()).mean())

A Jupyter Widget

['удар по воротам' 'удар по воротам' 'удар по воротам' ..., 'NONE' 'NONE'
 'NONE']
Fit with 58104 samples
Predict for 7047 samples
task_score: true_positives=12 false_positives=22 false_negatives=24
cross_val_score: score=0.20689655172413793 test_files=['641579_3.mp4']
['удар по воротам' 'удар по воротам' 'удар по воротам' ..., 'NONE' 'NONE'
 'NONE']
Fit with 58530 samples
Predict for 8714 samples
task_score: true_positives=20 false_positives=45 false_negatives=21
cross_val_score: score=0.23255813953488372 test_files=['643734_5.mp4']
['удар по воротам' 'удар по воротам' 'удар по воротам' ..., 'NONE' 'NONE'
 'NONE']
Fit with 58104 samples
Predict for 8618 samples
task_score: true_positives=26 false_positives=63 false_negatives=27
cross_val_score: score=0.22413793103448276 test_files=['633012_5.mp4']
['удар по воротам' 'удар по воротам' 'удар по воротам' ..., 'NONE' 'NONE'
 'NONE']
Fit with 58104 samples
Predict for 11891 samples
task_score: true_positives=8 false_positives=60 false_nega

In [19]:
all_results[-1]

Unnamed: 0,event_time,event_type,file_name
0,-297,удар по воротам,631750_5.mp4
1,13,удар по воротам,631750_5.mp4
2,77,удар по воротам,631750_5.mp4
3,141,удар по воротам,631750_5.mp4
4,201,удар по воротам,631750_5.mp4
5,261,удар по воротам,631750_5.mp4
6,325,удар по воротам,631750_5.mp4
7,419,удар по воротам,631750_5.mp4
8,481,удар по воротам,631750_5.mp4
9,550,удар по воротам,631750_5.mp4
