This approach takes a VERY long time and, at first glance, offers no fundamental advantage.

In [1]:
import os
import auxiliary as aux
import numpy as np
import pandas as pd
from collections.abc import Iterable

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error

from copy import deepcopy
from tqdm import tqdm

In [2]:
# Preview the first rows of the dataset exposed by the `auxiliary` helper module.
aux.data.head()

Unnamed: 0_level_0,F_1_0,F_1_1,F_1_2,F_1_3,F_1_4,F_1_5,F_1_6,F_1_7,F_1_8,F_1_9,...,F_4_5,F_4_6,F_4_7,F_4_8,F_4_9,F_4_10,F_4_11,F_4_12,F_4_13,F_4_14
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.354591,-0.464038,2.304115,0.734486,1.696395,0.136285,-0.518344,0.50264,-1.852504,-0.500665,...,3.744152,0.794438,0.265185,-0.561809,0.19648,0.373434,6.206995,3.809505,1.236486,1.182055
1,1.38094,-0.499626,-0.418548,1.911725,-0.82613,-1.715371,-0.577091,-1.041486,0.596067,-0.363425,...,-2.895826,-0.738275,2.361818,-0.060753,0.727249,-0.271882,5.232157,-4.218259,-2.724883,-0.063775
2,0.256023,-1.059874,,0.345678,1.513814,1.243864,-0.509648,-0.800481,-0.115945,0.595777,...,2.252834,0.472496,2.491386,0.353381,-0.260682,-0.000833,-0.116457,-2.131747,3.661499,-0.131576
3,-0.72842,-2.432399,-2.453602,-0.020509,0.333397,0.086049,-1.787601,0.667011,0.761564,-2.217847,...,2.0046,-4.664806,-0.847211,-0.264249,0.664334,-0.557868,8.499483,-4.738799,-3.054611,0.494152
4,0.590212,-0.066127,0.468009,-1.096038,0.119399,-1.80971,0.466358,-0.053196,-0.58032,-1.1435,...,0.976937,2.558883,3.377724,0.846891,0.696032,0.554121,-5.979714,-2.869631,3.733057,-0.722943


## ALS similarity

The essence of the method:
- fit ALS to find similar objects (rows)
- impute each missing value with a statistic (the mean by default) computed over up to `N` similar objects

In [2]:
# Keyword arguments handed to AlternatingLeastSquares through Similarity(**param).
param = dict(
    factors=25,
    regularization=0.001,
    iterations=15,
    calculate_training_loss=True,
    use_gpu=False,
    random_state=23,    # fixed seed
)

In [3]:
class Similarity:
    """ Look up similar rows of a data frame with an implicit ALS model

    The frame given to `fit` is converted to a sparse matrix and handed to
    AlternatingLeastSquares; afterwards ids are queried through the model's
    `similar_users`.
    """

    def __init__(self, **params):
        """
        :param params - keyword arguments forwarded unchanged to AlternatingLeastSquares
        """
        self.__als = AlternatingLeastSquares(**params)
        # number of rows seen by fit(); stays None until the model is fitted
        self.__size = None

    def fit(self, df):
        """ Fit the ALS model
        :param df - data frame, converted with csr_matrix before fitting
                    (presumably one row per object - confirm against the caller)
        :return self, so the call can be chained
        """
        self.__als.fit(csr_matrix(df))
        self.__size = df.index.size
        return self

    def get_own_similars(self, uid, *, frac=1.0, threshold=0.7):
        """ Ids of the objects similar to a single object
        :param uid - single object id
        :param frac - fraction of the fitted sample size requested from the model
        :param threshold - only results scoring strictly above this value are kept
        :return array of similar ids
        :raises RuntimeError - when called before fit()
        """
        if self.__size is None:
            raise RuntimeError('Similarity is not fitted yet: call fit() first')
        uid_list, proba = self.__als.similar_users(uid, N=int(self.__size * frac))
        # the first hit is skipped - presumably it is the queried object itself
        return uid_list[1:][proba[1:] > threshold]

    def get_similars(self, uid, *, frac=1.0, threshold=0.7):
        """ This provides LOTS OF DATA (each-to-each)
        :param uid - object ids collection (a single id is accepted too)
        :param frac - subsample ratio of the training instance
        :param threshold - only results scoring strictly above this value are kept
        :return generator of (id, array of similar ids) pairs, evaluated lazily
        """
        # str/bytes are Iterable as well, but they denote ONE id, not a collection
        single = isinstance(uid, (str, bytes)) or not isinstance(uid, Iterable)
        for idx in ([uid] if single else uid):
            yield idx, self.get_own_similars(idx, frac=frac, threshold=threshold)

In [4]:
# NOTE(review): MKL_NUM_THREADS is set here, after numpy/implicit were already
# imported in the first cell - confirm the setting still takes effect at this point.
os.environ['MKL_NUM_THREADS'] = '1'
# Fit the ALS-based similarity model on aux.subset[2] (its contents are defined in `auxiliary`).
sim = Similarity(**param).fit(aux.subset[2])

  0%|          | 0/15 [00:00<?, ?it/s]

In [5]:
def similar_stats(df, model, frac=1.0, threshold=0.9, aggfunc='mean'):
    """ Aggregate, for every row of `df` holding a NaN, the rows reported as similar to it
    :param df - full original dataset
    :param model - object exposing get_similars(ids, frac=..., threshold=...)
    :param frac - subsample ratio, passed through to the model
    :param threshold - minimal accepted score, passed through to the model
    :param aggfunc - aggregation applied to the similar rows via DataFrame.agg
    :return DataFrame with one row per NaN-containing row of `df`

    IMPORTANT! frac=1.0 will take a very long time on large datasets
    """
    has_nan = df.isna().any(axis=1)
    targets = df.index[has_nan.to_numpy()]
    pairs = model.get_similars(targets, frac=frac, threshold=threshold)
    print(f'fraction size: {int(frac * targets.size)}')

    # one aggregated Series per target row, keyed by the row id
    collected = {
        row_id: df.loc[neighbours, :].agg(aggfunc)
        for row_id, neighbours in tqdm(pairs, total=targets.size)
    }
    return pd.DataFrame(collected).T

In [42]:
frac = 1e-4     # kaggle 1.42: 63 similars is very low
# Create the output directory BEFORE the long computation, so the final write
# cannot fail on a missing folder once all the work is done.
os.makedirs('data', exist_ok=True)
stats = similar_stats(aux.data, sim, frac=frac)
stats.to_csv(f'data/similar_stats_frac_{frac}.csv')
# NOTE it still contains NaN

In [9]:
frac = 0.05
# Create the output directory BEFORE the long computation (the recorded run of
# this cell took ~11 hours), so the final write cannot fail on a missing folder.
os.makedirs('data', exist_ok=True)
stats = similar_stats(aux.data, sim, frac=frac)
stats.to_csv(f'data/similar_stats_frac_{frac}.csv')
# NOTE it still contains NaN

fraction size: 31761


100%|██████████| 635226/635226 [10:56:50<00:00, 16.12it/s]  


In [14]:
frac = 0.05
# Reload the statistics saved for this fraction.
stats = pd.read_csv(f'data/similar_stats_frac_{frac}.csv', index_col='Unnamed: 0')
# Fill the gaps from the similar-row statistics first, then fall back to the
# column means for whatever is still missing.
predicted = aux.data.fillna(stats).pipe(lambda filled: filled.fillna(filled.mean()))

## cosine similarity

In [73]:
# # For each row containing NaN, collect statistics over the rows whose cosine similarity to it exceeds the given threshold.
# # NOTE: this takes ~35 hours
# from sklearn.metrics.pairwise import cosine_similarity

# threshold = 0.9

# nan_rows = aux.data.isna().any(axis=1)
# index = aux.data[nan_rows].index
# stats = {}

# for row_idx in tqdm(aux.data[nan_rows].index):
#     exclude_own = aux.subset[2].index != row_idx
#     target = aux.subset[2].loc[exclude_own, :].copy()
#     # calculate cosine similarity between this and all other items
#     sim = cosine_similarity([aux.subset[2].loc[row_idx, :]], target)
#     target['cosine'] = sim[0]
#     # apply threshold and sort
#     mask = target['cosine'] > threshold
#     similars = target.loc[mask, 'cosine'].sort_values(ascending=False).index
#     # get statistics
#     stats[row_idx] = aux.data.loc[similars, :].mean()

# stats = pd.DataFrame(stats).T
# stats.to_csv('data/similars.csv')

In [2]:
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.utils import parallel_backend
# from joblib import Parallel, delayed

# threshold = 0.9
# nan_rows = aux.data.isna().any(axis=1)
# index = aux.data[nan_rows].index


# def calc_stats(idx, base, target, *, threshold=0.9):
#     """
#     :param idx - index of current item
#     :param base - dataset for cosine similarity calculation
#     :param target - dataset for statistics calculation
#     :threshold - minimal accepted cosine value
#     """
#     # calculate cosine similarity between this and all other items
#     cosine = cosine_similarity([base.loc[idx, :]], base)[0]
#     sim = pd.Series(cosine, index=base.index, name='cosine')
#     # apply threshold and sort
#     mask = (sim > threshold) & (sim.index != idx)
#     similars = sim[mask].sort_values(ascending=False).index

#     # get statistics
#     return idx, target.loc[similars, :].mean()

# def impute_cosine(base, target):
#     with parallel_backend('threading'):
#         result = dict(Parallel(n_jobs=-1)(delayed(calc_stats)(idx, base, target) for idx in tqdm(index[:100])))
#     return result

In [38]:
# stats = pd.DataFrame(impute_cosine(aux.subset[2], aux.subset[1])).T
# stats

## cosine with columns autodetection

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import parallel_backend
from joblib import Parallel, delayed


def calc_stats(idx, df, threshold=0.9, cols=None):
    """ Column means over the rows that are cosine-similar to one row
    :param idx - index of current item
    :param df - original dataset
    :param threshold - minimal accepted cosine value (strictly greater)
    :param cols - columns used for the similarity; when None, the columns of
                  `df` without any NaN are detected here (a full scan of `df`)
    :return (idx, Series with the mean of every column over the similar rows)
    """
    if cols is None:
        # get columns without NaN
        cols = df.columns[~df.isna().any()]

    # calculate cosine similarity between this and all other items
    cosine = cosine_similarity([df.loc[idx, cols]], df[cols])[0]
    sim = pd.Series(cosine, index=df.index, name='cosine')
    # apply threshold, leave the item itself out, and sort by decreasing similarity
    mask = (sim > threshold) & (sim.index != idx)
    similars = sim[mask].sort_values(ascending=False).index
    # get statistics
    return idx, df.loc[similars, :].mean()

def impute_cosine(df, threshold=0.9, backend='threading', n_jobs=-1):
    """ Run calc_stats for every row of `df` that contains a NaN
    :param df - original dataset
    :param threshold - minimal accepted cosine value, passed to calc_stats
    :param backend - joblib backend name given to parallel_backend
    :param n_jobs - number of joblib workers (-1 by default, as before)
    :return dict {row index: Series of column means over its similar rows}
    """
    missing = df.isna()
    index = df.index[missing.any(axis=1).to_numpy()]
    # the NaN-free columns are the same for every row: detect them once here
    # instead of rescanning the whole frame inside each calc_stats call
    cols = df.columns[~missing.any()]

    with parallel_backend(backend):
        result = dict(Parallel(n_jobs=n_jobs)(
            delayed(calc_stats)(idx, df, threshold, cols) for idx in tqdm(index)
        ))
    return result

In [4]:
# Run the cosine-based statistics for every NaN-containing row of aux.data with
# the 'loky' backend and a 0.7 threshold (looser than the 0.9 default); the
# resulting dict is turned into a frame with one row per processed item.
stats = pd.DataFrame(impute_cosine(aux.data, threshold=0.7, backend='loky')).T
stats

## tests

In [18]:
# arr = aux.subset[2].to_numpy().copy()
# for idx in tqdm(aux.subset[2].index[:18]):
#     cosine_similarity([aux.subset[2].iloc[0]], arr)[0]

## submission

In [None]:
# Hand the imputed frame to the `auxiliary` helper that saves the submission.
# NOTE(review): `predicted` is built in the ALS section from the saved CSV; the
# cosine `stats` computed later are not merged into it - confirm this is intended.
aux.save_submission(predicted)

In [None]:
#