This approach takes a VERY long time and, at first glance, gives no fundamental advantage.

In [1]:
import os
import auxiliary as aux
import numpy as np
import pandas as pd
from collections.abc import Iterable

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import parallel_backend
from joblib import Parallel, delayed

## ALS similarity

The essence of the method:
- fit ALS to get similar objects
- impute missing values with aggregate statistics (e.g. the mean) of the `N` most similar objects

In [2]:
# Keyword arguments passed straight to AlternatingLeastSquares (see Similarity).
param = dict(
    factors=25,
    regularization=0.001,
    iterations=15,
    calculate_training_loss=True,
    use_gpu=False,
    random_state=23,    # fixed seed so repeated fits are comparable
)

In [3]:
class Similarity:
    """Look up similar rows through an ALS factorisation.

    Every row of the fitted matrix is treated as an ALS "user"; neighbours are
    obtained from ``AlternatingLeastSquares.similar_users``.
    """

    def __init__(self, **params):
        """
        :param params - keyword arguments forwarded to AlternatingLeastSquares
        """
        self.__als = AlternatingLeastSquares(**params)
        # Number of fitted rows; None until fit() has been called.
        self.__size = None

    def fit(self, df):
        """ Fit the ALS model
        :param df - 2-D data with one row per object (anything csr_matrix accepts)
        :return self, so the call can be chained
        """
        matrix = csr_matrix(df)
        self.__als.fit(matrix)
        # Row count taken from the matrix itself, so plain arrays work as well as DataFrames.
        self.__size = matrix.shape[0]
        return self

    def __candidate_count(self, frac):
        """ Number of neighbours to request from ALS for a given sample fraction """
        if self.__size is None:
            raise RuntimeError('Similarity model is not fitted yet: call fit() first')
        # Always request at least one candidate so a tiny `frac` cannot produce N=0.
        return max(1, int(self.__size * frac))

    def get_own_similars(self, uid, *, frac=1.0, threshold=0.7):
        """ Ids of the objects similar to a single object
        :param uid - single object id
        :param frac - use a fraction of sample (share of fitted rows requested from ALS)
        :param threshold - only ids whose score is strictly greater are kept
        :return array of similar ids, the queried id itself excluded
        :raises RuntimeError - when called before fit()
        """
        uid_list, proba = self.__als.similar_users(uid, N=self.__candidate_count(frac))
        # Exclude the query by id, not by position: with tied scores (e.g. duplicate
        # rows) the query is not guaranteed to be returned first.
        keep = (uid_list != uid) & (proba > threshold)
        return uid_list[keep]

    def get_similars(self, uid, *, frac=1.0, threshold=0.7):
        """ This provides LOTS OF DATA (each-to-each)
        :param uid - object ids collection (a single id is accepted as well)
        :param frac - subsample ratio of the training instance
        :param threshold - only ids whose score is strictly greater are kept
        :return generator of (id, array of similar ids) pairs
        """
        _uid = uid if isinstance(uid, Iterable) else [uid, ]
        for idx in _uid:
            yield idx, self.get_own_similars(idx, frac=frac, threshold=threshold)

In [4]:
# Restrict MKL to a single thread before fitting (presumably to avoid the BLAS
# over-subscription that `implicit` warns about — TODO confirm).
os.environ['MKL_NUM_THREADS'] = '1'
# Fit the ALS similarity model on aux.subset[2] with the hyper-parameters in `param`.
sim = Similarity(**param).fit(aux.subset[2])

  0%|          | 0/15 [00:00<?, ?it/s]

In [5]:
def similar_stats(df, model, frac=1.0, threshold=0.9, aggfunc='mean'):
    """ Aggregate the similar rows of every row of `df` that contains a NaN
    :param df - full original dataset
    :param model - fitted Similarity model (anything exposing `get_similars`)
    :param frac - subsample ratio of the training instance
    :param threshold - minimal accepted score
    :param aggfunc - aggregation passed to `DataFrame.agg`
    :return DataFrame with one row per NaN-containing row of `df`

    NOTE the ids returned by the model are used as index *labels* of `df`
    (`df.loc`), so `df` is assumed to share the row ids the model was fitted on.
    IMPORTANT! frac=1.0 will take a very long time on large datasets
    """
    has_nan = df.isna().any(axis=1)
    target_ids = df.loc[has_nan].index
    neighbours = model.get_similars(target_ids, frac=frac, threshold=threshold)
    print(f'fraction size: {int(frac * target_ids.size)}')

    progress = tqdm(neighbours, total=target_ids.size)
    # One aggregated Series per incomplete row, keyed by that row's id.
    per_row = {row_id: df.loc[ids, :].agg(aggfunc) for row_id, ids in progress}
    # Keys become columns on construction; transpose so each row id is a row again.
    return pd.DataFrame(per_row).T

In [42]:
frac = 1e-4     # kaggle 1.42: 63 similars is very low
# Aggregate neighbour statistics for every NaN-containing row of aux.data using
# the fitted `sim` model, then save them under a file name that encodes `frac`.
stats = similar_stats(aux.data, sim, frac=frac)
stats.to_csv(f'data/similar_stats_frac_{frac}.csv')
# NOTE: the saved statistics still contain NaN

In [9]:
# Same computation with a larger neighbour fraction. The recorded run of this
# cell processed 635226 rows in about 11 hours, hence the result is saved to CSV.
frac = 0.05
stats = similar_stats(aux.data, sim, frac=frac)
stats.to_csv(f'data/similar_stats_frac_{frac}.csv')
# NOTE: the saved statistics still contain NaN

fraction size: 31761


100%|██████████| 635226/635226 [10:56:50<00:00, 16.12it/s]  


In [14]:
frac = 0.05
# Reload the statistics saved for this `frac`; 'Unnamed: 0' is the header pandas
# gives to the unnamed index column written by to_csv.
stats = pd.read_csv(f'data/similar_stats_frac_{frac}.csv', index_col='Unnamed: 0')
# First fill the gaps from the neighbour statistics (matched on index and columns) ...
predicted = aux.data.fillna(stats)
# ... then fall back to the column means for whatever is still missing.
predicted = predicted.fillna(predicted.mean())

## cosine

In [2]:
# # MANUAL
# from sklearn.metrics.pairwise import cosine_similarity
# chunksize = 100
# threshold = None
# k = 50
# df = aux.data.iloc[:1000]

# use_cols = df.columns[~df.isna().any()]     # select columns without NaN
# nan_rows = df.isna().any(axis=1)

# stats = []
# df_size = df[nan_rows].index.size
# chunk_count = df_size // chunksize + (1 if df_size % chunksize else 0)
# # iterate through chunks
# for start_idx in tqdm(range(0, df_size, chunksize), total=chunk_count):
#     end_idx = start_idx + chunksize
#     cosine = cosine_similarity(df.loc[nan_rows, use_cols].iloc[start_idx:end_idx], df[use_cols])

#     if threshold is not None:       # collect by threshold
#         mask = cosine > threshold       # apply threshold
#         np.fill_diagonal(mask, False)   # exclude each row's own entry
#         # stats.extend(list(map(lambda m: df[m].mean().to_list(), mask)))
#         stats.extend(np.apply_along_axis(lambda m: df[m].mean(), 1, mask))
#     elif k is not None:     # collect by k nearest
#         # stats.extend(list(map(lambda c: df.iloc[np.argsort(c)[::-1][1:]].head(k).mean(), cosine)))
#         stats.extend(np.apply_along_axis(lambda c: df.iloc[np.argsort(c)[::-1][1:]].head(k).mean(), 1, cosine))
# pd.DataFrame(stats, index=df[nan_rows].index)

In [4]:
%%time
# Cosine-similarity statistics on the first 50 000 rows of aux.data.
csim = aux.CosineSimilarity(aux.data.iloc[:50000])
# stats = csim.calculate(threshold=0.7, backend='threading')
# NOTE(review): k, backend and chunksize are interpreted by aux.CosineSimilarity,
# which is not shown here; backend=None presumably means "no parallel backend" — confirm.
stats = csim.calculate(k=50000, backend=None, chunksize=10)

In [1]:
%%time
# Same 50 000-row run as the backend=None cell, but with backend='loky'
# (presumably joblib's process-based backend — confirm in aux.CosineSimilarity).
csim = aux.CosineSimilarity(aux.data.iloc[:50000])
# stats = csim.calculate(threshold=0.7, backend='threading')
stats = csim.calculate(k=50000, backend='loky', chunksize=10)

In [2]:
%%time
# Full-dataset run. The recorded output shows 25410 chunks at ~1.76 s each
# (about 12 h estimated) and stops at 1%.
csim = aux.CosineSimilarity(aux.data)
# stats = csim.calculate(threshold=0.7, backend='threading')
stats = csim.calculate(k=50000, backend='loky', chunksize=10)

Chunks: 25410


  1%|          | 176/25410 [31:39<12:18:39,  1.76s/it] 

In [7]:
# import gc
# gc.collect()

0

In [None]:
#