In [12]:
import pandas as pd
from pandas import Series, DataFrame
import networkx as nx
import numpy as np
def get_y_true(df):
    """
    Build the ground-truth match list for every row.

    For each row of ``df`` the result holds the positional indices (0-based,
    in ``df`` row order) of all rows sharing its ``label_group`` value, the
    row itself included.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``label_group`` column.

    Returns
    -------
    list of list
        One list of row positions per row of ``df``.
    """
    labels = df['label_group']
    row_positions = Series(np.arange(len(df)))

    # label value -> list of row positions carrying that label
    members_by_label = row_positions.groupby(labels.values).agg(list)

    return labels.map(members_by_label).tolist()

def get_validation_folds(df, nfolds=5, random_state=42):
    """
    Assign every row of ``df`` to a validation fold without leaking groups.

    Postings are linked in a graph whenever they share a value in any of the
    columns ``label_group``, ``title``, ``image_phash`` or ``image``. Each
    connected component of that graph is kept whole inside a single fold.

    Parameters
    ----------
    df : DataFrame
        Must contain ``posting_id`` plus the four key columns listed above.
    nfolds : int
        Number of folds to create.
    random_state : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect.

    Returns
    -------
    numpy.ndarray
        int32 array with one fold id (``0 .. nfolds - 1``) per row of ``df``.
        Components, not rows, are split evenly across folds.
    """
    np.random.seed(random_state)

    # Chain together all postings that share a value in any key column.
    graph = nx.Graph()
    for key in ['label_group', 'title', 'image_phash', 'image']:
        for members in df.groupby(key)['posting_id'].agg(list).tolist():
            nx.add_path(graph, members)

    # Name each connected component after its smallest posting id.
    component_of = {}
    for component in nx.connected_components(graph):
        component_of.update(dict.fromkeys(component, min(component)))

    group_idx = df['posting_id'].map(component_of).values

    # Shuffle the distinct components, then cut them into nfolds chunks.
    shuffled_groups = np.unique(group_idx)
    np.random.shuffle(shuffled_groups)

    folds = np.zeros(df.shape[0], dtype=np.int32)
    for fold_no, fold_groups in enumerate(np.array_split(shuffled_groups, nfolds)):
        folds[np.isin(group_idx, fold_groups)] = fold_no

    return folds


# Load the training table; `data` is reused by the cells below.
data = pd.read_csv('../input/train_min.csv')
# Target device string. NOTE(review): not referenced in the visible cells — confirm it is used later.
device = 'cuda:0'
# Per-row lists of the row positions that share the row's label_group.
y_true = get_y_true(data)
# One fold id per row: 5 folds, seed 42.
folds = get_validation_folds(data, 5, 42)

In [13]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import torch,gc
def get_dist_features(D):
    """
    Derive per-row density features from a neighbour-score matrix.

    Two families of features are produced for every row of ``D``:

    * the mean of columns ``1 .. k-1`` for k in (2, 3, 5, 10, 20, 50);
      column 0 is skipped (presumably the self-match — verify against caller);
    * the number of entries in the row that are ``>=`` each of the thresholds
      0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 0.99.

    Parameters
    ----------
    D : numpy.ndarray
        2-D score matrix, one row per item.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(D.shape[0], 14)``: six means followed by
        eight counts.
    """
    neighbour_means = [D[:, 1:k].mean(axis=1) for k in (2, 3, 5, 10, 20, 50)]
    threshold_counts = [
        (D >= level).sum(axis=1)
        for level in (.5, .6, .7, .8, .9, .95, .97, .99)
    ]

    return np.stack(neighbour_means + threshold_counts, axis=1).astype(np.float32)
def csr_to_torch_sparse(csr_mat):
    """
    Transform a SciPy sparse matrix into a float32 torch sparse COO tensor.

    Parameters
    ----------
    csr_mat : scipy sparse matrix
        Any matrix offering ``astype`` and ``tocoo`` (e.g. ``csr_matrix``).

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype float32 with the same shape and stored
        entries as ``csr_mat``. The tensor is created on the CPU; callers
        move it to another device themselves.
    """
    coo_mat = csr_mat.astype(np.float32).tocoo()

    # Stack row/col coordinates into the (2, nnz) int64 index layout.
    indices = torch.from_numpy(
        np.vstack((coo_mat.row, coo_mat.col)).astype(np.int64)
    )
    values = torch.from_numpy(coo_mat.data)
    shape = tuple(int(dim) for dim in coo_mat.shape)

    # torch.sparse.FloatTensor(indices, values, size) is a deprecated legacy
    # constructor; torch.sparse_coo_tensor is its supported replacement.
    return torch.sparse_coo_tensor(indices, values, size=shape)
def get_di_torch(embed, n_candidates=50, batch_size=1000, device=None):
    """
    Calc distances/indices matrices from embeddings.

    For every row of ``embed`` the dot product against all rows is computed
    in batches on ``device`` and the ``n_candidates`` largest scores are kept,
    best first. Despite the "distance" naming, larger values rank higher.

    Parameters
    ----------
    embed : numpy.ndarray or scipy sparse matrix
        2-D embedding matrix, one row per item. A ``numpy.ndarray`` takes the
        dense path; anything else is converted with ``csr_to_torch_sparse``
        and must support row slicing plus ``toarray``.
    n_candidates : int
        Number of top-scoring neighbours to keep per row.
    batch_size : int
        Number of query rows scored per step.
    device : str or torch.device, optional
        Where to run the computation. ``None`` (default) picks CUDA when it
        is available and falls back to the CPU otherwise.

    Returns
    -------
    D : numpy.ndarray
        float32, shape ``(n_rows, n_candidates)``: scores in descending order.
    I : numpy.ndarray
        int32, same shape: row indices matching ``D``.
        When ``n_candidates`` exceeds the number of rows, the surplus columns
        of both arrays stay zero. The order among exactly tied scores is
        unspecified.
    """
    n_rows = embed.shape[0]
    D = np.zeros((n_rows, n_candidates), dtype=np.float32)
    I = np.zeros((n_rows, n_candidates), dtype=np.int32)

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    flg_dense = isinstance(embed, np.ndarray)

    if flg_dense:
        embed_dev = torch.from_numpy(embed).to(device)
    else:
        embed_dev = csr_to_torch_sparse(embed).to(device)

    # top-k cannot return more neighbours than there are rows to pick from.
    k = min(n_candidates, n_rows)

    # Pre-bind the loop temporaries so the cleanup below is valid even when
    # embed has no rows and the loop body never runs.
    d = top = embed_batch = None

    for i in range(0, n_rows, batch_size):

        if flg_dense:
            embed_batch = embed_dev[i: i + batch_size]
            d = torch.matmul(embed_dev, embed_batch.T).T
        else:
            # Sparse @ dense: densify only the current batch of query rows.
            embed_batch = torch.from_numpy(embed[i: i + batch_size].toarray().T).to(device)
            d = torch.matmul(embed_dev, embed_batch).T

        # topk yields sorted values and indices together, avoiding a full
        # argsort of every row followed by a gather.
        top = torch.topk(d, k, dim=1)
        I[i: i + batch_size, :k] = top.indices.cpu().numpy()
        D[i: i + batch_size, :k] = top.values.cpu().numpy()

    # Drop device references before asking the CUDA allocator to release cache.
    del d, top, embed_dev, embed_batch
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return D, I

def cutoff_prediction(D, I, cutoff, exact_add=2):
    """
    Select, per row, the candidate indices that survive the score cutoff.

    The candidate in column ``j`` of a row is kept when its score in ``D`` is
    strictly greater than ``cutoff``, or when ``j < exact_add`` (the first
    ``exact_add`` columns are always kept regardless of score).

    Parameters
    ----------
    D : numpy.ndarray
        2-D score matrix.
    I : numpy.ndarray
        2-D index matrix aligned with ``D``.
    cutoff : float
        Score threshold (exclusive).
    exact_add : int
        Number of leading columns that are kept unconditionally.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array of kept indices per row.
    """
    always_kept = np.arange(D.shape[1]) < exact_add
    # Broadcast the per-column rule across all rows in one shot.
    keep = (D > cutoff) | always_kept

    return [candidates[row_keep] for candidates, row_keep in zip(I, keep)]


def get_tfidf_embed(data, param_list, cutoffs):
    """
    Get TfIdf embeddings of ``data['title']`` for several tokenizer settings.

    For each parameter dict a ``TfidfVectorizer`` (float32 output) is fitted,
    nearest-neighbour scores/indices are computed with ``get_di_torch``, and
    density features are derived with ``get_dist_features``. Afterwards each
    score/index pair is thresholded with its matching entry of ``cutoffs``
    via ``cutoff_prediction``.

    Progress is printed: every parameter dict as it finishes, then the mean
    number of predicted matches per row for every cutoff.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``title`` column.
    param_list : list of dict
        Keyword arguments for ``TfidfVectorizer``, one dict per embedding.
    cutoffs : sequence of float
        Score cutoffs, paired positionally with ``param_list``.

    Returns
    -------
    tuple
        ``(embeddings, predictions, features)`` — three lists, one entry per
        processed parameter dict / cutoff pair.
    """
    embeddings, score_mats, index_mats, features = [], [], [], []

    for vect_kwargs in param_list:
        vectorizer = TfidfVectorizer(**vect_kwargs, dtype=np.float32)
        title_matrix = vectorizer.fit_transform(data['title'])
        embeddings.append(title_matrix)

        scores, neighbours = get_di_torch(title_matrix)
        score_mats.append(scores)
        index_mats.append(neighbours)

        features.append(get_dist_features(scores))
        print(vect_kwargs)

    predictions = []
    for scores, neighbours, threshold in zip(score_mats, index_mats, cutoffs):
        matched = cutoff_prediction(scores, neighbours, threshold)
        predictions.append(matched)
        # Average number of predicted matches per row.
        print(sum(map(len, matched)) / len(data))

    # The raw score/index lists are no longer needed once thresholded.
    del index_mats, score_mats
    gc.collect()

    return embeddings, predictions, features
    

# Build two TF-IDF views of the titles: 1-grams with the vectorizer's default
# analyzer, and character 3-grams. Each view gets its own score cutoff (0.45 here),
# paired positionally with param_list.
tfidf_embed, tfidf_preds, tfidf_points = get_tfidf_embed(data,                  
                                        param_list = [
                                            {'lowercase': True, 'ngram_range': (1, 1)}, 
                                            {'lowercase': True, 'ngram_range': (3, 3),
                                             'analyzer': 'char'},  
                                        ], 
                                        cutoffs=[0.45, 0.45])

{'lowercase': True, 'ngram_range': (1, 1)}
{'lowercase': True, 'ngram_range': (3, 3), 'analyzer': 'char'}
4.938139674894642
4.746989765201686
CPU times: user 706 ms, sys: 307 ms, total: 1.01 s
Wall time: 858 ms


In [None]:
# Debug inspection: print the container type and a one-element slice of the embedding list.
# NOTE(review): the saved output lists two matrices, which a [0:1] slice cannot produce —
# it looks stale; re-run the cell to confirm.
print(type(tfidf_embed),tfidf_embed[0:1])

<class 'list'> [<6644x9564 sparse matrix of type '<class 'numpy.float32'>'
	with 54743 stored elements in Compressed Sparse Row format>, <6644x14186 sparse matrix of type '<class 'numpy.float32'>'
	with 322282 stored elements in Compressed Sparse Row format>]
