# TF-IDF inference

## Import packages

In [1]:
import gc
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GroupKFold

## Configuration

In [2]:
class CFG:
    """Notebook-wide settings: where the data lives and how it is split."""

    # Path to the training CSV that is loaded into `df`.
    TRAIN_CSV = '../input/shopee-product-matching/train.csv'

    # GroupKFold layout: number of folds, and the fold index kept as `test_df`.
    N_SPLITS = 5
    TEST_FOLD = 0

## Read and split dataset

In [3]:
# Load the full training table.
df = pd.read_csv(CFG.TRAIN_CSV)

# Ground truth: each row gets the space-separated posting ids that share its
# label_group (the row's own posting_id included).
tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
df['matches'] = df['label_group'].map(tmp).apply(' '.join)

# Split by label_group so a whole group always lands in the same fold;
# -1 marks rows that have not been assigned yet.
gkf = GroupKFold(n_splits=CFG.N_SPLITS)
df['fold'] = -1
fold_splits = gkf.split(X=df, groups=df['label_group'])
for i, (train_idx, valid_idx) in enumerate(fold_splits):
    df.loc[valid_idx, 'fold'] = i

# Keep only the held-out fold, re-indexed from 0 so positions equal labels.
in_test_fold = df['fold'] == CFG.TEST_FOLD
test_df = df[in_test_fold].reset_index(drop=True)

## Utils

In [4]:
def precision_score(y_true, y_pred):
    """Row-wise precision between space-separated id strings.

    Parameters
    ----------
    y_true, y_pred : iterable of str (e.g. pandas Series)
        Ground-truth and predicted ids for each row, whitespace separated.
        Duplicate ids inside one string are counted once.

    Returns
    -------
    numpy.ndarray of float
        ``|true & pred| / |pred|`` per row. A row with an empty prediction
        scores 0.0 instead of NaN (0/0).
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    intersection = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float
    )
    len_y_pred = np.array([len(p) for p in pred_sets], dtype=float)
    # Divide only where something was predicted; the remaining rows keep the
    # 0.0 from `out`, which avoids NaN and the divide-by-zero RuntimeWarning.
    precision = np.divide(
        intersection,
        len_y_pred,
        out=np.zeros_like(intersection),
        where=len_y_pred > 0,
    )
    return precision

def recall_score(y_true, y_pred):
    """Row-wise recall between space-separated id strings.

    Parameters
    ----------
    y_true, y_pred : iterable of str (e.g. pandas Series)
        Ground-truth and predicted ids for each row, whitespace separated.
        Duplicate ids inside one string are counted once.

    Returns
    -------
    numpy.ndarray of float
        ``|true & pred| / |true|`` per row. A row with an empty ground truth
        scores 0.0 instead of NaN (0/0).
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    intersection = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float
    )
    len_y_true = np.array([len(t) for t in true_sets], dtype=float)
    # Divide only where there is something to recall; the remaining rows keep
    # the 0.0 from `out`, which avoids NaN and the RuntimeWarning.
    recall = np.divide(
        intersection,
        len_y_true,
        out=np.zeros_like(intersection),
        where=len_y_true > 0,
    )
    return recall

def f1_score(y_true, y_pred):
    """Row-wise F1 between space-separated id strings.

    Parameters
    ----------
    y_true, y_pred : iterable of str (e.g. pandas Series)
        Ground-truth and predicted ids for each row, whitespace separated.
        Duplicate ids inside one string are counted once.

    Returns
    -------
    numpy.ndarray of float
        ``2 * |true & pred| / (|true| + |pred|)`` per row. A row where both
        sides are empty scores 0.0 instead of NaN (0/0).
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    intersection = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float
    )
    total_len = np.array(
        [len(t) + len(p) for t, p in zip(true_sets, pred_sets)], dtype=float
    )
    # Divide only where at least one side is non-empty; the remaining rows keep
    # the 0.0 from `out`, which avoids NaN and the RuntimeWarning.
    f1 = np.divide(
        2 * intersection,
        total_len,
        out=np.zeros_like(intersection),
        where=total_len > 0,
    )
    return f1

In [5]:
def get_neighbors(df, embeddings, KNN = 50):
    '''
    Sweep cosine-distance thresholds over a KNN graph and keep the best one.

    Adapted from
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `posting_id` and `matches` (space-separated ground-truth
        ids). Modified in place: `pred_matches` and `f1` are (re)written.
    embeddings : array-like with a `.shape`, one row per row of `df`
        Row k is assumed to describe `df.iloc[k]`.
    KNN : int
        Neighbours retrieved per row; clamped to the number of rows.

    Returns
    -------
    (df, best_threshold)
        `df` with `pred_matches` / `f1` computed at `best_threshold`, the
        threshold from `np.linspace(0.1, 0.9, 9)` with the highest mean F1
        (the first one wins on ties).
    '''
    n_samples = embeddings.shape[0]

    # kneighbors() rejects n_neighbors larger than the number of fitted rows.
    model = NearestNeighbors(n_neighbors = min(KNN, n_samples), metric = "cosine")
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Hoisted out of the loops: one plain array instead of a pandas .iloc per row.
    posting_ids = df['posting_id'].values

    def matches_within(threshold):
        # For every row, join the ids of the neighbours closer than `threshold`.
        return [
            ' '.join(posting_ids[indices[k, distances[k] < threshold]])
            for k in range(n_samples)
        ]

    thresholds = list(np.linspace(0.1, 0.9, num=9))
    scores = []
    for threshold in thresholds:
        df['pred_matches'] = matches_within(threshold)
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
        scores.append(score)

    best_idx = int(np.argmax(scores))  # first maximum
    best_threshold = thresholds[best_idx]
    best_score = scores[best_idx]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    # Re-apply the winning threshold so the returned frame reflects it; the
    # sweep above leaves the columns at the last threshold tried.
    df['pred_matches'] = matches_within(best_threshold)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])

    del model, distances, indices
    gc.collect()
    return df, best_threshold

In [6]:
def get_knn_neighbors(df, embeddings, KNN = 50, threshold=0.6):
    """Predict matches at a fixed cosine-distance threshold and score them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `posting_id` and `matches` (space-separated ground-truth
        ids). Modified in place: `pred_matches`, `f1`, `recall` and
        `precision` are (re)written.
    embeddings : array-like with a `.shape`, one row per row of `df`
        Row k is assumed to describe `df.iloc[k]`.
    KNN : int
        Neighbours retrieved per row; clamped to the number of rows.
    threshold : float
        A neighbour is kept when its cosine distance is strictly below this.

    Returns
    -------
    (df, threshold)
        The scored frame and the threshold that was applied.
    """
    n_samples = embeddings.shape[0]

    # kneighbors() rejects n_neighbors larger than the number of fitted rows.
    model = NearestNeighbors(n_neighbors = min(KNN, n_samples), metric = "cosine")
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Use threshold
    posting_ids = df['posting_id'].values
    df['pred_matches'] = [
        ' '.join(posting_ids[indices[k, distances[k] < threshold]])
        for k in range(n_samples)
    ]
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    df['recall'] = recall_score(df['matches'], df['pred_matches'])
    df['precision'] = precision_score(df['matches'], df['pred_matches'])

    del model, distances, indices
    gc.collect()
    # Return the threshold actually used; the name `best_threshold` does not
    # exist in this function and only resolved through a notebook global.
    return df, threshold

In [7]:
def get_text_embeddings(df_cu, max_features = 15000, n_components = 5000):
    """Turn the `title` column into a dense binary TF-IDF matrix.

    Parameters
    ----------
    df_cu : DataFrame-like with a `title` column of strings.
    max_features : int
        Vocabulary cap handed to `TfidfVectorizer`.
    n_components : int
        Accepted for call compatibility only; the body never reads it.

    Returns
    -------
    Dense array with one row per title; its shape is printed before returning.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        binary=True,
        max_features=max_features,
    )
    tfidf_sparse = vectorizer.fit_transform(df_cu['title'])
    text_embeddings = tfidf_sparse.toarray()
    print(f'Our title text embedding shape is {text_embeddings.shape}')

    # Drop the fitted vectorizer and the sparse intermediate before returning.
    del vectorizer, tfidf_sparse
    gc.collect()
    return text_embeddings

## Find best thresholds

In [8]:
# NOTE: the cuDF conversion below is disabled; the pandas frame `test_df` is
# passed to get_text_embeddings directly.
# df_cu = cudf.DataFrame(test_df)
# Build TF-IDF embeddings from the held-out fold's titles. `n_components` is
# accepted by get_text_embeddings but not used by it.
text_embeddings = get_text_embeddings(test_df, max_features = 15000, n_components = 5000)

# Get neighbors for text_embeddings: sweeps distance thresholds 0.1..0.9 and
# returns the frame together with the threshold that gave the highest mean F1.
test_df, best_threshold = get_neighbors(test_df, text_embeddings, KNN = 50)

Our title text embedding shape is (6851, 9433)
Our f1 score for threshold 0.1 is 0.5196603413209685
Our f1 score for threshold 0.2 is 0.5735217462333471
Our f1 score for threshold 0.30000000000000004 is 0.6353535429478462
Our f1 score for threshold 0.4 is 0.7054377187245263
Our f1 score for threshold 0.5 is 0.7601620552843771
Our f1 score for threshold 0.6 is 0.7842075100158862
Our f1 score for threshold 0.7000000000000001 is 0.7343672790238077
Our f1 score for threshold 0.8 is 0.5234245852741843
Our f1 score for threshold 0.9 is 0.21400221568056407
Our best score is 0.7842075100158862 and has a threshold 0.6


## Find best KNN

In [10]:
# Preview the first rows of the evaluated fold.
# NOTE(review): the saved output shows `recall` and `precision` columns, which
# only get_knn_neighbors writes, and the execution counter skips In [9] — the
# cell that called it appears to be missing. Confirm with Restart & Run All.
test_df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,fold,pred_matches,f1,recall,precision
0,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,1835033137,train_1802986387 train_1396161074 train_713073...,0,train_1802986387 train_2490201622 train_139616...,0.461538,0.428571,0.5
1,train_1598329973,001d7f5d9a2fac714f4d5f37b3baffb4.jpg,bec8d09693634b4b,Atasan Rajut Wanita LISDIA SWEATER,2462407944,train_1598329973 train_841015183 train_4224502769,0,train_841015183 train_1598329973 train_4224502...,0.75,1.0,0.6
2,train_4196427721,002039aaf8618627a0442d5e89e5dda6.jpg,e98c873acc65946e,Korek Kuping LED untuk balita CherryBabyKidsSh...,349297863,train_4196427721 train_1482447822 train_234660...,0,train_4196427721 train_2221959828 train_148244...,0.545455,0.375,1.0
3,train_2985955659,002f978c58a44a00aadfca71c3cad2bb.jpg,bf38f0e083d7c710,HnKfashion Sweater Hoodie WHO Printing BabyTer...,3415582503,train_2985955659 train_3916258742 train_415673...,0,train_2985955659 train_3916258742 train_415673...,1.0,1.0,1.0
4,train_3466601092,004076b57135e761ab8b41d84acc4c94.jpg,aa2c0ee4eb6ba0cd,[Shiyan] mainan gigitan bayi set pack baby tee...,2933791615,train_3466601092 train_354147588,0,train_3466601092 train_354147588,1.0,1.0,1.0
