In [7]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

In [8]:
class CFG:
    # Notebook-wide settings, read as class attributes (never instantiated here).

    # Location of the training CSV that read_dataset() loads.
    TRAIN_CSV = "../input/shopee-product-matching/train.csv"

    # Fold layout: rows are spread over N_SPLITS group folds; one fold is
    # held out as the test set and another as the validation set.
    N_SPLITS = 5
    TEST_FOLD = 0
    VALID_FOLD = 1

In [9]:
def read_dataset():
    '''
    Load the training CSV and split it into train / validation / test frames.

    Two columns are added before splitting:
      * matches - space-separated posting_ids that share the row's label_group
      * fold    - fold number assigned by GroupKFold grouped on label_group

    label_group is integer-encoded on the full frame, then encoded again on
    the train split alone.

    Returns:
        (train_df, valid_df, test_df); each frame has its index reset.
    '''
    full_df = pd.read_csv(CFG.TRAIN_CSV)

    # Ground truth: every posting_id belonging to the same label_group as the row.
    ids_by_group = full_df.groupby('label_group')['posting_id'].unique().to_dict()
    full_df['matches'] = full_df['label_group'].map(ids_by_group)
    full_df['matches'] = full_df['matches'].apply(' '.join)

    # Tag each row with the fold in which it is held out.
    splitter = GroupKFold(n_splits=CFG.N_SPLITS)
    full_df['fold'] = -1
    fold_splits = splitter.split(X=full_df, groups=full_df['label_group'])
    for fold_no, (_, holdout_rows) in enumerate(fold_splits):
        full_df.loc[holdout_rows, 'fold'] = fold_no

    encoder = LabelEncoder()
    full_df['label_group'] = encoder.fit_transform(full_df['label_group'])

    in_test = full_df['fold'] == CFG.TEST_FOLD
    in_valid = full_df['fold'] == CFG.VALID_FOLD

    # Train is whatever is left once the test and validation folds are removed.
    train_df = full_df[~(in_test | in_valid)].reset_index(drop=True)
    valid_df = full_df[in_valid].reset_index(drop=True)
    test_df = full_df[in_test].reset_index(drop=True)

    # Re-fit on the train rows only, so train labels are encoded from the
    # classes actually present in the train split.
    train_df['label_group'] = encoder.fit_transform(train_df['label_group'])

    return train_df, valid_df, test_df

In [10]:
def get_text_embeddings(df, max_features = 15000, n_components = 5000):
    '''
    Embed df['title'] as binary TF-IDF vectors, then reduce them with PCA.

    Args:
        df: frame with a 'title' column (passed to TfidfVectorizer as the documents).
        max_features: vocabulary cap handed to TfidfVectorizer.
        n_components: value handed to PCA. An integer request is capped at
            min(n_rows, n_tfidf_features) so small inputs do not make PCA
            raise; non-integer values (float, 'mle', None) are passed through
            unchanged.

    Returns:
        2-D array of PCA-reduced embeddings, one row per row of df.
    '''
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # PCA is fed a dense matrix here, so the sparse TF-IDF output is densified.
    text_embeddings = model.fit_transform(df['title']).toarray()

    # PCA rejects an integer n_components larger than min(n_samples, n_features);
    # with a small frame or a small vocabulary the default of 5000 would fail.
    if isinstance(n_components, (int, np.integer)):
        n_components = min(n_components, *text_embeddings.shape)

    pca = PCA(n_components = n_components)
    text_embeddings = pca.fit_transform(text_embeddings)
    print(f'Our title text embedding shape is {text_embeddings.shape}')
    return text_embeddings

In [11]:
def _matches_within(df, distances, indices, threshold, show_progress = False):
    '''Return, for each query row, an array of the posting_ids of its neighbours closer than `threshold`.'''
    # Read the column once; `indices` holds positional row numbers into df.
    posting_ids = df['posting_id'].values
    rows = range(distances.shape[0])
    if show_progress:
        rows = tqdm(rows)
    return [posting_ids[indices[k, distances[k] < threshold]] for k in rows]


def get_neighbors(df, embeddings, KNN = 50, image = True, threshold = None):
    '''
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    Find, for every row of `embeddings`, the rows of `df` whose embedding lies
    within a distance threshold, using a NearestNeighbors search fitted on
    `embeddings` itself.

    Args:
        df: frame with a 'posting_id' column, row-aligned with `embeddings`.
            When GET_CV is truthy it must also have a 'matches' column, and its
            'pred_matches' and 'f1' columns are overwritten in place (left
            holding the values for the last threshold tried).
        embeddings: 2-D array, one row per row of df.
        KNN: neighbours to retrieve per row; capped at the number of rows.
        image: selects the image (True) or text (False) threshold grid and
            default threshold.
        threshold: distance cut-off for the returned predictions. None keeps
            the built-in defaults (2.7 for image, 0.60 for text).

    Returns:
        (df, predictions) where predictions is a list with one array of
        posting_ids per row.

    Note:
        Relies on module-level GET_CV and f1_score, which are not defined in
        this block - confirm they are defined before this function is called.
    '''
    if threshold is None:
        threshold = 2.7 if image else 0.60

    # kneighbors() raises if asked for more neighbours than there are fitted rows.
    model = NearestNeighbors(n_neighbors = min(KNN, embeddings.shape[0]))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Iterate through different thresholds to maximize cv, run this in interactive mode,
    # then pass the chosen value as `threshold`. The sweep only reports the best
    # threshold; the returned predictions always use `threshold`.
    if GET_CV:
        if image:
            thresholds = list(np.arange(2,4,0.1))
        else:
            thresholds = list(np.arange(0.1, 1, 0.1))
        scores = []
        for candidate in thresholds:
            candidate_matches = _matches_within(df, distances, indices, candidate)
            df['pred_matches'] = [' '.join(ids) for ids in candidate_matches]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    # Because we are predicting the test set that have 70K images and different label groups, confidence should be smaller
    # The progress bar is shown only outside CV mode, as before.
    predictions = _matches_within(df, distances, indices, threshold, show_progress = not GET_CV)

    return df, predictions

In [12]:
# Build the three splits; only the held-out test fold is embedded and matched below.
train_df, valid_df, test_df = read_dataset()

# Title embeddings for the test fold.
text_embeddings = get_text_embeddings(
    test_df,
    max_features = 15000,
    n_components = 5000,
)

# Neighbour search over the title embeddings; image = False selects the text thresholds.
df, text_predictions = get_neighbors(
    test_df,
    text_embeddings,
    KNN = 50,
    image = False,
)

Our title text embedding shape is (6851, 9433)
