In [1]:
%%bash
# install sentence transformers from dataset
cp -r ../input/sentencetransformers-sourceandsomemodels/sentence-transformers ./sentence-transformers
pip install -U --no-build-isolation --no-deps ./sentence-transformers

## copy sentence transformers pretrained models and configuration files from dataset to local caches
mkdir -p  /root/.cache/torch/

cp -r ../input/sentencetransformers-sourceandsomemodels/torch/sentence_transformers /root/.cache/torch

Processing ./sentence-transformers
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=124483 sha256=c0d7a796f2323694525dcd9f1f2c70df9f32c2d364d3bf5477ba7148203117a4
  Stored in directory: /root/.cache/pip/wheels/2a/dd/1f/d5e3ed645ab6e4c3bf10ce5be36e121cf4289cb69861525509
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.0




In [2]:
!unzip  ../input/pykakasi/pykakasi.deps -d ./
!pip install --no-index --find-links=./pykakasi pykakasi

Archive:  ../input/pykakasi/pykakasi.deps
   creating: ./pykakasi/
  inflating: ./pykakasi/zipp-3.8.0-py3-none-any.whl  
  inflating: ./pykakasi/jaconv-0.3.tar.gz  
  inflating: ./pykakasi/wrapt-1.14.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl  
  inflating: ./pykakasi/typing_extensions-4.2.0-py3-none-any.whl  
  inflating: ./pykakasi/importlib_metadata-4.11.4-py3-none-any.whl  
  inflating: ./pykakasi/pykakasi-2.2.1-py3-none-any.whl  
  inflating: ./pykakasi/Deprecated-1.2.13-py2.py3-none-any.whl  
Looking in links: ./pykakasi
Processing ./pykakasi/pykakasi-2.2.1-py3-none-any.whl
Processing ./pykakasi/jaconv-0.3.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
[?25hProcessing ./pykakasi/Deprecated-1.2.13-py2.py3-none-any.whl
Building wheels for collected packages: jaconv
  Building wheel for jaconv (setup.py) ... [?25l- done
[?25h  Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15

In [3]:
import pandas as pd


VERBOSE = False

def display_data():
    if VERBOSE:
        data = pd.read_feather("data.feather")
        display(data.head())

def display_pairs():
    if VERBOSE:
        pairs = pd.read_feather("pairs.feather")
        display(pairs.head())
def display_features():
    if VERBOSE:
        pairs = pd.read_feather("pairs.feather")
        cols = [c for c in pairs.columns]
        print(len(cols))
        display(cols)

In [4]:
%%writefile setup.py
import pandas as pd
import sys
def main():
    args = sys.argv[1:]
    if args[0] == "test":
        df = pd.read_csv("../input/foursquare-location-matching/test.csv")
        cols = [ c for c in df.columns]
        if df.shape[0] == 5:
            df = pd.read_csv("../input/foursquare-location-matching/train.csv")
            df = df[cols][:100]
    elif args[0] == "train":
        df = pd.read_feather("../input/flm-kfold-pairs/train_data.feather")
        if DEBUG:
            df = df[:100]
    elif args[0] == "valid":
        df = pd.read_feather("../input/flm-kfold-pairs/valid_data.feather")
        if DEBUG:
            df = df[:100]        
    
    print (f"{args[0]} data shape:{df.shape}")
    cols = [c for c in df.columns]
    print (cols)
    df.to_feather ("data.feather")
    
DEBUG = False
if __name__ == "__main__":
    main()

Writing setup.py


In [5]:
%%time
!python setup.py test

test data shape:(100, 12)
['id', 'name', 'latitude', 'longitude', 'address', 'city', 'state', 'zip', 'country', 'url', 'phone', 'categories']
CPU times: user 94.2 ms, sys: 25.8 ms, total: 120 ms
Wall time: 9.48 s


## Text2Vec

In [6]:
%%writefile text2vec.py
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
import pickle

def sentence_transformer_feat2Vec (model_name,df, feat): 

    model = SentenceTransformer(model_name)
    text_df = df[[feat]].drop_duplicates().reset_index(drop=True)
    
    vec =  model.encode(text_df[feat].values, show_progress_bar=True)

    return  text_df, vec                    

def main():


    df = pd.read_feather("data.feather")

    df["name"] =  df["name"].fillna("unknow")
    df["address"] =  df["address"].fillna("unknow")
    df["city"] =  df["city"].fillna("unknow")
    df["state"] =  df["state"].fillna("unknow")
    df["country"] =  df["country"].fillna("unknow")
    df["categories"] =  df["categories"].fillna("unknow")
    #df["city_state_country"] = df["city"]+ ", " +  df["state"] + ", " + df["country"]


    for model_name in [
        'sentence-transformers_paraphrase-xlm-r-multilingual-v1', 
        'sentence-transformers_paraphrase-multilingual-mpnet-base-v2',
        'sentence-transformers_all-mpnet-base-v2',
    ]:

        for feat in [ "categories", "city", "state", "name", "address" ]:

            print(f"{model_name}: {feat}")
            text_df, vec = sentence_transformer_feat2Vec (f"{LOCAL_CACHE}sentence_transformers/{model_name}",   df, feat)
            text_df.to_csv(f"{model_name}_{feat}.csv", index=False)

            with open(f'{model_name}_{feat}.vec', 'wb') as handle:
                pickle.dump(vec, handle)
            print(f"vec:{vec.shape}")

            
LOCAL_CACHE = '../input/sentencetransformers-sourceandsomemodels/torch/'
if __name__ == "__main__":
    main()

Writing text2vec.py


In [7]:
%%time
!python text2vec.py

sentence-transformers_paraphrase-xlm-r-multilingual-v1: categories
Batches: 100%|████████████████████████████████████| 3/3 [00:00<00:00,  3.11it/s]
vec:(82, 768)
sentence-transformers_paraphrase-xlm-r-multilingual-v1: city
Batches: 100%|████████████████████████████████████| 3/3 [00:00<00:00, 91.28it/s]
vec:(77, 768)
sentence-transformers_paraphrase-xlm-r-multilingual-v1: state
Batches: 100%|████████████████████████████████████| 2/2 [00:00<00:00, 61.72it/s]
vec:(57, 768)
sentence-transformers_paraphrase-xlm-r-multilingual-v1: name
Batches: 100%|████████████████████████████████████| 4/4 [00:00<00:00, 58.84it/s]
vec:(99, 768)
sentence-transformers_paraphrase-xlm-r-multilingual-v1: address
Batches: 100%|████████████████████████████████████| 3/3 [00:00<00:00, 65.79it/s]
vec:(71, 768)
sentence-transformers_paraphrase-multilingual-mpnet-base-v2: categories
Batches: 100%|████████████████████████████████████| 3/3 [00:00<00:00, 74.49it/s]
vec:(82, 768)
sentence-transformers_par

## LatLong2Vect

In [8]:
%%writefile latlong2vec.py
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle


#https://stackoverflow.com/questions/10473852/convert-latitude-and-longitude-to-point-in-3d-space
def LLHtoECEF(lat, lon):
    rad = np.float64(6378137.0)        # Radius of the Earth (in meters)
    f = np.float64(1.0/298.257223563)  # Flattening factor WGS84 Model
    LLHtoECEF_FF = (1.0-f)**2

    cosLat = np.cos(lat)
    sinLat = np.sin(lat)
    C = 1/np.sqrt(cosLat**2 + LLHtoECEF_FF * sinLat**2)
    S = C * LLHtoECEF_FF

    x = (rad * C)*cosLat * np.cos(lon)
    y = (rad * C)*cosLat * np.sin(lon)
    z = (rad * S)*sinLat
    

    mat = np.vstack((x,y,z)).T

    mat = mat / np.linalg.norm(mat, axis=1).reshape((-1, 1))

    
    return mat

def lat_lon_feat2vec(df):
    lat_lon_matrix = df[["latitude","longitude"]].values
    vec = LLHtoECEF ( lat_lon_matrix[:,0], lat_lon_matrix[:,1] )
    
    return vec

def main():
    data = pd.read_feather("data.feather")
    vec = lat_lon_feat2vec(data)
    with open(f'lat_lon.vec', 'wb') as handle:
        pickle.dump(vec, handle)

    print(f"latlong2vec shape:{vec.shape}")
    
if __name__ == "__main__":
    main()

Writing latlong2vec.py


In [9]:
%%time
!python latlong2vec.py

latlong2vec shape:(100, 3)
CPU times: user 11.4 ms, sys: 5.24 ms, total: 16.7 ms
Wall time: 1.43 s


## Generate pairs (pykakasi)

In [10]:
%%writefile convert.py
import pandas as pd
import pykakasi


def convert_japanese_alphabet(df: pd.DataFrame):
    kakasi = pykakasi.kakasi()
    kakasi.setMode('H', 'a')  # Convert Hiragana into alphabet
    kakasi.setMode('K', 'a')  # Convert Katakana into alphabet
    kakasi.setMode('J', 'a')  # Convert Kanji into alphabet
    conversion = kakasi.getConverter()

    def convert(row):
        for column in ["name", "address", "city", "state"]:
            try:
                row[column] = conversion.do(row[column])
            except:
                pass
        return row

    df[df["country"] == "JP"] = df[df["country"] == "JP"].apply(convert, axis=1)
    return df


def main():
    data = pd.read_feather("data.feather")
    print(data.query("country == 'JP'")["name"].values[:10])
    
    data = data [["id", "name", "address", "city", "state", "country"]]
    print(f"shape:{data.shape}")

    data = convert_japanese_alphabet(data)
    print(data.query("country == 'JP'")["name"].values[:10])    
    print(f"shape:{data.shape}")
    
    data.to_feather("convert_data.feather")


if __name__ == "__main__":
    main()

Writing convert.py


In [11]:
%%time
!python convert.py

['つじ田 味噌の章' 'ローソン 弘前撫牛子店' 'ざま駅前歯科医院' '下高井戸 どどん' '虎萬元 南青山店'
 'イオン乙金ショッピングセンター' '甲武信小屋']
shape:(100, 6)
  kakasi.setMode('H', 'a')  # Convert Hiragana into alphabet
  kakasi.setMode('K', 'a')  # Convert Katakana into alphabet
  kakasi.setMode('J', 'a')  # Convert Kanji into alphabet
  conversion = kakasi.getConverter()
  row[column] = conversion.do(row[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df["country"] == "JP"] = df[df["country"] == "JP"].apply(convert, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to b

In [12]:
%%writefile generate_pairs.py
import warnings
warnings.filterwarnings('ignore')
import os

import random
import pandas as pd
import numpy as np
import pickle

import torch
if torch.cuda.is_available():
    import cuml
    from cuml.neighbors import NearestNeighbors 
    print(f"cuml:{cuml.__version__}")
else:
    from sklearn.neighbors import NearestNeighbors 
from sklearn.feature_extraction.text import TfidfVectorizer    
    
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)    
    


from tqdm import tqdm

#https://stackoverflow.com/questions/10473852/convert-latitude-and-longitude-to-point-in-3d-space
def LLHtoECEF(lat, lon):
    rad = np.float64(6378137.0)        # Radius of the Earth (in meters)
    f = np.float64(1.0/298.257223563)  # Flattening factor WGS84 Model
    LLHtoECEF_FF = (1.0-f)**2

    cosLat = np.cos(lat)
    sinLat = np.sin(lat)
    C = 1/np.sqrt(cosLat**2 + LLHtoECEF_FF * sinLat**2)
    S = C * LLHtoECEF_FF

    x = (rad * C)*cosLat * np.cos(lon)
    y = (rad * C)*cosLat * np.sin(lon)
    z = (rad * S)*sinLat
    

    mat = np.vstack((x,y,z)).T

    mat = mat / np.linalg.norm(mat, axis=1).reshape((-1, 1))

    
    return mat

def lat_lon_feat2vec(df):
    lat_lon_matrix = df[["latitude","longitude"]].values
    vec = LLHtoECEF ( lat_lon_matrix[:,0], lat_lon_matrix[:,1] )
    
    return vec


def recall_knn_latlong (df, neighbors, total_neighbors, threshold  ):


    
    pairs = []
    for country, country_df in tqdm(df.groupby('country')):
        pairs_country = []
        size = country_df.shape[0] 
        vec = lat_lon_feat2vec(country_df)
        

 
        total_neighbors_ = min ( size, total_neighbors)
        neighbors_ = min(size, neighbors)
   
        if total_neighbors_ > 1:        
            knn = NearestNeighbors(n_neighbors = total_neighbors_, metric = 'cosine',  algorithm="brute", n_jobs = -1)
          
            knn.fit(vec)
            dists, nears = knn.kneighbors(vec, return_distance = True)    
            mean_1 = dists[:, :neighbors_].mean(axis=1)
            mean_2 = dists[:, :total_neighbors_].mean(axis=1)
            for k in range(0,total_neighbors_):            
                cur_df = country_df[['id']]
                cur_df['match_id'] = country_df['id'].values[nears[:, k]]
                cur_df[f'kdist'] = dists[:, k]
                cur_df[f'kneighbors'] = k
                cur_df[f'kdist_mean_1'] = mean_1
                cur_df[f'kdist_mean_2'] = mean_2
                cur_df = cur_df.query("id != match_id  and (kneighbors < @neighbors_ or kdist < @threshold) ")

                if len(cur_df) > 0:

                    flag = cur_df["id"] > cur_df["match_id"]
                    ids = cur_df[flag]["id"].values
                    match_ids = cur_df[flag]["match_id"].values

                    cur_df.loc[flag, "id"] = match_ids
                    cur_df.loc[flag, "match_id"] = ids
                    pairs_country.append(cur_df)
                    
            pairs_country = pd.concat (pairs_country)
                    
            pairs_country = pairs_country.groupby ( ["id","match_id"]).agg ( {
                                                                f'kneighbors':["mean"],
                                                                f'kdist_mean_1': ["mean"],
                                                                f'kdist_mean_2': ["mean"],
                                                                f'kdist':["count"]
                                                                   } ).reset_index() 

            pairs_country.columns =  ['_'.join(col) for col in pairs_country.columns.values]
            pairs_country = pairs_country.rename(columns={"id_":"id", "match_id_":"match_id"})
            pairs.append(pairs_country)


    pairs = pd.concat(pairs)
    
    print(f"recall_knn_latlong: {pairs.shape}")
    
    return pairs


def recall_knn_vec ( df, neighbors, total_neighbors, threshold, vec_name ):

    
    name = pd.read_csv(f"{TEXT2VEC_PATH}/{TEXT2VEC_PREFIX}{vec_name}.csv")
    with open(f'{TEXT2VEC_PATH}/{TEXT2VEC_PREFIX}{vec_name}.vec', 'rb') as handle:
        name_vec = pickle.load(handle)
    
    d = name[vec_name].to_dict()
    name2ids = { d[k]:k for k in d  }        
    pairs = []
    for country, country_df in tqdm(df.groupby('country')):
        pairs_country = []
        country_df = country_df[ country_df[vec_name].notnull() ]
        names = country_df[vec_name].values
        size = names.shape[0] 
        vec = np.zeros ((size, name_vec.shape[1]))
        for i in range (size):
            vec[i] = name_vec[ name2ids [names[i]] ,:] 

        total_neighbors_ = min ( size, total_neighbors)
        neighbors_ = min(size, neighbors)
        
        if total_neighbors_ > 1:        
            knn = NearestNeighbors(n_neighbors = total_neighbors_, metric = 'cosine',  algorithm="brute", n_jobs = -1)
            
            #print (country, size, total_neighbors_, neighbors_)
            
            knn.fit(vec)
            dists, nears = knn.kneighbors(vec, return_distance = True)    
            mean_1 = dists[:, :neighbors_].mean(axis=1)
            mean_2 = dists[:, :total_neighbors_].mean(axis=1)
            
            for k in range(0,total_neighbors_):            
                cur_df = country_df[['id']]
                cur_df['match_id'] = country_df['id'].values[nears[:, k]]
                cur_df[f'xml_{vec_name}_kdist'] = dists[:, k]
                cur_df[f'xml_{vec_name}_kneighbors'] = k
                cur_df[f'xml_{vec_name}_kdist_mean_1'] = mean_1
                cur_df[f'xml_{vec_name}_kdist_mean_2'] = mean_2
                cur_df = cur_df.query(f"id != match_id  and (xml_{vec_name}_kneighbors < @neighbors_ or xml_{vec_name}_kdist < @threshold) ")
                if len(cur_df) > 0:
                    flag = cur_df["id"] > cur_df["match_id"]
                    ids = cur_df[flag]["id"].values
                    match_ids = cur_df[flag]["match_id"].values

                    cur_df.loc[flag, "id"] = match_ids
                    cur_df.loc[flag, "match_id"] = ids
                    pairs_country.append(cur_df)
                    
            pairs_country = pd.concat (pairs_country)
                    
            pairs_country = pairs_country.groupby ( ["id","match_id"]).agg ( {
                                                                f'xml_{vec_name}_kneighbors':["mean"],
                                                                f'xml_{vec_name}_kdist_mean_1': ["mean"],
                                                                f'xml_{vec_name}_kdist_mean_2': ["mean"],
                                                                f'xml_{vec_name}_kdist':["count"]
                                                                   } ).reset_index() 

            pairs_country.columns =  ['_'.join(col) for col in pairs_country.columns.values]
            pairs_country = pairs_country.rename(columns={"id_":"id", "match_id_":"match_id", f'xml_{vec_name}_kdist': f"xml_{vec_name}_count"})
            pairs.append(pairs_country)

    pairs = pd.concat(pairs)
    
    print(f"recall_knn_vec: {vec_name}: {pairs.shape}")
    
    return pairs

def recall_knn_tfid ( df, neighbors, total_neighbors, col ):
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(df[col].fillna('nan'))
    pairs = []
    for country, country_df in tqdm(df.groupby('country')):
        pairs_country = []
        country_df = country_df[ country_df[col].notnull() ]
        size = len(country_df)
        total_neighbors_ = min ( size, total_neighbors)
        neighbors_ = min(size, neighbors)
        
        
        
        if total_neighbors_ > 1:        

            knn = NearestNeighbors(n_neighbors = total_neighbors_, metric = 'cosine', algorithm="brute", n_jobs = -1)
            
            #print (country, size, total_neighbors_, neighbors_)

            
            idx = country_df.index
            knn.fit(tv_fit[idx])            
            
            dists, nears = knn.kneighbors(tv_fit[idx], return_distance = True)    
            mean_1 = dists[:, :neighbors_].mean(axis=1)
            mean_2 = dists[:, :total_neighbors_].mean(axis=1)
            
            for k in range(0,neighbors_):            
                cur_df = country_df[['id']]
                cur_df['match_id'] = country_df['id'].values[nears[:, k]]
                cur_df[f'tfidf_{col}_kdist'] = dists[:, k]
                cur_df[f'tfid_{col}_kneighbors'] = k
                cur_df[f'tfid_{col}_kdist_mean_1'] = mean_1
                cur_df[f'tfid_{col}_kdist_mean_2'] = mean_2
                cur_df = cur_df.query("id != match_id")
                if len(cur_df) > 0:
                    flag = cur_df["id"] > cur_df["match_id"]
                    ids = cur_df[flag]["id"].values
                    match_ids = cur_df[flag]["match_id"].values

                    cur_df.loc[flag, "id"] = match_ids
                    cur_df.loc[flag, "match_id"] = ids

                    pairs_country.append(cur_df)
            pairs_country = pd.concat (pairs_country)
            pairs_country = pairs_country.groupby ( ["id","match_id"]).agg ( {
                                                            f'tfid_{col}_kneighbors':["mean"],
                                                            f'tfid_{col}_kdist_mean_1': ["mean"],
                                                            f'tfid_{col}_kdist_mean_2': ["mean"],
                                                            f'tfidf_{col}_kdist':["count"]
                                                               } ).reset_index() 
            pairs_country.columns =  ['_'.join(col) for col in pairs_country.columns.values]
            pairs_country = pairs_country.rename(columns={"id_":"id", "match_id_":"match_id" })
            pairs.append(pairs_country)
            
    pairs = pd.concat(pairs)
    print(f"recall_knn_tfid: {col}: {pairs.shape}")
    return pairs

def recall_knn_tfid_char ( df, neighbors, total_neighbors, col ):
    tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
    tv_fit = tfidf.fit_transform(df[col].fillna('nan'))
    pairs = []
    for country, country_df in tqdm(df.groupby('country')):
        pairs_country = []
        country_df = country_df[ country_df[col].notnull() ]
        size = len(country_df)
        total_neighbors_ = min ( size, total_neighbors)
        neighbors_ = min(size, neighbors)
        
        
        
        if total_neighbors_ > 1:        

            knn = NearestNeighbors(n_neighbors = total_neighbors_, metric = 'cosine', algorithm="brute", n_jobs = -1)
            
            #print (country, size, total_neighbors_, neighbors_)

            
            idx = country_df.index
            knn.fit(tv_fit[idx])            
            
            dists, nears = knn.kneighbors(tv_fit[idx], return_distance = True)    
            mean_1 = dists[:, :neighbors_].mean(axis=1)
            mean_2 = dists[:, :total_neighbors_].mean(axis=1)
            
            for k in range(0,neighbors_):            
                cur_df = country_df[['id']]
                cur_df['match_id'] = country_df['id'].values[nears[:, k]]
                cur_df[f'tfid_char_{col}_kdist'] = dists[:, k]
                cur_df[f'tfid_char_{col}_kneighbors'] = k
                cur_df[f'tfid_char_{col}_kdist_mean_1'] = mean_1
                cur_df[f'tfid_char_{col}_kdist_mean_2'] = mean_2
                cur_df = cur_df.query("id != match_id")
                if len(cur_df) > 0:
                    flag = cur_df["id"] > cur_df["match_id"]
                    ids = cur_df[flag]["id"].values
                    match_ids = cur_df[flag]["match_id"].values

                    cur_df.loc[flag, "id"] = match_ids
                    cur_df.loc[flag, "match_id"] = ids

                    pairs_country.append(cur_df)
            pairs_country = pd.concat (pairs_country)
            pairs_country = pairs_country.groupby ( ["id","match_id"]).agg ( {
                                                            f'tfid_char_{col}_kneighbors':["mean"],
                                                            f'tfid_char_{col}_kdist_mean_1': ["mean"],
                                                            f'tfid_char_{col}_kdist_mean_2': ["mean"],
                                                            f'tfid_char_{col}_kdist':["count"]
                                                               } ).reset_index() 
            pairs_country.columns =  ['_'.join(col) for col in pairs_country.columns.values]
            pairs_country = pairs_country.rename(columns={"id_":"id", "match_id_":"match_id" })
            pairs.append(pairs_country)
            
    pairs = pd.concat(pairs)
    print(f"recall_knn_tfid_char: {col}: {pairs.shape}")
    return pairs


def recall_knn(df, verbose=False):
    
    pairs = recall_knn_latlong (df, LATLONG_NEIGHBORS,  TOTAL_NEIGHBORS, LATLONG_THRESHOLD)

    pairs = reduce_mem_usage(pairs)
    print(f"shape: {pairs.shape}")

    
    #tfidf char name
    data =  pd.read_feather( "convert_data.feather")
    recall_df = recall_knn_tfid_char(data, NEIGHBORS, TOTAL_NEIGHBORS, "name")
    recall_df = reduce_mem_usage(recall_df)
    pairs = pairs.merge(recall_df, on=["id","match_id"], how="outer")    
    print(f"tfidf_char name shape: {pairs.shape}")
    del data
                       
    #vec name
    recall_df = recall_knn_vec(df, NEIGHBORS, TOTAL_NEIGHBORS_2, NAME_THRESHOLD, "name")
    recall_df = reduce_mem_usage(recall_df)
    pairs = pairs.merge(recall_df, on=["id","match_id"], how="outer")
    print(f"vec name shape: {pairs.shape}")

    #vec address
    recall_df = recall_knn_vec(df, NEIGHBORS, TOTAL_NEIGHBORS_2, ADDRESS_THRESHOLD, "address")
    recall_df = reduce_mem_usage(recall_df)
    pairs = pairs.merge(recall_df, on=["id","match_id"], how="outer")
    pairs = reduce_mem_usage(pairs)
    print(f"vec address shape: {pairs.shape}")
    
    #vec categoris
    """
    recall_df = recall_knn_vec(df,NEIGHBORS, TOTAL_NEIGHBORS_2, CATEGORIES_THRESHOLD, "categories")
    recall_df = reduce_mem_usage(recall_df)
    pairs = pairs.merge(recall_df, on=["id","match_id"], how="outer")

    pairs = reduce_mem_usage(pairs)
    print(f"vec categories shape: {pairs.shape}")
    """
    

    print(pairs.dtypes)

    
    return pairs

def generate_pairs (data):
    
    train_data = recall_knn(data, verbose=False)
    

    #generate target
    if "point_of_interest" in data:
        data = data.set_index('id')
        ids = train_data['id'].tolist()
        match_ids = train_data['match_id'].tolist()

        poi = data.loc[ids]['point_of_interest'].values
        match_poi = data.loc[match_ids]['point_of_interest'].values

        train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
    
    return train_data

def reduce_mem_usage(df, verbose=True):

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def main():
    data = pd.read_feather("data.feather")


    pairs = generate_pairs (data)


    print(f"pairs shape: {pairs.shape}")

    pairs = pairs.reset_index(drop=True)
    print("save")
    pairs.to_feather(f"pairs.feather")
    
    
SEED = 2022
seed_everything(SEED)
TOTAL_NEIGHBORS = 100
TOTAL_NEIGHBORS_2 = 50
LATLONG_NEIGHBORS = 12
LATLONG_THRESHOLD = 0.000015
NAME_THRESHOLD = 0.00099
ADDRESS_THRESHOLD = 0.00492
#CATEGORIES_THRESHOLD = 0.00001
NEIGHBORS = 10
TEXT2VEC_PATH = "."
TEXT2VEC_PREFIX = "sentence-transformers_paraphrase-xlm-r-multilingual-v1_"


if __name__ == "__main__":
    main()

Writing generate_pairs.py


In [13]:
%%time
!python generate_pairs.py

cuml:21.10.02
100%|███████████████████████████████████████████| 32/32 [00:09<00:00,  3.55it/s]
recall_knn_latlong: (291, 6)
Memory usage after optimization is: 0.01 MB
Decreased by 35.4%
shape: (291, 6)
100%|███████████████████████████████████████████| 32/32 [00:01<00:00, 31.98it/s]
recall_knn_tfid_char: name: (281, 6)
Memory usage after optimization is: 0.01 MB
Decreased by 35.4%
tfidf_char name shape: (365, 10)
100%|███████████████████████████████████████████| 32/32 [00:03<00:00, 10.39it/s]
recall_knn_vec: name: (262, 6)
Memory usage after optimization is: 0.01 MB
Decreased by 35.4%
vec name shape: (397, 14)
100%|███████████████████████████████████████████| 32/32 [00:02<00:00, 13.93it/s]
recall_knn_vec: address: (189, 6)
Memory usage after optimization is: 0.01 MB
Decreased by 35.4%
Memory usage after optimization is: 0.02 MB
Decreased by 30.0%
vec address shape: (412, 18)
id                                   object
match_id                             object

## Feature Extraction

In [14]:
%%writefile generate_fe.py
import warnings
warnings.filterwarnings('ignore')
import os

import random
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm

import torch
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)    
    

def reduce_mem_usage(df, verbose=True):

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def cosine_similarity (a,b):
    """
    a /= np.linalg.norm(a, axis=1).reshape((-1, 1))
    b /= np.linalg.norm(b, axis=1).reshape((-1, 1))

    cos_sim = np.sum(a*b, axis=1)
    """
    
    
    t1 = torch.from_numpy(a).to(DEVICE)
    t2 = torch.from_numpy(b).to(DEVICE)
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    output = cos(t1, t2)
    return output.to('cpu').numpy()

def lat_lon_cos_sim (pairs, inv_d, path):

    file = open(path,'rb')
    lat_lon_vec = pickle.load(file)

    ids1_x = pairs["id"].map(inv_d)
    ids2_x = pairs["match_id"].map(inv_d)


    a = lat_lon_vec[ids1_x]
    b = lat_lon_vec[ids2_x]
    #cos_sim = np.sum(a*b, axis=1)
    #return cos_sim
    return cosine_similarity(a,b)

def get_vec (feat_series, feat_name, prefix  ):
    
    df_feat = pd.read_csv(f'{prefix}{feat_name}.csv')
    file = open(f'{prefix}{feat_name}.vec','rb')
    feat_vec = pickle.load(file)

    
    d = df_feat[feat_name].to_dict()
    inv_d = {v: k for k, v in d.items()}

    feat_values = feat_series.values

    vec = np.zeros((feat_values.shape[0], feat_vec.shape[1]))
    for i in range(vec.shape[0]):
        vec[i,:] = feat_vec[ inv_d[feat_values[i]], : ]
    return vec

def feat_cos_sim(feat_name,  prefix , feat_series , inv_d, pairs, batch_size):
    vec = get_vec ( feat_series,feat_name = feat_name, prefix = prefix )    
    step = len(pairs)//batch_size
    if batch_size*step < len(pairs):
        step += 1

    ret = []
    start = 0
    for i in tqdm(range(step)):
        end = start + batch_size

        ids1_x = pairs[start:end]["id"].map(inv_d).values
        ids2_x = pairs[start:end]["match_id"].map(inv_d).values

        a = vec[ids1_x] 
        b = vec[ids2_x] 

        
        """
        a /= np.linalg.norm(a, axis=1).reshape((-1, 1))
        b /= np.linalg.norm(b, axis=1).reshape((-1, 1))

        cos_sim = np.sum(a*b, axis=1)
        """

        cos_sim = cosine_similarity(a,b)

        
        ret.append(cos_sim)
        start += batch_size
    
    return np.concatenate(ret)


def generate_fe (path, batch_size = 100_000):
    data = pd.read_feather("data.feather")
    d = data["id"].to_dict()
    # train["id"] --> train.index
    inv_d = {v: k for k, v in d.items()}
    data.index = data["id"]

    pairs = pd.read_feather("pairs.feather")
    if DEBUG:
        pairs = pairs [:1000]
    
    if "label" in pairs:
        targets = pairs["label"].values
        pairs = pairs.drop(["label"], axis = 1)
    else:
        targets = None
    
    
    pairs = pairs.rename(columns={"id_count":"kdist_count"})

    for feat_name in ["address", "city", "categories"]:
        print(feat_name)
        d = data[feat_name].notnull().to_dict()
        pairs[f"{feat_name}_id_notnull"] = pairs["id"].map(d).astype(np.int8)
        pairs[f"{feat_name}_match_id_notnull"] = pairs["match_id"].map(d).astype(np.int8)
    pairs = reduce_mem_usage (pairs)

    print("lat_lon_cs")
    cos_sim = lat_lon_cos_sim (pairs, inv_d, path=f"{INPUT_VEC}/lat_lon.vec")
    pairs["lat_lon_cs"] = cos_sim
    pairs = reduce_mem_usage (pairs)

    postfix = "xml"
    prefix = "./sentence-transformers_paraphrase-xlm-r-multilingual-v1_"

    for feat_name in [ "name", "address", "categories", "city" ]:
        print(feat_name)
        cos_sim = feat_cos_sim (feat_name = feat_name, prefix = prefix , 
                                feat_series = data[feat_name].fillna("unknow"), 
                                inv_d = inv_d, 
                                pairs = pairs, 
                                batch_size = batch_size)
        pairs[f"{feat_name}_{postfix}_cs"] = cos_sim
        pairs = reduce_mem_usage ( pairs , verbose=True)

    postfix = "mpnet-ml"
    path = "./sentence-transformers_paraphrase-multilingual-mpnet-base-v2_"

    for feat_name in [ "name", "address", "categories", "city" ]:
        print(feat_name)
        cos_sim = feat_cos_sim (feat_name = feat_name, prefix = prefix , 
                                feat_series = data[feat_name].fillna("unknow"), 
                                inv_d = inv_d, 
                                pairs = pairs, 
                                batch_size = batch_size)
        pairs[f"{feat_name}_{postfix}_cs"] = cos_sim
        pairs = reduce_mem_usage ( pairs , verbose=True)

    postfix = "mpnet"
    path = "./sentence-transformers_all-mpnet-base-v2_"

    for feat_name in [ "name", "address", "categories", "city" ]:
        print(feat_name)
        cos_sim = feat_cos_sim (feat_name = feat_name, prefix = prefix, 
                                feat_series = data[feat_name].fillna("unknow"), 
                                inv_d = inv_d, 
                                pairs = pairs, 
                                batch_size = batch_size)
        pairs[f"{feat_name}_{postfix}_cs"] = cos_sim
        pairs = reduce_mem_usage ( pairs , verbose=True)

    if not targets is None:
        pairs["label"] = targets
        
    return pairs 



SEED = 2022
seed_everything(SEED)

INPUT_PATH = "./"
INPUT_VEC = "./"
DEBUG = False

def main():
    pairs = generate_fe(path = INPUT_PATH,  batch_size = 200_000)
    pairs.to_feather(f"pairs.feather")
    
if __name__ == "__main__":
    main()

Writing generate_fe.py


In [15]:
%%time
!python generate_fe.py

address
city
categories
Memory usage after optimization is: 0.02 MB
Decreased by 0.0%
lat_lon_cs
Memory usage after optimization is: 0.02 MB
Decreased by 9.6%
name
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 136.36it/s]
Memory usage after optimization is: 0.02 MB
Decreased by 9.3%
address
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 306.69it/s]
Memory usage after optimization is: 0.02 MB
Decreased by 9.0%
categories
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 321.08it/s]
Memory usage after optimization is: 0.02 MB
Decreased by 8.8%
city
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 313.92it/s]
Memory usage after optimization is: 0.03 MB
Decreased by 8.5%
name
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 315.22it/s]
Memory usage after optimization is: 0.03 MB
Decreased by 8.3%
address
100%|████████████████████████████████████████████| 1/1 

In [16]:
display_features()

## Feature Extraction EXT

In [17]:
%%writefile generate_fe_ext.py
import pandas as pd
import numpy as np

import os
import gc
import random
import Levenshtein
import difflib

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib

def calculate_jaccard_char(str1, str2):
    
    # Combine both tokens to find union.
    both_tokens = str1 + str2
    union = set(both_tokens)
    if len(union) == 0:
        return 0
    
    # Calculate intersection.
    intersection = set()
    for w in set(str1):
        if w in set(str2):
            intersection.add(w)

    jaccard_score = len(intersection)/len(union)
    
    return jaccard_score

def calculate_jaccard_word(str1, str2):
    
    # Combine both tokens to find union.
    words1 = str1.split()
    words2 = str2.split()
    union = set(words1 + words2)
    if len(union) == 0:
        return 0
    
    # Calculate intersection.
    intersection = set()
    for word in union:
        if word in words1 and word in words2:
            intersection.add(word)

    jaccard_score = len(intersection)/len(union)
    
    return jaccard_score
    
def calculate_jaccard_word_smallest(str1, str2):
    
    if str1 == str2:
        return 1
    
    # Combine both tokens to find union.
    words1 = str1.split()
    words2 = str2.split()
    union = set(words1 + words2)
    small = min(len(set(words1)), len(set(words2)))
    if small == 0:
        return 0
    
    # Calculate intersection.
    intersection = set()
    for word in union:
        if word in words1 and word in words2:
            intersection.add(word)

    jaccard_score = len(intersection)/small
    
    return jaccard_score


def reduce_mem_usage(df, verbose=True):

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df



def generate_fe_ext (train_pairs, batch_size ):
    def process_tfidf (col):
        
        train = pd.read_feather(f"col_{col}.feather")
        
        tfidf = TfidfVectorizer()
        tv_fit = tfidf.fit_transform(train[col].fillna('nan'))
        a = np.array(tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)).ravel().astype(np.float16)
        np.save(f"{col}_tfid.npy", a)
        
        del train, tfidf, a
        gc.collect()    

    def process_tfidf_char (col):
        
        train = pd.read_feather(f"conv_col_{col}.feather")
        
        tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
        tv_fit = tfidf.fit_transform(train[col].fillna('nan'))        

        a = np.array(tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)).ravel().astype(np.float16)
        np.save(f"conv_{col}_tfid.npy", a)
        
        del train, tfidf, a
        gc.collect()    
        
        

    def process_ext(col):
        
        with open("log.txt", 'a') as f:
            f.write(f"process_ext: start {col}\n")
        
        train = pd.read_feather(f"col_{col}.feather")

        ret_gesh = []
        ret_levens = []
        ret_jaros = []
        ret_lcs = []
        ret_jaccard_words = []
        ret_jaccard_chars = []        


        start = 0
        for i in range(step):
            gesh = []
            levens = []
            jaros = []
            lcs = []
            jaccard_words = []
            jaccard_chars = []

            end = start + batch_size          
            id_values = train.loc[indexs[start:end]][col].values.astype(str)
            match_id_values = train.loc[match_indexs[start:end]][col].values.astype(str)


            
            for s, match_s in zip(id_values, match_id_values):
                if s != 'nan' and match_s != 'nan':                    
                    sm = difflib.SequenceMatcher(None, s, match_s)
                    
                    gesh.append(sm.ratio())
                    levens.append(Levenshtein.distance(s, match_s))
                    jaros.append(Levenshtein.jaro_winkler(s, match_s))                    
                    lcs.append(sm.find_longest_match(0,len(s), 0,len(match_s)).size)
                    
                    if col in ["name", "address", "categories"]:
                        jaccard_words.append(calculate_jaccard_word(s, match_s))
                        jaccard_chars.append(calculate_jaccard_char(s, match_s))                    
                else:
                    gesh.append(np.nan)
                    levens.append(np.nan)
                    jaros.append(np.nan)
                    lcs.append(np.nan)
                    if col in ["name", "address", "categories"]:
                        jaccard_words.append(np.nan)
                        jaccard_chars.append(np.nan)        
                    
            start += batch_size            
            
            ret_gesh.append(np.array(gesh).astype(np.float16))
            ret_levens.append(np.array(levens).astype(np.float16))
            ret_jaros.append(np.array(jaros).astype(np.float16))
            ret_lcs.append(np.array(lcs).astype(np.float16))
            del gesh, levens, jaros, lcs

            if col in ["name", "address", "categories"]:
                ret_jaccard_words.append(np.array(jaccard_words).astype(np.float16))
                ret_jaccard_chars.append(np.array(jaccard_chars).astype(np.float16))
            
            del jaccard_words, jaccard_chars
            gc.collect()
            
        np.save(f"{col}_gesh.npy", np.concatenate(ret_gesh))
        np.save(f"{col}_levens.npy", np.concatenate(ret_levens))
        np.save(f"{col}_jeros.npy", np.concatenate(ret_jaros))
        np.save(f"{col}_lcs.npy", np.concatenate(ret_lcs))
        del train, ret_gesh, ret_levens, ret_jaros, ret_lcs

        if col in ["name", "address", "categories"]:        
            np.save(f"{col}_jaccard_words.npy", np.concatenate(ret_jaccard_words))
            np.save(f"{col}_jaccard_chars.npy", np.concatenate(ret_jaccard_chars))        
            del ret_jaccard_words, ret_jaccard_chars
        
        gc.collect()            
        with open("log.txt", "a") as f:
            f.write(f"process_ext: end {col}\n")
        
        
    
    train = pd.read_feather("data.feather")
    
    #train_pairs = train_pairs.drop(["xml_name_kdist", "xml_address_kdist", "xml_categories_kdist"], axis=1)
    #train_pairs = reduce_mem_usage (train_pairs, verbose = True)
    tfidf_cols = ["name", "address", "categories", 'state', 'zip', 'url', 'phone' ] 
    tfidf_conv_cols = ["name", "address", "city", "state"] 
    feat_cols = ["name", "address", "categories", 'state', 'zip', 'url', 'phone']
    
    
    d = train["id"].to_dict()
    # train["id"] --> train.index
    inv_d = {v: k for k, v in d.items()}

    indexs = [inv_d[i] for i in train_pairs['id']]
    match_indexs = [inv_d[i] for i in train_pairs['match_id']]
    del d, inv_d 
    gc.collect()
    
    print("save cols")
    for col in set(tfidf_cols+feat_cols):
        print(col)
        train[[col]].to_feather(f"col_{col}.feather")
    
    del train
    gc.collect()    
    
    train = pd.read_feather("convert_data.feather")
    print("save conv cols")
    for col in set(tfidf_conv_cols):
        print(col)
        train[[col]].to_feather(f"conv_col_{col}.feather")
    
    del train
    gc.collect()        

    
    print("TF-IDF char feats")
    joblib.Parallel(n_jobs=2, timeout=9999999)(joblib.delayed(process_tfidf_char)(col) for col in tfidf_conv_cols)  
    
    
    print("TF-IDF feats")
    joblib.Parallel(n_jobs=2, timeout=9999999)(joblib.delayed(process_tfidf)(col) for col in tfidf_cols)  


    print("Distance feats")
    #train.index = train["id"]

    step = train_pairs.shape[0]//batch_size
    if batch_size*step < train_pairs.shape[0]:
        step += 1
    
    joblib.Parallel(n_jobs=2, timeout=9999999)(joblib.delayed(process_ext)(col) for col in feat_cols)

    del indexs, match_indexs
    gc.collect()    

    print("loading TF-IDF conv feats")
    for col in tfidf_conv_cols:
        print(f"loading {col}" )
        train_pairs[f'conv_{col}_tfid'] =  np.load(f'conv_{col}_tfid.npy')
    train_pairs = reduce_mem_usage (train_pairs, verbose = False)    
    
    print("loading TF-IDF feats")
    for col in tfidf_cols:
        print(f"loading {col}" )
        train_pairs[f'{col}_tfid'] =  np.load(f'{col}_tfid.npy')
    train_pairs = reduce_mem_usage (train_pairs, verbose = False)
    
    print("loading Distance feats")
    for col in feat_cols:

        print(f"loading {col}" )
        
        train_pairs[f"{col}_gesh"] = np.load(f"{col}_gesh.npy")
        train_pairs[f"{col}_levens"] =  np.load(f"{col}_levens.npy")
        train_pairs[f"{col}_jeros"] =  np.load(f"{col}_jeros.npy")
        train_pairs[f"{col}_lcs"] =  np.load(f"{col}_lcs.npy")   
    
        if col in ["name", "address", "categories"]:       
            train_pairs[f"{col}_jaccard_words"] =  np.load(f"{col}_jaccard_words.npy")
            train_pairs[f"{col}_jaccard_chars"] =  np.load(f"{col}_jaccard_chars.npy")               
        train_pairs = reduce_mem_usage (train_pairs, verbose=False)   
            
    train = pd.read_feather("data.feather")            
    train.index = train["id"]         
    print("create len feats")        
    for col in feat_cols:
        print(col)        
        train[f"len_{col}"]=train[col].fillna("").map(lambda x:len(x))
        d=train[f"len_{col}"].to_dict()
        train_pairs[f"{col}_len_diff"] = np.abs( train_pairs["id"].map(d) - train_pairs["match_id"].map(d)  )    


    country_size = train.groupby("country").size()
    train.index=train["id"]
    d=train["country"].to_dict()

    train_pairs["country_id_size"] = train_pairs["id"].map(lambda x: country_size[d[x]])/150_000
    del d
    train_pairs = reduce_mem_usage (train_pairs, verbose = True)

    print("lat/long feats")
    d_train_lat = train["latitude"].to_dict()
    d_train_long = train["longitude"].to_dict()

    train_pairs["lat_id"] = train_pairs["id"].map(d_train_lat)
    train_pairs["lat_match_id"] = train_pairs["match_id"].map(d_train_lat)
    train_pairs["long_id"] = train_pairs["id"].map(d_train_long)
    train_pairs["long_match_id"] = train_pairs["match_id"].map(d_train_long)
    
    del d_train_lat, d_train_long
    train_pairs["euclidean"] = np.sqrt(((train_pairs['lat_id'] - train_pairs['lat_match_id'])**2) + ((train_pairs['long_id'] - train_pairs['long_match_id'])**2))
    train_pairs["diff_lat"] = np.abs(train_pairs['lat_id'] - train_pairs['lat_match_id'])
    train_pairs["diff_long"] = np.abs(train_pairs['long_match_id'] - train_pairs['long_id'])    
    
    train_pairs = reduce_mem_usage (train_pairs, verbose = True)
    gc.collect()
    
    print("size feats")    
   
    for feat in ["name", "address", "url"]:
        print(feat)
        df = train.groupby([feat,"country"]).agg({"id":"count"}).reset_index()
        df = df.query("id >1").reset_index()
        df = df.rename(columns={"id":f"{feat}_country_size"})
        df = train.merge( df, on = feat, how="left" )
        df[f"{feat}_country_size"] =   df[f"{feat}_country_size"].fillna(1)/10_000
        df.index = df["id"]    
        d = df[f"{feat}_country_size"].to_dict()
        del df
        train_pairs[f"{feat}_country_size_id"] = train_pairs["id"].map(d)
        train_pairs[f"{feat}_country_size_match_id"] = train_pairs["match_id"].map(d)
        del d    
        train_pairs = reduce_mem_usage (train_pairs, verbose = True)
        gc.collect()    

        df = train.groupby([feat]).agg({"id":"count"}).reset_index()
        df = df.query("id >1").reset_index()
        df = df.rename(columns={"id":f"{feat}_size"})
        df = train.merge( df, on = feat, how="left" )
        df[f"{feat}_size"] =   df[f"{feat}_size"].fillna(1)/10_000
        df.index = df["id"]    
        d = df[f"{feat}_size"].to_dict()
        del df
        train_pairs[f"{feat}_size_id"] = train_pairs["id"].map(d)
        train_pairs[f"{feat}_size_match_id"] = train_pairs["match_id"].map(d)
        del d    
        train_pairs = reduce_mem_usage (train_pairs, verbose = True)
        gc.collect()    
    
    
    return train_pairs

def main():
    DEBUG = False
    pairs  = pd.read_feather("pairs.feather")
    if DEBUG :
        pairs = pairs [:66333]


    pairs = generate_fe_ext (pairs, batch_size = 50_000)
    pairs.to_feather ("pairs.feather")
    
if __name__ == "__main__":
    main()

Writing generate_fe_ext.py


In [18]:
%%time
!python generate_fe_ext.py

save cols
url
address
zip
categories
phone
state
name
save conv cols
address
state
city
name
TF-IDF char feats
TF-IDF feats
Distance feats
loading TF-IDF conv feats
loading name
loading address
loading city
loading state
loading TF-IDF feats
loading name
loading address
loading categories
loading state
loading zip
loading url
loading phone
loading Distance feats
loading name
loading address
loading categories
loading state
loading zip
loading url
loading phone
create len feats
name
address
categories
state
zip
url
phone
Memory usage after optimization is: 0.07 MB
Decreased by 23.0%
lat/long feats
Memory usage after optimization is: 0.08 MB
Decreased by 17.8%
size feats
name
Memory usage after optimization is: 0.08 MB
Decreased by 5.7%
Memory usage after optimization is: 0.08 MB
Decreased by 5.6%
address
Memory usage after optimization is: 0.08 MB
Decreased by 5.5%
Memory usage after optimization is: 0.08 MB
Decreased by 5.4%


## Inference

In [19]:
%%writefile predict.py
import pandas as pd
import numpy as np
import lightgbm as lgbm
from collections import defaultdict
from itertools import combinations
from tqdm import tqdm
import gc

def reduce_mem_usage(df, verbose=True):

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage before optimization is: {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def predict_xgb (model_name, pairs, cols, batch_size = 64):
    import torch
    from tqdm import tqdm
    if torch.cuda.is_available():
        from cuml import ForestInference
        model = ForestInference.load(filename=model_name,model_type='xgboost')
        step = pairs.shape[0]//batch_size
        if batch_size*step < pairs.shape[0]:
            step += 1
        ret = []
        start = 0
        for i in tqdm(range(step)):
            end = start + batch_size        
            
            X = pairs[start:end][cols].values
            
            pred = model.predict(X)
            
            ret.append(np.asarray(pred).astype(np.float16))
            start += batch_size
        pred = np.concatenate(ret)       
        
        
    else:
        model = lgbm.Booster(model_file=model_name)
        step = pairs.shape[0]//batch_size
        if batch_size*step < pairs.shape[0]:
            step += 1
        ret = []
        start = 0
        for i in tqdm(range(step)):
            end = start + batch_size        
            
            X = pairs[start:end][cols].values
            
            pred = model.predict(X)
            
            ret.append(np.asarray(pred).astype(np.float16))
            start += batch_size
        pred = np.concatenate(ret)               
        
        

        
    #print(pred.shape, type(pred))    
        
    return pred


def predict_lgb (model_name, pairs, cols, batch_size = 64):
    import torch
    from tqdm import tqdm
    if torch.cuda.is_available():
        from cuml import ForestInference
        model = ForestInference.load(filename=model_name,model_type='lightgbm')
        step = pairs.shape[0]//batch_size
        if batch_size*step < pairs.shape[0]:
            step += 1
        ret = []
        start = 0
        for i in tqdm(range(step)):
            end = start + batch_size        
            
            X = pairs[start:end][cols].values
            
            pred = model.predict(X)
            
            ret.append(np.asarray(pred).astype(np.float16))
            start += batch_size
        pred = np.concatenate(ret)       
        
        
    else:
        model = lgbm.Booster(model_file=model_name)
        step = pairs.shape[0]//batch_size
        if batch_size*step < pairs.shape[0]:
            step += 1
        ret = []
        start = 0
        for i in tqdm(range(step)):
            end = start + batch_size        
            
            X = pairs[start:end][cols].values
            
            pred = model.predict(X)
            
            ret.append(np.asarray(pred).astype(np.float16))
            start += batch_size
        pred = np.concatenate(ret)               
        
        

        
    #print(pred.shape, type(pred))    
        
    return pred

def create_submission (data, pairs, threshold ):
    probs = {}

    def get_probs(a,b):
        if a == b:
            return 1.0
        if a < b:
            return probs[(a,b)]
        else:
            return probs[(b,a)]

    
    print ("create_submission")
    pairs = pairs.append ( pairs.rename(columns = {"id":"match_id", "match_id":"id"}) )
    pairs = pairs.groupby(["id","match_id"]).agg({"pred":"mean"}).reset_index()
    pairs = pairs[pairs["pred"] > threshold ]
    
    
    print ("load probs")
    
    for a, b, c in tqdm(zip ( pairs["id"].values, pairs["match_id"].values, pairs["pred"].values ), total=len(pairs)):
        if a < b:
            probs[(a,b)] = c
        else:
            probs[(b,a)] = c
    
    ids = data["id"].unique()
    id_df = pd.DataFrame ({
        "id": ids,
        "match_id":ids
    })

    
    submission = pairs[['id', 'match_id']]
    submission = submission.append ( id_df )

    print ("create list")

    submission = submission.groupby('id')['match_id'].apply(list).reset_index()
    submission["match_id"] = submission["match_id"].map(lambda x: list(set(x)))
    
    sol = defaultdict(list)

    mat = submission[["id","match_id"]].values
    for k in tqdm(range ( mat.shape[0] )):
        id = mat[k,0]
        matches = mat[k,1]

        #first level (a,b) -> (b,a)
        for x in matches:
            if not x in sol[id]:
                sol[id].append(x)
            if not id in sol[x]:
                sol[x].append(id)

        #second level (a,b), (a,c) -> (b,c) ?
        matches.remove(id)
        if len(matches) > 2:
            for x, y in combinations(matches, 2):
                    if y not in sol[x]:
                        sol[x].append(y)
                    if x not in sol[y]:
                        sol[y].append(x)    

    submission["match_id"] = submission["id"].map(sol)
    submission['match_id'] = submission['match_id'].map(lambda x: list(set(x)))
    submission['matches'] = submission['match_id'].apply(lambda x: ' '.join(set(x)))

    return submission [["id", "matches"]]



XGB_MODELS = [
    
    "../input/flm-xgb-99h-v06a-sample-50",
    "../input/flm-xgb-99h-v06b-sample-50",
    "../input/flm-xgb-99h-v06c-sample-50",
    "../input/flm-xgb-99h-v06d-sample-50",    
    
    "../input/flm-xgb-99h-v06a-sample-65-s01",
    "../input/flm-xgb-99h-v06b-sample-65-s01",
    "../input/flm-xgb-99h-v06c-sample-65-s01",
    "../input/flm-xgb-99h-v06d-sample-65-s01",
    
    "../input/flm-xgb-99h-v06a-sample-65-s02",
    "../input/flm-xgb-99h-v06b-sample-65-s02",
    "../input/flm-xgb-99h-v06c-sample-65-s02",
    "../input/flm-xgb-99h-v06d-sample-65-s02",

    
    "../input/flm-xgb-99h-v06a-sample-65",
    "../input/flm-xgb-99h-v06b-sample-65",
    "../input/flm-xgb-99h-v06c-sample-65",
    "../input/flm-xgb-99h-v06d-sample-65",
]


LGB_MODELS = [
]

BATCH_SIZE = 80_000
THRESHOLD = 0.65

SIZE = len(XGB_MODELS + LGB_MODELS)



def main ():
    print("read pairs")
    test_pairs = pd.read_feather("pairs.feather")


    
    
    print("drop id, match_id")  

    test_pairs = test_pairs.drop( ['id', 'match_id'], axis=1)
    cols = [ c for c in  test_pairs.columns if c not in ("label", "index") ]


    print("predict pairs")
    
    
    for n, model in enumerate (XGB_MODELS): 
        p = predict_xgb (f"{model}/model.txt", test_pairs, cols, batch_size = BATCH_SIZE)
        if n == 0:
            pred = p/SIZE
        else:
            pred += p/SIZE

    for n, model in enumerate (LGB_MODELS): 
        p = predict_lgb (f"{model}/model.txt", test_pairs, cols, batch_size = BATCH_SIZE)
        pred += p/SIZE

    
    del test_pairs
    gc.collect()    
    
    
    print("read pairs")
    test_pairs = pd.read_feather("pairs.feather")
    test_pairs = test_pairs [["id", "match_id"]]
    test_pairs["pred"] = pred

    print("read data")
    test = pd.read_feather("data.feather")
    
    submission  = create_submission (test, test_pairs, THRESHOLD)
    
    submission.to_csv("submission.csv", index=False)
    

if __name__ == "__main__":
    main()

Writing predict.py


In [20]:
!python predict.py

read pairs
drop id, match_id
predict pairs
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  8.04it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 357.11it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 337.43it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 301.21it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 29.64it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 356.69it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 346.04it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 361.36it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 175.03it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 269.97it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 353.26it/s]
100%|████████████████████████████████████████████| 1

In [21]:
!rm *.npy *.vec *.py

In [22]:
%%time
submission = pd.read_csv("submission.csv")

submission

CPU times: user 3.46 ms, sys: 929 µs, total: 4.39 ms
Wall time: 4.31 ms


Unnamed: 0,id,matches
0,E_000001272c6c5d,E_000001272c6c5d
1,E_000002eae2a589,E_000002eae2a589
2,E_000007f24ebc95,E_000007f24ebc95
3,E_000008a8ba4f48,E_000008a8ba4f48
4,E_00001d92066153,E_00001d92066153
...,...,...
95,E_000610912e6c85,E_000610912e6c85
96,E_00063a791601cc,E_00063a791601cc
97,E_000641a9bc89cf,E_000641a9bc89cf
98,E_000643f43d5c02,E_000643f43d5c02
