In [1]:
import os
import json 
import lightgbm as lgb
import random

class Config:
    AUTHOR = "kuruton"

    NAME = "USP-" + "Exp126-funnel-transformer-large-kf"
    MODEL_PATH = "funnel-transformer/large"
    DATASET_PATH = [
        "yasufuminakama/cpc-data"
    ]

    COMPETITION = "us-patent-phrase-to-phrase-matching"
    COLAB_PATH = "/content/drive/Shareddrives/USPatent" 
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    api_path = "/content/drive/MyDrive/kaggle.json"

    seed = 42
    num_fold = 4
    trn_fold = [0, 1, 2, 3]
    batch_size = 32
    n_epochs = 10
    max_len = 256

    weight_decay = 2e-5
    lr_decay = 0.97
    beta = (0.9, 0.98)
    lr = 2e-5
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    num_eval = 1

    upload_from_colab = True

In [2]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install -q torch==1.10.0
        ! pip install -q transformers
        ! pip install -q sentencepiece

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg

In [3]:
device = "cuda:0"

In [4]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy 
import itertools
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

In [6]:
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [11]:
# =====================
# CPC Data
# =====================
def get_cpc_texts(cfg):
    contexts = []
    pattern = '[A-Z]\d+'
    for file_name in os.listdir(os.path.join(cfg.DATASET, 'cpc-data/CPCSchemeXML202105')):
        result = re.findall(pattern, file_name)
        if result:
            contexts.append(result)
    contexts = sorted(set(sum(contexts, [])))
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(cfg.DATASET, f'cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt')) as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        result = re.findall(pattern, s)
        cpc_result = result[0].lstrip(pattern)
        for context in [c for c in contexts if c[0] == cpc]:
            pattern = f'{context}\t\t.+'
            result = re.findall(pattern, s)
            results[context] = cpc_result + ". " + result[0].lstrip(pattern)
    for key, val in results.items():
        results[key] = val.lower()
    return results

In [8]:
cfg = setup(Config)

This environment is Google Colab


In [9]:
train_pd = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))

In [13]:
cpc_texts = get_cpc_texts(cfg)
train_pd['context_text'] = train_pd['context'].map(cpc_texts)
train_pd['text'] = train_pd['anchor'] + '[SEP]' + train_pd['target'] + '[SEP]' + train_pd['context_text']


In [14]:

folds = get_kfold(train_pd, 4, 42)

In [15]:
max_len = 256


In [16]:
class TestDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.texts = df['text'].values
        self.tokenizer = tokenizer
        self.ids = df['id'].values
        self.labels = df['score'].values
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.texts[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label, self.ids[index]
    
    def prepare_input(self, text):
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=256,
            padding="max_length",
            return_offsets_mapping=False
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs


In [17]:

class CustomModel(nn.Module):
    def __init__(self, mdl_base):
        super().__init__()
        self.config = AutoConfig.from_pretrained(
            mdl_base,
            output_hidden_states=True
        )
        self.backbone = AutoModel.from_pretrained(
            mdl_base, 
            config=self.config
        )
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, 1),
        )
        
    def forward(self, inputs, labels=None):
        logits = self.backbone(**inputs)["last_hidden_state"]
        logits = logits[:, 0, :]
        preds = self.fc(logits).flatten()
        
        return preds, logits

In [18]:
def collatte(inputs, labels=None):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    if not labels is None:
        inputs = {
            "input_ids" : inputs['input_ids'][:,:mask_len],
            "attention_mask" : inputs['attention_mask'][:,:mask_len],
        }
        labels =  labels[:,:mask_len]
        return inputs, labels, mask_len
                
    else:
        inputs = {
            "input_ids" : inputs['input_ids'][:,:mask_len],
            "attention_mask" : inputs['attention_mask'][:,:mask_len],
        }
        return inputs, mask_len

In [19]:

allmodels = [
    ["USP-Exp041-deberta-v3-large-kf", "microsoft/deberta-v3-large", { "transform_input": False }],
    ["USP-Exp042-deberta-v3-base-kf", "microsoft/deberta-v3-base", { "transform_input": False }],
    ["USP-Exp043-bert-for-patents-kf", "anferico/bert-for-patents", { "transform_input": False }],
    ["USP-Exp044-funnel-transformer-large-kf", "funnel-transformer/large", { "transform_input": False }],
    ["USP-Exp045-funnel-transformer-large-base-kf", "funnel-transformer/large-base", { "transform_input": False }],
    ["USP-Exp049-muppet-roberta-large-kf", "facebook/muppet-roberta-large", { "transform_input": True }],
    ["USP-Exp050-deberta-large-kf", "microsoft/deberta-large", { "transform_input": True }],
    ["USP-Exp055-electra-large-kf", "google/electra-large-discriminator", { "transform_input": True }],
    ["USP-Exp058-funnel-xlarge-kf", "funnel-transformer/xlarge", { "transform_input": True }],
    ["USP-Exp059-funnel-xlarge-base-kf", "funnel-transformer/xlarge-base", { "transform_input": True }]
]

model_id_preds = {}
model_id_logits = {}

for models in allmodels:
    print("MODELS: ", models[0])
    
    model_name = models[0]
    model_base = models[1]
    model_options = models[2]
    
    tokenizer = AutoTokenizer.from_pretrained(model_base)
    
    
    fold_id_preds = {}
    fold_id_logits = {}

    for fold in range(0,4):
        print("    ========== fold ", fold)
        id_preds = {}
        id_logits = {}

        valid_df = train_pd.loc[folds==fold]
        
        if model_options['transform_input']:
            valid_df['text'] = valid_df['anchor'] + tokenizer.sep_token + valid_df['target'] + tokenizer.sep_token + valid_df['context_text']
        
        valid_dataset = TestDataset(valid_df, tokenizer)
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=32,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

        model = CustomModel(model_base)
        model = model.to(device)

        model_weight = f"{model_name}/model/fold{fold}.pth"
        model.load_state_dict(torch.load(model_weight))

        model.eval()
        with torch.no_grad():
            for (inputs, labels, ids) in tqdm(valid_loader, total=len(valid_loader)):
                inputs, max_len = collatte(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(device)
                output_preds, logits = model(inputs)
                output_preds = output_preds.sigmoid().detach().cpu()
                logits = logits.detach().cpu()

                for oi, o in enumerate(output_preds):
                    thisid = ids[oi]
                    if thisid not in id_preds:
                        id_preds[thisid] = []
                    id_preds[thisid].append(o.item())

                for li, l in enumerate(logits):
                    thisid = ids[li]
                    if thisid not in id_logits:
                        id_logits[thisid] = []
                    id_logits[thisid].append(l.tolist())
                

                    
        fold_id_preds[fold] = id_preds
        fold_id_logits[fold] = id_logits

#     print(" --- WRITING ", f"./preds_{model_name}.json")
#     with open(f"./preds_{model_name}.json", "w") as file:
#         json.dump(fold_id_preds, file)
#     with open(f"./logits_{model_name}.json", "w") as file:
#         json.dump(fold_id_logits, file)

    model_id_preds[model_name] = fold_id_preds
    model_id_logits[model_name] = fold_id_logits

    

print("DONE")

MODELS:  USP-Exp041-deberta-v3-large-kf


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

In [None]:

id_labels = {}

valid_dataset = TestDataset(train_pd, tokenizer)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    drop_last=False
)

with torch.no_grad():
    for (inputs, labels, ids) in tqdm(valid_loader, total=len(valid_loader)):
        for ii, i in enumerate(ids):
            id_labels[i] = labels[ii].item()

  0%|          | 0/1140 [00:00<?, ?it/s]

In [None]:


# model_id_preds
# model_id_logits

for modelinfo in allmodels:
    print("")
    print("========= ", modelinfo[0])
    for foldi in range(0,4):
        preds = model_id_preds[modelinfo[0]][foldi]
        errors = []
        for key in list(preds.keys()):
            errors.append(abs(preds[key][0] - id_labels[key]))

        print(foldi, " = ", sum(errors) / len(errors))



In [None]:
import scipy

In [None]:

# foldi = 0

datapreds = {}
datalogits = {}

for foldi in range(0,4):
    shuffeled_keys = list(model_id_logits[modelinfo[0]][foldi].keys())

    for modelinfo in allmodels:
        for key in shuffeled_keys:
            if key not in datapreds:
                datapreds[key] = []
                datalogits[key] = []
            datapreds[key].append(model_id_preds[modelinfo[0]][foldi][key][0])
            datalogits[key].append(model_id_logits[modelinfo[0]][foldi][key][0])

        
datapoints = []
datalabels = []


shuffeled_keys = list(datapreds.keys())
random.shuffle(shuffeled_keys)

for key in shuffeled_keys:
    datalabels.append(id_labels[key])
    datapoints.append(
                    100 * datapreds[key] +
                      datalogits[key][0] +
                      datalogits[key][1] +
                      datalogits[key][2] +
                      datalogits[key][3] +
                      datalogits[key][4] +
                      datalogits[key][5] +
                      datalogits[key][6] +
                      datalogits[key][7] +
                      datalogits[key][8] +
                      datalogits[key][9]
                     )
print("DONE")

DONE


In [None]:
split = len(datapoints) - 100
train_datapoints = datapoints[:split]
train_labels = datalabels[:split]
test_datapoints = datapoints[split:]
test_labels = datalabels[split:]


# split = 100
# train_datapoints = datapoints[split:]
# train_labels = datalabels[split:]
# test_datapoints = datapoints[:split]
# test_labels = datalabels[:split]


In [None]:
train_data = lgb.Dataset(np.array(train_datapoints), label=np.array(train_labels))
val_data = lgb.Dataset(np.array(test_datapoints), label=np.array(test_labels))


In [None]:
param = {'num_leaves': 100, 'objective': 'regression',
         'max_bin': 100,
#          'linear_tree': True,
#          'min_data_in_leaf': 200,
#          'bagging_fraction': 0.5
        }
param['metric'] = 'rmse'
num_round = 50
bst = lgb.train(param, train_data, num_round, 
                valid_sets=train_data)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1098400
[LightGBM] [Info] Number of data points in the train set: 36373, number of used features: 10984
[LightGBM] [Info] Start training from score 0.362061
[1]	training's rmse: 0.238119
[2]	training's rmse: 0.220274
[3]	training's rmse: 0.204562
[4]	training's rmse: 0.190754
[5]	training's rmse: 0.178691
[6]	training's rmse: 0.168177
[7]	training's rmse: 0.159011
[8]	training's rmse: 0.151055
[9]	training's rmse: 0.144114
[10]	training's rmse: 0.13813
[11]	training's rmse: 0.132982
[12]	training's rmse: 0.128517
[13]	training's rmse: 0.124641
[14]	training's rmse: 0.121262
[15]	training's rmse: 0.118327
[16]	training's rmse: 0.115761
[17]	training's rmse: 0.113515
[18]	training's rmse: 0.111548
[19]	training's rmse: 0.109824
[20]	training's rmse: 0.108262
[21]	training's rmse: 0.106876
[22]	training's rmse: 0.105632
[23]	training's rmse: 0.104464
[24]	training's rmse: 0.103398
[25]	training's rmse: 

In [None]:
y_pred = bst.predict(np.array(test_datapoints))

In [None]:
np.mean(abs(np.array(y_pred) - np.array(test_labels)))

0.08060451318063525

In [None]:
scipy.stats.pearsonr(y_pred, test_labels)[0]

0.8920839049757557

In [None]:
errors = []

for key in datapreds.keys():
    avg_pred = sum(datapreds[key]) / len(datapreds[key])
    errors.append(abs(avg_pred - id_labels[key]))
    
print(sum(errors) / len(errors))


0.07901135776279343


In [None]:

# bst.save_model('gbm_10_preds100_embeds.txt')


In [None]:
   
weights =[
    0.19498323, 0.04974783, 0.22670046, 0.04771472, 0.08034923,
    0.0712817 , 0.07163315, 0.11906611, 0.03680013, 0.10172343,
]

avgpreds = []

for td in test_datapoints:
    avgpred = td[0] * weights[0] + \
            td[1] * weights[1] + \
            td[2] * weights[2] + \
            td[3] * weights[3] + \
            td[4] * weights[4] + \
            td[5] * weights[5] + \
            td[6] * weights[6] + \
            td[7] * weights[7] + \
            td[8] * weights[8] + \
            td[9] * weights[9]
    
#     avgpred = sum(td) / len(td)
    avgpreds.append(avgpred)

print(scipy.stats.pearsonr(avgpreds, test_labels)[0])

0.8881446017017339


In [None]:
# 10preds (try 3) pearson for 100 test points:
# avg: 0.9017784793231405
# pred: 0.906418451162495

# preds10_embeds:
# avg: 0.9296647878665482
# tree: 0.9349721235066913
# same but diff test set: 0.8885606281148487 to 0.8920839049757557
# with weights, even lower: 0.8881446017017339
