In [1]:
!nvidia-smi

Wed Jul 13 12:14:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      On   | 00000000:01:00.0 Off |                   On |
| N/A   33C    P0    45W / 275W |                  N/A |     N/A      Default |
|                               |                      |              Enabled |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| MIG devices:                                                                |
+------

In [2]:
# 활용 gpu idx 세팅
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# import package

In [7]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# model import
import torch
# from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('./input/AI4Code')
# data_dir = Path(f'./data/ai4code/')
print(data_dir)

input/AI4Code


# 필요 함수 정의

In [4]:
# Additional code cells

#preprocess.py -11
def clean_code(cell):
    return str(cell).replace("\\n", "\n")


def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else: #code cell 개수가 지정된 n 보다 넘어가면
        results = []
        step = len(cells) / n #ex) 25/20 = 1.25 씩 뛰어 넘으면서 셀을 추가
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results # 마지막 셀 꼭 추가
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")): # 각 노트북에 대한 정보 저장
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20)
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [5]:
#metric.py
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)
            .rename_axis('cell_id')
    )

In [6]:
#model.py
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769, 1)
        
    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = self.top(torch.cat((x[:, 0, :], fts),1))
        return x


#dataset.py
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts

    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=True,
            max_length=24,
            padding="max_length",
            truncation=True
        )
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_md"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]

# Valid 시작!

In [8]:
# Train 데이터(notebook)의 order 순서가 적혀있는 데이터(train_orders.csv) import
# 여기서의 id는 notebook 단위

#preprocess.py -2
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # cell_id가 텍스트로 붙어있음, 띄어쓰기 단위로 끊어서 리스트화

print(df_orders.shape)
df_orders.head(2)



  df_orders = pd.read_csv(


(139256,)


id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
Name: cell_order, dtype: object

In [9]:
# 학습데이터 경로
experiment_data_dir = f'./ai4code-999/'

In [10]:
train_df_mark = pd.read_csv(f'{experiment_data_dir}/train_mark.csv')#.drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open(f'{experiment_data_dir}/train_fts.json'))
train_df = pd.read_csv(f'{experiment_data_dir}/train.csv')
val_df_mark = pd.read_csv(f'{experiment_data_dir}/val_mark.csv')#.drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open(f'{experiment_data_dir}/val_fts.json'))
val_df = pd.read_csv(f'{experiment_data_dir}/val.csv')

In [11]:
#이후  0713
print(train_df.shape, val_df.shape)

(6364816, 8) (5834, 8)


In [12]:
#이후  0713
print(train_df_mark.shape, val_df_mark.shape)

(2164091, 8) (1977, 8)


In [30]:
# tokenizer dir 지정
tokenizer_path = f'./model/finetuned/20220703/outputs'

In [24]:
md_max_len = 64
total_max_len = 512
batch_size = 8
# accumulation_steps = 4
# epochs = 4
n_workers = 8

train_ds = MarkdownDataset(train_df_mark, model_name_or_path=tokenizer_path, md_max_len=md_max_len,
                           total_max_len=total_max_len, fts=train_fts)
val_ds = MarkdownDataset(val_df_mark, model_name_or_path=tokenizer_path, md_max_len=md_max_len,
                         total_max_len=total_max_len, fts=val_fts)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
                        pin_memory=False, drop_last=False)

In [6]:
def read_data(data):
    return tuple(d.to(device) for d in data[:-1]), data[-1].to(device)

def validate(model, val_loader):
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(labels), np.concatenate(preds)

In [13]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name())
print(device)

True
1
Graphics Device
cuda


In [16]:
# import finetuned model

model_path = f'./model/pretrain/codebert-base' # origin model path
ckpt_path = os.path.join(f'./model/finetuned/20220703/outputs', 'pytorch_model_3.bin') # 체크포인트 path

print(model_path, '\n', ckpt_path)
model = MarkdownModel(model_path)

# 불러온 모델의 weight를 finetuned된 모델로 update해줌
model.load_state_dict(torch.load(ckpt_path))
model.eval()

./model/pretrain/codebert-base 
 ./model/finetuned/20220703/outputs/pytorch_model_3.bin


MarkdownModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

In [17]:
print(device)
model = model.to(device)

cuda


In [18]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [39]:
# Valid Set 결과 추출
# codeBERT & Valid(167,313건) 기준 약 16분 걸림
y_label, y_pred = validate(model, val_loader)

100%|██████████| 20915/20915 [16:39<00:00, 20.92it/s]


In [40]:
val_df_mark.shape, y_label.shape, y_pred.shape

((167313, 8), (167313,), (167313,))

In [41]:
# Valid 셋에 label, pred 추가
val_df_mark.loc[val_df_mark["cell_type"] == "markdown", "label"] = y_label
val_df_mark.loc[val_df_mark["cell_type"] == "markdown", "pred"] = y_pred

- encode 되어있는 ds 변환
    - 해당 작업이 조금 오래걸림

In [129]:
# incode Data append
tokenizer_forex = AutoTokenizer.from_pretrained(tokenizer_path)

decode_lst = []

for i in tqdm(range(len(val_ds))):
    row = val_ds.__getitem__(i)
    _encode = row[0]
    _decode = tokenizer_forex.decode(_encode).replace('<pad>', " ")
#     print(_encode)
#     print(_decode)
#     print(row)
    decode_lst.append(_decode)
len(decode_lst)

100%|██████████| 167313/167313 [21:05<00:00, 132.21it/s]


167313

In [134]:
val_df_mark.loc[:,'input_text'] = decode_lst
val_df_mark.head(2)

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred,input_text
0,000597ac4c6700,a3e03a94,markdown,"Always remember to delete your -now- useless data, will free up some memory.",30,81ab937c,,0.666667,0.666667,0.741699,"<s>Always remember to delete your -now- useless data, will free up some memory.</s> ..."
1,000597ac4c6700,8e68c8ad,markdown,And it's all encoded. We only encoded these columns to get rid of the string data type but **if you want to train a ...,23,81ab937c,,0.511111,0.511111,0.563965,<s>And it's all encoded. We only encoded these columns to get rid of the string data type but **if you want to train...


In [136]:
# Valid 셋에 label, pred 추가
use_col = ['cell_id', 'label', 'pred', 'input_text']

new_val_df = val_df.merge(val_df_mark[use_col]
                         ,how = 'left'
                         ,on = 'cell_id').copy()

print(val_df.shape, new_val_df.shape)
display(new_val_df.head(2))
display(new_val_df.tail(2))

(490240, 8) (490242, 11)


Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred,input_text
0,000597ac4c6700,ede4241f,code,import pandas as pd\nimport numpy as np,1,81ab937c,,0.022222,,,
1,000597ac4c6700,36b989a3,code,"## Function to reduce the DF size\ndef reduce_mem_usage(df, verbose=True):\n numerics = ['int16', 'int32', 'int64...",3,81ab937c,,0.066667,,,


Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred,input_text
490240,fff6c12f17ac92,7912edbd,code,"sample_df.to_csv('submission.csv',index=False)",25,41d132dc,,0.925926,,,
490241,fff6c12f17ac92,19a20bdc,markdown,"**If you like it , please upvote :)**",26,41d132dc,,0.962963,0.962963,1.036133,"<s>**If you like it, please upvote :)**</s> <s># Ignore the warni..."


In [138]:
# code cell label 추가
new_val_df.label = np.where(new_val_df.label.isna(), new_val_df.pct_rank, new_val_df.label)
new_val_df.pred = np.where(new_val_df.pred.isna(), new_val_df.pct_rank, new_val_df.pred)

new_val_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred,input_text
0,000597ac4c6700,ede4241f,code,import pandas as pd\nimport numpy as np,1,81ab937c,,0.022222,0.022222,0.022222,
1,000597ac4c6700,36b989a3,code,"## Function to reduce the DF size\ndef reduce_mem_usage(df, verbose=True):\n numerics = ['int16', 'int32', 'int64...",3,81ab937c,,0.066667,0.066667,0.066667,
2,000597ac4c6700,b91fc1a4,code,"train = pd.read_csv(""/kaggle/input/bdg2-class-competition/train.csv"")\ntest = pd.read_csv(""/kaggle/input/bdg2-class-...",5,81ab937c,,0.111111,0.111111,0.111111,
3,000597ac4c6700,97c1bb21,code,train = reduce_mem_usage(train)\ntest = reduce_mem_usage(test)\nwtrain = reduce_mem_usage(wtrain)\nwtest = reduce_me...,7,81ab937c,,0.155556,0.155556,0.155556,
4,000597ac4c6700,148274fd,code,metadata.info(),8,81ab937c,,0.177778,0.177778,0.177778,
...,...,...,...,...,...,...,...,...,...,...,...
490237,fff6c12f17ac92,bccd1fcb,code,"learn.data.add_test(ImageList.from_df(sample_df,'../input/aptos2019-blindness-detection',folder='test_images',suffix...",22,41d132dc,,0.814815,0.814815,0.814815,
490238,fff6c12f17ac92,323f0b85,code,"preds,y = learn.get_preds(DatasetType.Test)",23,41d132dc,,0.851852,0.851852,0.851852,
490239,fff6c12f17ac92,ac1e1d7f,code,sample_df.diagnosis = preds.argmax(1)\nsample_df.head(),24,41d132dc,,0.888889,0.888889,0.888889,
490240,fff6c12f17ac92,7912edbd,code,"sample_df.to_csv('submission.csv',index=False)",25,41d132dc,,0.925926,0.925926,0.925926,


## 결과물 print

In [139]:
#metric.py
from bisect import bisect

# row 단위로 kendall tau print
def kendall_tau_byrow(gt, pred):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    
    ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
    total_inversions += count_inversions(ranks)
    n = len(gt)
    total_2max += n * (n - 1)
    return abs(1 - 4 * total_inversions / total_2max)


In [48]:
# Valid set 성능 확인
tmp_val_df = val_df.copy()
tmp_val_df["pred"] = tmp_val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True) #그룹별 순위 구하기, 큰 순서대로 정렬
tmp_val_df.loc[tmp_val_df["cell_type"] == "markdown", "pred"] = y_pred              #마크다운엔 prediction 값
y_dummy = tmp_val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
print("Valid score", kendall_tau(df_orders.loc[y_dummy.index], y_dummy)) # label, pred 순으로 insert
del tmp_val_df

Valid score 0.8489752778268465


In [58]:
# id별로 성능이 낮은건 확인

label_byid = df_orders.loc[y_dummy.index].reset_index(drop=False)
pred_byid = y_dummy.reset_index(drop=False)
label_byid.columns = ['id', 'label_order']
pred_byid.columns = ['id', 'pred_order']

display(label_byid.shape, label_byid.head(2)
        , pred_byid.shape, pred_byid.head(2))

total_result = label_byid.merge(pred_byid
                               ,how = 'left'
                               ,on = 'id')
print('\n\n',total_result.shape)
total_result.head(2)

(10520, 2)

Unnamed: 0,id,label_order
0,000597ac4c6700,"[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, 9ed48..."
1,000a4651cce8f4,"[2844b16f, 883db462, 9ae35818, f1dee85b, 00e71412, 3ec4ab79, 7bfe7649, 5bcfa9ad, be38feb5, f417baab, cf43b3ce, edbdd..."


(10520, 2)

Unnamed: 0,id,pred_order
0,000597ac4c6700,"[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, a791b..."
1,000a4651cce8f4,"[2844b16f, 883db462, 9ae35818, f1dee85b, 3ec4ab79, 7bfe7649, be38feb5, 00e71412, 5bcfa9ad, cf43b3ce, f417baab, edbdd..."




 (10520, 3)


Unnamed: 0,id,label_order,pred_order
0,000597ac4c6700,"[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, 9ed48...","[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, a791b..."
1,000a4651cce8f4,"[2844b16f, 883db462, 9ae35818, f1dee85b, 00e71412, 3ec4ab79, 7bfe7649, 5bcfa9ad, be38feb5, f417baab, cf43b3ce, edbdd...","[2844b16f, 883db462, 9ae35818, f1dee85b, 3ec4ab79, 7bfe7649, be38feb5, 00e71412, 5bcfa9ad, cf43b3ce, f417baab, edbdd..."


In [60]:
# 결과물에 kendall_score 추가
total_result.loc[:,'kendall_score'] = [kendall_tau_byrow(n[0], n[1]) for n in total_result[['label_order', 'pred_order']].values]
total_result.head(2)

Unnamed: 0,id,label_order,pred_order,kendall_score
0,000597ac4c6700,"[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, 9ed48...","[ba54a747, ede4241f, fb42ece2, 36b989a3, 2fa559cb, b91fc1a4, 649083d4, 97c1bb21, 148274fd, 7f40f579, 92a7ccd5, a791b...",0.957576
1,000a4651cce8f4,"[2844b16f, 883db462, 9ae35818, f1dee85b, 00e71412, 3ec4ab79, 7bfe7649, 5bcfa9ad, be38feb5, f417baab, cf43b3ce, edbdd...","[2844b16f, 883db462, 9ae35818, f1dee85b, 3ec4ab79, 7bfe7649, be38feb5, 00e71412, 5bcfa9ad, cf43b3ce, f417baab, edbdd...",0.908832


In [61]:
# kendall tau score 분포 확인
total_result.kendall_score.describe()

count    10520.000000
mean         0.881397
std          0.105744
min          0.000000
25%          0.838235
50%          0.907179
75%          0.955556
max          1.000000
Name: kendall_score, dtype: float64

In [63]:
tmp_tb = new_val_df[new_val_df.id == '05f96edae0f024'].copy()
tmp_tb = tmp_tb.set_index('cell_id')

# tmp_tb.loc[['9d98338d', 'ee86eac5']]

In [None]:
import seaborn as sns
sns.distplot(total_result.kendall_score)

### kendall tau score가 낮은 id sampleing

In [None]:
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

print(color.YELLOW + 'Hello World !' + color.END)

In [104]:
low_score_sample = total_result.sort_values('kendall_score').iloc[:10,:]
display(low_score_sample)

# 실제 텍스트 확인
def print_result(cell_tb, notebook_id, label_order, pred_order):
    # cell tb 중 확인할 notebook_id sort
    tmp_tb = cell_tb[cell_tb.id == notebook_id].copy()
    tmp_tb = tmp_tb.set_index('cell_id')

    print(f'>>> ---------- Notebook id: {notebook_id} ----------')

#     # label 순서 대로 cell print
#     print(f'>>> label_order result')
#     display(tmp_tb.loc[label_order])
#     for t in tmp_tb.loc[label_order].source.tolist():
#         display(t)
    
    # pred order 대로 cell print
    print(f'>>> pred_order_col result')
    display(tmp_tb.loc[pred_order])
#     for t in tmp_tb.loc[pred_order].source.tolist():
#         display(t)
    
for row in low_score_sample.iterrows():
    row = row[1]
    print(row)
    n_id = row.id
    label_order = row.label_order
    pred_order = row.pred_order
    print_result(new_val_df, n_id, label_order, pred_order)
    print('\n\n\n')


Unnamed: 0,id,label_order,pred_order,kendall_score
2643,3fd66fe0a96d18,"[8538920c, c4d2907b, c142a2ee, 136f872b]","[c4d2907b, 136f872b, 8538920c, c142a2ee]",0.0
3526,544655921beef7,"[048e2d9b, bef56024, ea556589, 20d3fe90, 69294a5a, 4b60c22a, 01f6f76b, 06d7f1db, 42485723, 9b8b36c4, 0cf7ee49, 625f2...","[20d3fe90, db4baf15, ea556589, 3cf1066f, fdb913ac, c90e3bd6, bc18d466, ba162cd7, 76a0e541, 23597b53, bf1dc93b, 4b60c...",0.025309
1996,2fa3c4c997e1d1,"[b299f9ff, 0f8c59a6, 0cd37fbf, 41cac302, 2770c4e2, f8c8c680, 39d7665b, 3919bd9a, 485ecbae, 2c527861, 8c1c0e0a, fc058...","[0cd37fbf, e0769a65, da72def6, 79b32d5e, 9432920a, 44280a92, f4a41dbe, e752cb02, cb699fd2, b299f9ff, 0f8c59a6, f54b8...",0.086542
8038,c27850f9bbcac4,"[92e0a22c, 6574991c, aa24cfb2, 1efd2c07, f87c1725, e7529972, db582ff6, f34383a6, d1866a2a, 2cec003c, 72b3c352, 5a932...","[92e0a22c, 1c7d3327, 5a932c2f, f87c1725, 6574991c, aa24cfb2, 49831d80, db582ff6, 2cec003c, d1866a2a, d4d1bfee, 72b3c...",0.116667
3963,5e97933310f217,"[46fa93e3, 8515c21d, b0ea29f4, d72d0507, 553f5b96, eb2b787d, c926405c, dbbfaf24, 2708c4b8, 96b20353, 6934a531, 73da1...","[1b79d054, 2d7d9227, 9f5e2233, 46fa93e3, b0ea29f4, c472f10d, 8515c21d, d72d0507, 9c617cba, dbbfaf24, f907fbe0, 2708c...",0.131183
3635,56a364c9726435,"[d1824dcf, 232912f8, ddfe5a63, 71945f79, 7cd943a6, 2b918080, c2978663]","[2b918080, 232912f8, ddfe5a63, 71945f79, 7cd943a6, d1824dcf, c2978663]",0.142857
8650,d127172adeacea,"[eebc3086, 722cf377, b37481da, 712782ed, 1afd3599, 0315d35b, e92fec1b, ceb68917, b53f604a, 01514c28, 5ad53400, c2a6e...","[eebc3086, 712782ed, 5ad53400, 722cf377, b53f604a, e92fec1b, c2a6e274, 01514c28, 1afd3599, ceb68917, 0315d35b, 56f0a...",0.179487
3093,4a99d2ffd04cd6,"[19bb0372, 782c7b9e, 5b019082, 067348b8, 531994ce]","[531994ce, 19bb0372, 782c7b9e, 5b019082, 067348b8]",0.2
1353,206aadfedae1ad,"[46946ca5, 022d6bd1, 3a917eba, 59da069c, 4d9f8b9f, eb012803, d4470b3c, 6296022f, 2ba8a5bf, 86943f9e, 9927ec8f, 37578...","[46946ca5, 59da069c, 77022ccd, a2422444, 7d3549e7, 55790065, 1e7ecfb4, eb012803, 3ec5ab6f, 3757841d, 1e6707b1, 022d6...",0.219873
984,17845073a429f3,"[502033a4, 794feada, a42f4b7e, b9d3bfc2, a706930b, 1441e3fc, ece1fb3b, d4e523f0, c6b9d271, 87794ae1, bc9d7947, 1e2a3...","[502033a4, d50e67bb, a42f4b7e, 205b4129, 7cc1fd2a, 68bcf209, 794feada, b9d3bfc2, a706930b, 1441e3fc, a6f2a408, d4e52...",0.224638


id                                         3fd66fe0a96d18
label_order      [8538920c, c4d2907b, c142a2ee, 136f872b]
pred_order       [c4d2907b, 136f872b, 8538920c, c142a2ee]
kendall_score                                         0.0
Name: 2643, dtype: object
>>> ---------- Notebook id: 3fd66fe0a96d18 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
c4d2907b,3fd66fe0a96d18,markdown,# Title,1,0e15379d,,0.25,0.25,0.174438
136f872b,3fd66fe0a96d18,markdown,title,3,0e15379d,,0.75,0.75,0.345459
8538920c,3fd66fe0a96d18,code,"print(""hello world"")",0,0e15379d,,0.0,0.0,0.0
c142a2ee,3fd66fe0a96d18,code,title,2,0e15379d,,0.5,0.5,0.5






id                                                                                                                        544655921beef7
label_order      [048e2d9b, bef56024, ea556589, 20d3fe90, 69294a5a, 4b60c22a, 01f6f76b, 06d7f1db, 42485723, 9b8b36c4, 0cf7ee49, 625f2...
pred_order       [20d3fe90, db4baf15, ea556589, 3cf1066f, fdb913ac, c90e3bd6, bc18d466, ba162cd7, 76a0e541, 23597b53, bf1dc93b, 4b60c...
kendall_score                                                                                                                   0.025309
Name: 3526, dtype: object
>>> ---------- Notebook id: 544655921beef7 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20d3fe90,544655921beef7,markdown,# Explain inner working on logistic regression\nhttps://satishgunjal.com/binary_lr/,3,035e1579,,0.037037,0.037037,-0.003082
db4baf15,544655921beef7,markdown,"# Difference between univariate, bivariate and multivariate analysis?\n\n* Univariate Analysis\n\n![Univariate_Analy...",41,035e1579,,0.506173,0.506173,-0.002829
ea556589,544655921beef7,markdown,# Explain inner working on linear regression\nhttps://satishgunjal.com/univariate_lr/,2,035e1579,,0.024691,0.024691,-0.002739
3cf1066f,544655921beef7,markdown,"# Explain dimensionality reduction, and its benefits?\n* Dimensionality reduction referes to the process of converti...",54,035e1579,,0.666667,0.666667,-0.001030
fdb913ac,544655921beef7,markdown,"# Explain Principal Componenet Analysis?\n* Principal Component Analysis (PCA) is dimensionality reduction method, t...",64,035e1579,,0.790123,0.790123,-0.000325
...,...,...,...,...,...,...,...,...,...
0c99587e,544655921beef7,markdown,# Explain the scenario where both false positive and false negative are equally important\n* In banking industry giv...,77,035e1579,,0.950617,0.950617,0.750000
048e2d9b,544655921beef7,markdown,"# Important Tips\n* Datascience interview questions can include questions from statistics, math, data visualization,...",0,035e1579,,0.000000,0.000000,0.831055
9b8b36c4,544655921beef7,markdown,# What is more important model accuracy or model performance?\n* Short answer is: Model accuracy matters the most! i...,9,035e1579,,0.111111,0.111111,0.833496
3c5626e8,544655921beef7,markdown,# References\n* https://www.youtube.com/watch?v=k6QWYwOvJs0&t=1149s\n* https://towardsdatascience.com/taking-the-con...,80,035e1579,,0.987654,0.987654,0.941895






id                                                                                                                        2fa3c4c997e1d1
label_order      [b299f9ff, 0f8c59a6, 0cd37fbf, 41cac302, 2770c4e2, f8c8c680, 39d7665b, 3919bd9a, 485ecbae, 2c527861, 8c1c0e0a, fc058...
pred_order       [0cd37fbf, e0769a65, da72def6, 79b32d5e, 9432920a, 44280a92, f4a41dbe, e752cb02, cb699fd2, b299f9ff, 0f8c59a6, f54b8...
kendall_score                                                                                                                   0.086542
Name: 1996, dtype: object
>>> ---------- Notebook id: 2fa3c4c997e1d1 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0cd37fbf,2fa3c4c997e1d1,markdown,Keras 전처리 층 API는 개발자들이 Keras 고유의 입력 처리 파이프라인을 만들 수 있게 해줍니다. 이 입력 처리 파이프라인들은 Keras가 아닌 작업 흐름 안에서 독립적인 사전 처리 코드로써 사용되고...,2,074f30fd,,0.028571,0.028571,0.008728
e0769a65,2fa3c4c997e1d1,markdown,[바닥에서 부터 이미지 분류](https://keras.io/examples/vision/image_classification_from_scratch/) 예제에서 비슷한 설정 활동을 볼 수 있습니다.,45,074f30fd,,0.642857,0.642857,0.032257
da72def6,2fa3c4c997e1d1,markdown,[바닥에서 부터 텍스트 분류](https://keras.io/examples/nlp/text_classification_from_scratch/) 예제에서 `Embedding` 방식과 결합된 `TextVect...,62,074f30fd,,0.885714,0.885714,0.036621
79b32d5e,2fa3c4c997e1d1,code,import numpy as np\nimport tensorflow as tf\nfrom tensorflow.keras.layers.experimental import preprocessing\n\ndata ...,21,074f30fd,,0.300000,0.300000,0.300000
9432920a,2fa3c4c997e1d1,markdown,## 추론 기간에 모델 안에서 전처리 수행의 장점,36,074f30fd,,0.514286,0.514286,0.109009
...,...,...,...,...,...,...,...,...,...
beb20d89,2fa3c4c997e1d1,markdown,`adapt()` 메소드를 통해 학습 데이터에 전처리 층을 노출시킴으로써 상태를 설정할 수 있습니다:,20,074f30fd,,0.285714,0.285714,0.734863
df40cc17,2fa3c4c997e1d1,markdown,* `RandomCrop` 층\n* `RandomFlip` 층\n* `RandomTranslation` 층\n* `RandomRotation` 층\n* `RandomZoom` 층\n* `RandomHeight...,15,074f30fd,,0.214286,0.214286,0.798828
1c42dbd1,2fa3c4c997e1d1,code,"# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다\ndata = tf.constant(\n [\n ""The Brain is wider than the Sky"",\n ...",61,074f30fd,,0.871429,0.871429,0.871429
d52944d9,2fa3c4c997e1d1,code,"# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다\ndata = tf.constant(\n [\n ""The Brain is wider than the Sky"",\n ...",66,074f30fd,,0.942857,0.942857,0.942857






id                                                                                                                        c27850f9bbcac4
label_order      [92e0a22c, 6574991c, aa24cfb2, 1efd2c07, f87c1725, e7529972, db582ff6, f34383a6, d1866a2a, 2cec003c, 72b3c352, 5a932...
pred_order       [92e0a22c, 1c7d3327, 5a932c2f, f87c1725, 6574991c, aa24cfb2, 49831d80, db582ff6, 2cec003c, d1866a2a, d4d1bfee, 72b3c...
kendall_score                                                                                                                   0.116667
Name: 8038, dtype: object
>>> ---------- Notebook id: c27850f9bbcac4 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
92e0a22c,c27850f9bbcac4,markdown,# Mathematics of Support Vector Machines,0,541f28d7,,0.0,0.0,-0.005901
1c7d3327,c27850f9bbcac4,markdown,# Kernel Trick,12,541f28d7,,0.75,0.75,-0.005646
5a932c2f,c27850f9bbcac4,markdown,# Lagrangian Multipliers\n\nThe general idea is to transform a constrained optimization objective into an unconstrai...,11,541f28d7,,0.6875,0.6875,0.014542
f87c1725,c27850f9bbcac4,markdown,"## Lets look at mathematical expression defining the conditions\n\nThis notebook is inspired by MIT course on SVM, t...",4,541f28d7,,0.25,0.25,0.064209
6574991c,c27850f9bbcac4,markdown,## Topics covered in this notebook\n* The SVC Algorithm\n* The Lagrange Multiplier\n* Kernel Trick,1,541f28d7,,0.0625,0.0625,0.125
aa24cfb2,c27850f9bbcac4,markdown,# The Algorithm\n\nI will use very basic level examples starting with 2D space to explain this:\n\nLets say that you...,2,541f28d7,,0.125,0.125,0.155762
49831d80,c27850f9bbcac4,markdown,"## Types of kernel\n\n$K(xi,xj) = (xi.xj +c)^d$- This is example of polynomial kernel where dot product of original ...",15,541f28d7,,0.9375,0.9375,0.247559
db582ff6,c27850f9bbcac4,markdown,"The shortest distance between a point a plane is given by the orthogonal projection of a line into another line, i.e...",6,541f28d7,,0.375,0.375,0.356201
2cec003c,c27850f9bbcac4,markdown,![svm4.PNG](attachment:svm4.PNG),9,541f28d7,,0.5625,0.5625,0.373047
d1866a2a,c27850f9bbcac4,markdown,### Changing the one reference point to origin,8,541f28d7,,0.5,0.5,0.430176






id                                                                                                                        5e97933310f217
label_order      [46fa93e3, 8515c21d, b0ea29f4, d72d0507, 553f5b96, eb2b787d, c926405c, dbbfaf24, 2708c4b8, 96b20353, 6934a531, 73da1...
pred_order       [1b79d054, 2d7d9227, 9f5e2233, 46fa93e3, b0ea29f4, c472f10d, 8515c21d, d72d0507, 9c617cba, dbbfaf24, f907fbe0, 2708c...
kendall_score                                                                                                                   0.131183
Name: 3963, dtype: object
>>> ---------- Notebook id: 5e97933310f217 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1b79d054,5e97933310f217,markdown,"# BIAS in AI\n\n* Each neuron has bias.\n* The flexibility of the model will be rised. \n\n<a ><img src=""https://i.i...",19,8bb61997,,0.612903,0.612903,-0.002829
2d7d9227,5e97933310f217,markdown,# What is Kernel?\n\nThe Kernel helps us separate data with a non-linear decision boundary using a linear classifier...,28,8bb61997,,0.903226,0.903226,-0.001178
9f5e2233,5e97933310f217,markdown,# Vanishing Gradient \nCHeck the code example : https://cs224d.stanford.edu/notebooks/vanishing_grad_example.html\nW...,18,8bb61997,,0.580645,0.580645,0.011536
46fa93e3,5e97933310f217,markdown,"<div class=""list-group"" id=""list-tab"" role=""tablist"">\n <h3 class=""list-group-item list-group-item-action active"" d...",0,8bb61997,,0.0,0.0,0.028564
b0ea29f4,5e97933310f217,markdown,"<a id =""super""></a> \n\n# **1. Supervised Learning**\n\nYou give input data. \n\nEx for Supervised Learning:\n\n**Se...",2,8bb61997,,0.064516,0.064516,0.05423
c472f10d,5e97933310f217,markdown,"# Regularization-2\n\n ## 1. Ridge Regression\n\n<a ><img src=""https://i.ibb.co/y4SJq5k/Screenshot-2021-07-14-at-2...",20,8bb61997,,0.645161,0.645161,0.057678
8515c21d,5e97933310f217,markdown,"<a id =""1""></a> \n# **Four Categories of Machine Learning** \n\n \n \n1. Supervised Learning\n2. Unsupervised Lea...",1,8bb61997,,0.032258,0.032258,0.058167
d72d0507,5e97933310f217,markdown,"<a id =""unsuper""></a>\n\n# **2. Unsupervised Learning**\n\n ""Unsupervised Learning is the bread and butter of dat...",3,8bb61997,,0.096774,0.096774,0.141968
9c617cba,5e97933310f217,markdown,"# Leaky Relu\n\n> **Leaky Rectified Linear Unit**, or Leaky ReLU, is a type of activation function based on a ReLU,...",27,8bb61997,,0.870968,0.870968,0.226196
dbbfaf24,5e97933310f217,code,num_validation_samples = 20000\n\nnp.random.shuffle(data)\n\n#Defining the validatian set.\nvalidation_data= data[:n...,7,8bb61997,,0.225806,0.225806,0.225806






id                                                                       56a364c9726435
label_order      [d1824dcf, 232912f8, ddfe5a63, 71945f79, 7cd943a6, 2b918080, c2978663]
pred_order       [2b918080, 232912f8, ddfe5a63, 71945f79, 7cd943a6, d1824dcf, c2978663]
kendall_score                                                                  0.142857
Name: 3635, dtype: object
>>> ---------- Notebook id: 56a364c9726435 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2b918080,56a364c9726435,markdown,"This is Global Coronavirus (COVID-19) data, provided by Johns Hopkins and cleaned and reshaped with Tableau.Here I a...",5,af2210ef,,0.714286,0.714286,-0.001391
232912f8,56a364c9726435,markdown,![](https://storage.googleapis.com/kagglesdsdata/datasets/623593/1112109/USA.jpg?GoogleAccessId=web-data@kaggle-1616...,1,af2210ef,,0.142857,0.142857,0.142212
ddfe5a63,56a364c9726435,markdown,"Coronavirus is a family of viruses that can cause illness, which can vary from common cold and cough to sometimes mo...",2,af2210ef,,0.285714,0.285714,0.220581
71945f79,56a364c9726435,markdown,Data Source : https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases,3,af2210ef,,0.428571,0.428571,0.273682
7cd943a6,56a364c9726435,markdown,Tableau Dashboard,4,af2210ef,,0.571429,0.571429,0.683105
d1824dcf,56a364c9726435,markdown,If you like this kernel Greatly Appreciate with UPVOTE.Thank you\nhttps://public.tableau.com/profile/mahi.khedkar#!/...,0,af2210ef,,0.0,0.0,0.908203
c2978663,56a364c9726435,code,#Import section\n\nfrom IPython.display import IFrame\nIFrame('https://public.tableau.com/views/CoronavirusDashboard...,6,af2210ef,,0.857143,0.857143,0.857143






id                                                                                                                        d127172adeacea
label_order      [eebc3086, 722cf377, b37481da, 712782ed, 1afd3599, 0315d35b, e92fec1b, ceb68917, b53f604a, 01514c28, 5ad53400, c2a6e...
pred_order       [eebc3086, 712782ed, 5ad53400, 722cf377, b53f604a, e92fec1b, c2a6e274, 01514c28, 1afd3599, ceb68917, 0315d35b, 56f0a...
kendall_score                                                                                                                   0.179487
Name: 8650, dtype: object
>>> ---------- Notebook id: d127172adeacea ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
eebc3086,d127172adeacea,markdown,This text is an English version of the following article written in Japanese in December 2019.\n\nhttps://qiita.com/...,0,aed848ca,,0.0,0.0,-0.00252
712782ed,d127172adeacea,markdown,# Explanation\n\nLet's look at points of the cells above!,3,aed848ca,,0.230769,0.230769,0.298828
5ad53400,d127172adeacea,markdown,"## Constants\n\n```python\nSEED = 2019\nN_FOLDS = 10\n```\n\nIn advance, fix the seed of pseudo-random number (`SEED...",10,aed848ca,,0.769231,0.769231,0.316895
722cf377,d127172adeacea,markdown,"# This is ""one cell""...\n",1,aed848ca,,0.076923,0.076923,0.375
b53f604a,d127172adeacea,markdown,## Preprocessing\n\n```python\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardS...,8,aed848ca,,0.615385,0.615385,0.39624
e92fec1b,d127172adeacea,markdown,"## Data\n\n```python\nimport numpy as np\nimport pandas as pd\npd.set_option('display.max_columns', None)\n```\n\nTh...",6,aed848ca,,0.461538,0.461538,0.418213
c2a6e274,d127172adeacea,markdown,"## Output list of file names given as input\n\n```python\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/i...",11,aed848ca,,0.846154,0.846154,0.430908
01514c28,d127172adeacea,markdown,## Cross validation\n\n```python\nfrom sklearn.model_selection import StratifiedKFold \n```\n\nCross validation is n...,9,aed848ca,,0.692308,0.692308,0.544922
1afd3599,d127172adeacea,markdown,## Display\n\n```python\nfrom IPython.core.interactiveshell import InteractiveShell\nInteractiveShell.ast_node_inter...,4,aed848ca,,0.307692,0.307692,0.553223
ceb68917,d127172adeacea,markdown,## Visualization\n\n```python\nimport pandas_profiling as pdp\nimport matplotlib.pyplot as plt\n%matplotlib inline\n...,7,aed848ca,,0.538462,0.538462,0.566895






id                                                   4a99d2ffd04cd6
label_order      [19bb0372, 782c7b9e, 5b019082, 067348b8, 531994ce]
pred_order       [531994ce, 19bb0372, 782c7b9e, 5b019082, 067348b8]
kendall_score                                                   0.2
Name: 3093, dtype: object
>>> ---------- Notebook id: 4a99d2ffd04cd6 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
531994ce,4a99d2ffd04cd6,markdown,Credits: Python Engineer\n\nDon't hesitate to watch YouTube video. It has clear explanation\n\nhttps://youtu.be/3Kb0...,4,9ef2154d,,0.8,0.8,-0.001771
19bb0372,4a99d2ffd04cd6,code,"import torch\n\n\n# Intialize values to X,Y\n\n\nx = torch.tensor(1.0)\ny = torch.tensor(2.0)",0,9ef2154d,,0.0,0.0,0.0
782c7b9e,4a99d2ffd04cd6,code,"\n# This is the parameter we want to optimize -> requires_grad=True\nw = torch.tensor(1.0, requires_grad=True)\n\n# ...",1,9ef2154d,,0.2,0.2,0.2
5b019082,4a99d2ffd04cd6,code,\n\n# backward pass to compute gradient dLoss/dw\nloss.backward()\nprint(w.grad),2,9ef2154d,,0.4,0.4,0.4
067348b8,4a99d2ffd04cd6,code,"\n\n# update weights\n# next forward and backward pass...\n\n# continue optimizing:\n# update weights, this operatio...",3,9ef2154d,,0.6,0.6,0.6






id                                                                                                                        206aadfedae1ad
label_order      [46946ca5, 022d6bd1, 3a917eba, 59da069c, 4d9f8b9f, eb012803, d4470b3c, 6296022f, 2ba8a5bf, 86943f9e, 9927ec8f, 37578...
pred_order       [46946ca5, 59da069c, 77022ccd, a2422444, 7d3549e7, 55790065, 1e7ecfb4, eb012803, 3ec5ab6f, 3757841d, 1e6707b1, 022d6...
kendall_score                                                                                                                   0.219873
Name: 1353, dtype: object
>>> ---------- Notebook id: 206aadfedae1ad ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
46946ca5,206aadfedae1ad,markdown,# 7 Coolest Python Packages Top Kagglers Are Using Without Telling You\n## Let me expose the secrets...\n![](https:/...,0,54b0f8b3,,0.0,0.0,-0.00143
59da069c,206aadfedae1ad,markdown,"Kaggle is a hot spot for what is trending in data science and machine learning.\n\nDue to its competitiveness, the t...",3,54b0f8b3,,0.068182,0.068182,0.005646
77022ccd,206aadfedae1ad,markdown,The available tools and packages to execute data science tasks are endless. Everyone has the right to be overwhelmed...,42,54b0f8b3,,0.954545,0.954545,0.01712
a2422444,206aadfedae1ad,markdown,Explainable AI (XAI) is one of the strongest trends in the ML and AI sphere. Companies and businesses are starting t...,32,54b0f8b3,,0.727273,0.727273,0.018707
7d3549e7,206aadfedae1ad,markdown,"Normally, I am against any library or tool that takes a programmer away from writing actual code. But, since auto-ED...",40,54b0f8b3,,0.909091,0.909091,0.028137
55790065,206aadfedae1ad,markdown,"Lazypredict is one of the best one-liner packages I have ever seen.\n\nUsing the library, you can train almost all S...",20,54b0f8b3,,0.454545,0.454545,0.028152
1e7ecfb4,206aadfedae1ad,markdown,### 🛠 GitHub and documentation\n- https://github.com/h2oai/datatable\n- https://datatable.readthedocs.io/en/latest/?...,13,54b0f8b3,,0.295455,0.295455,0.028641
eb012803,206aadfedae1ad,markdown,"![](https://cdn-images-1.medium.com/max/800/1*1sUCX4FKCLjJzbEqMjg2YQ.png)\n<figcaption style=""text-align: center;"">\...",5,54b0f8b3,,0.113636,0.113636,0.041168
3ec5ab6f,206aadfedae1ad,markdown,![](https://cdn-images-1.medium.com/max/800/1*ZoH5jGuQiKKhxeZYNHcwDA.png),36,54b0f8b3,,0.818182,0.818182,0.06427
3757841d,206aadfedae1ad,markdown,![](https://cdn-images-1.medium.com/max/800/1*HfCFXaFA0cS2uLw1Gv3x9w.png),11,54b0f8b3,,0.25,0.25,0.068115






id                                                                                                                        17845073a429f3
label_order      [502033a4, 794feada, a42f4b7e, b9d3bfc2, a706930b, 1441e3fc, ece1fb3b, d4e523f0, c6b9d271, 87794ae1, bc9d7947, 1e2a3...
pred_order       [502033a4, d50e67bb, a42f4b7e, 205b4129, 7cc1fd2a, 68bcf209, 794feada, b9d3bfc2, a706930b, 1441e3fc, a6f2a408, d4e52...
kendall_score                                                                                                                   0.224638
Name: 984, dtype: object
>>> ---------- Notebook id: 17845073a429f3 ----------
>>> pred_order_col result


Unnamed: 0_level_0,id,cell_type,source,rank,ancestor_id,parent_id,pct_rank,label,pred
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
502033a4,17845073a429f3,markdown,<h1> COVID-19 Analysis Notebook on Comorbidities (CANCe) </h1>\n\n<h2> What is the incident of COVID-19 Infections i...,0,fbae70fc,,0.0,0.0,-0.003014
d50e67bb,17845073a429f3,markdown,"<img src=""https://www.statista.com/graphic/1/1102796/south-korea-covid-19-deaths-by-chronic-disease.jpg"" alt=""Statis...",18,fbae70fc,,0.75,0.75,0.057007
a42f4b7e,17845073a429f3,markdown,# <a id='main'><h3>Table of Contents</h3></a>\n- [Importing the Essential Libraries](#lib)\n- [Datasets used in note...,2,fbae70fc,,0.083333,0.083333,0.057831
205b4129,17845073a429f3,markdown,<h3> Breakdown of COVID-19 Death Cases in South Korea </h3>\n\nThe following is the dataset that shows the deaths in...,17,fbae70fc,,0.708333,0.708333,0.095581
7cc1fd2a,17845073a429f3,markdown,"<img src=""https://www.statista.com/graphic/1/1110949/common-comorbidities-in-covid-19-deceased-patients-in-italy.jpg...",22,fbae70fc,,0.916667,0.916667,0.121704
68bcf209,17845073a429f3,markdown,"<img src=""https://www.statista.com/graphic/1/1108836/china-coronavirus-covid-19-fatality-rate-by-health-condition.jp...",20,fbae70fc,,0.833333,0.833333,0.134521
794feada,17845073a429f3,markdown,<H3> Task Details </H3>\n\nThe Roche Data Science Coalition is a group of like-minded public and private organizatio...,1,fbae70fc,,0.041667,0.041667,0.145874
b9d3bfc2,17845073a429f3,markdown,# <a id='lib'><h3>Importing the essential libraries</h3></a>,3,fbae70fc,,0.125,0.125,0.155273
a706930b,17845073a429f3,code,#Data Analyses Libraries\nimport pandas as pd \nimport numpy as np \nfrom urllib.request import ur...,4,fbae70fc,,0.166667,0.166667,0.166667
1441e3fc,17845073a429f3,markdown,# <a id='data'><h3>Datasets used for analyses in this notebook</h3></a>\n\nThe various datasets that we take under c...,5,fbae70fc,,0.208333,0.208333,0.200073








### kendall tau score가 낮은 id 실제 print되는 STT 확인

In [99]:
low_score_sample = total_result.sort_values('kendall_score').iloc[:10,:]
# display(low_score_sample)

# 실제 인코딩 텍스트 확인
def print_incoding_result(cell_tb, notebook_id, label_order, pred_order):
    # cell tb 중 확인할 notebook_id sort
    tmp_tb = cell_tb[cell_tb.id == notebook_id].copy()
    tmp_tb = tmp_tb.set_index('cell_id')

    print(f'################### Notebook id: {notebook_id} ###################\n')

    # pred order 대로 cell print
#     print(f'>>> pred_order_col result')
#     display(tmp_tb.loc[pred_order])
    for c_type, label_r, pred_r, t, e_t in tmp_tb.loc[pred_order][['cell_type','rank', 'pred', 'source', 'input_text']].values:
        if c_type == 'markdown':
            print(f'------------------------- code start--------------------------------------------------------------------------------------')
            print(f'{color.YELLOW}1. Code Type:{color.END}{c_type}\n')
            print(f'{color.YELLOW}>> Label rank:{color.END}{label_r}\n')
            print(f'{color.YELLOW}>> Pred rank:{color.END}{pred_r}\n')     
            print(f'{color.YELLOW}2. og text:{color.END} \n{t}\n')
            print(f'{color.YELLOW}3. input_text:{color.END} \n{e_t}\n')
#             _encode = tokenizer_forex.encode(t, max_length=md_max_len)
#             print(f'4. encode: \n{_encode[:5]}\n')        
#             print(f'5. decode: \n{tokenizer_forex.decode(_encode)}')
            print(f'------------------------- code end--------------------------------------------------------------------------------------')        
        else:
            pass
    return None
print(md_max_len)

64
################### Notebook id: 2fa3c4c997e1d1 ###################

------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
Keras 전처리 층 API는 개발자들이 Keras 고유의 입력 처리 파이프라인을 만들 수 있게 해줍니다. 이 입력 처리 파이프라인들은 Keras가 아닌 작업 흐름 안에서 독립적인 사전 처리 코드로써 사용되고, 직접적으로 Keras 모델들과 결합되고, Keras SavedModel의 일부로써 수출될 수 있습니다.



'2. og_source: \nKeras 전처리 층 API는 개발자들이 Keras 고유의 입력 처리 파이프라인을 만들 수 있게 해줍니다. 이 입력 처리 파이프라인들은 Keras가 아닌 작업 흐름 안에서 독립적인 사전 처리 코드로써 사용되고, 직접적으로 Keras 모델들과 결합되고, Keras SavedModel의 일부로써 수출될 수 있습니다.\n'

3. encode: 
[0, 530, 254, 281, 46747]

4. decode: 
<s>Keras 전처리 층 API는 개발자들이 Keras 고유의 입력 처리 파�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
[바닥에서 부터 이미지 분류](https://keras.io/examples/vision/image_classification_from_scratch/) 예제에서 비슷한 설정 활동을 볼 수 있습니다.



'2. og_source: \n[바닥에서 부터 이미지 분류](https://keras.io/examples/vision/image_classification_from_scratch/) 예제에서 비슷한 설정 활동을 볼 수 있습니다.\n'

3. encode: 
[0, 10975, 45209, 7487, 10674]

4. decode: 
<s>[바닥에서 부터 이미지 분류](https://keras.io/examples/vision/image_classification_from_scratch/) 예제</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
[바닥에서 부터 텍스트 분류](https://keras.io/examples/nlp/text_classification_from_scratch/) 예제에서 `Embedding` 방식과 결합된 `TextVectorization` 층을 동작 속에서 볼 수 있습니다.



'2. og_source: \n[바닥에서 부터 텍스트 분류](https://keras.io/examples/nlp/text_classification_from_scratch/) 예제에서 `Embedding` 방식과 결합된 `TextVectorization` 층을 동작 속에서 볼 수 있습니다.\n'

3. encode: 
[0, 10975, 45209, 7487, 10674]

4. decode: 
<s>[바닥에서 부터 텍스트 분류](https://keras.io/examples/nlp/text_classification_from_scratch/) 예�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

data = np.array([[0.1, 0.2, 0.3], [0.8, 0.9, 1.0], [1.5, 1.6, 1.7],])
layer = preprocessing.Normalization()
layer.adapt(data)
normalized_data = layer(data)

print("Features mean: %.2f" % (normalized_data.numpy().mean()))
print("Features std: %.2f" % (normalized_data.numpy().std()))



'2. og_source: \nimport numpy as np\nimport tensorflow as tf\nfrom tensorflow.keras.layers.experimental import preprocessing\n\ndata = np.array([[0.1, 0.2, 0.3], [0.8, 0.9, 1.0], [1.5, 1.6, 1.7],])\nlayer = preprocessing.Normalization()\nlayer.adapt(data)\nnormalized_data = layer(data)\n\nprint("Features mean: %.2f" % (normalized_data.numpy().mean()))\nprint("Features std: %.2f" % (normalized_data.numpy().std()))\n'

3. encode: 
[0, 41975, 295, 35187, 25]

4. decode: 
<s>import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

data = np.array([[0.1, 0.2, 0.3], [0.8, 0.9, 1.0</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## 추론 기간에 모델 안에서 전처리 수행의 장점



'2. og_source: \n## 추론 기간에 모델 안에서 전처리 수행의 장점\n'

3. encode: 
[0, 48342, 46747, 19002, 10674]

4. decode: 
<s>## 추론 기간에 모델 안에서 전처리 수행의 장점</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) 예제에서 `IntegerLookup`과 `CategoryEncoding` 층들을 작동하는 것으로 볼 수 있습니다.



'2. og_source: \n[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) 예제에서 `IntegerLookup`과 `CategoryEncoding` 층들을 작동하는 것으로 볼 수 있습니다.\n'

3. encode: 
[0, 10975, 45209, 7487, 10674]

4. decode: 
<s>[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/struct</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) 예제에서 `StringLookup`과 `CategoryEncoding` 층들을 작동하는 것으로 볼 수 있습니다.



'2. og_source: \n[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) 예제에서 `StringLookup`과 `CategoryEncoding` 층들을 작동하는 것으로 볼 수 있습니다.\n'

3. encode: 
[0, 10975, 45209, 7487, 10674]

4. decode: 
<s>[바닥에서 부터 구조화된 데이터 분류](https://keras.io/examples/structured_data/struct</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
만약 초기에 전처리 층들을 [`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data) 파이프라인에 넣었다면, 전처리를 묶는 추론 모델을 내보낼 수 있습니다. 간단하게 전처리 층들과 학습 모델을 잇는 새로운 모델을 생성합니다:



'2. og_source: \n만약 초기에 전처리 층들을 [`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data) 파이프라인에 넣었다면, 전처리를 묶는 추론 모델을 내보낼 수 있습니다. 간단하게 전처리 층들과 학습 모델을 잇는 새로운 모델을 생성합니다:\n'

3. encode: 
[0, 45209, 6248, 14285, 43998]

4. decode: 
<s>만약 초기에 전처리 층들을 [`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data) �</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
data = [
    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι",
    "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.",
    "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:",
    "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:",
    "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,",
    "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:",
    "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,",
    "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.",
]
layer = preprocessing.TextVectorization()
layer.adapt(data)
vectorized_text = layer(data)
print(vectorized_text)



'2. og_source: \ndata = [\n    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι",\n    "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.",\n    "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:",\n    "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:",\n    "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,",\n    "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:",\n    "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,",\n    "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.",\n]\nlayer = preprocessing.TextVectorization()\nlayer.adapt(data)\nvectorized_text = layer(data)\nprint(vectorized_text)\n'

3. encode: 
[0, 23687, 5457, 646, 50118]

4. decode: 
<s>data = [
    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυ</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
# 전처리 층 다루기



'2. og_source: \n# 전처리 층 다루기\n'

3. encode: 
[0, 10431, 46747, 21402, 11936]

4. decode: 
<s># 전처리 층 다루기</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## Keras 전처리 층



'2. og_source: \n## Keras 전처리 층\n'

3. encode: 
[0, 48342, 9508, 281, 46747]

4. decode: 
<s>## Keras 전처리 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
결정적으로, 이 층들은 **학습 불가능**합니다. 이들의 상태는 학습 동안 설정되지 않습니다; **학습 이전에** 설정되어야 하며, 이 단계는 "적응"이라고 불립니다.



'2. og_source: \n결정적으로, 이 층들은 **학습 불가능**합니다. 이들의 상태는 학습 동안 설정되지 않습니다; **학습 이전에** 설정되어야 하며, 이 단계는 "적응"이라고 불립니다.\n'

3. encode: 
[0, 46873, 14292, 7487, 43998]

4. decode: 
<s>결정적으로, 이 층들은 **학습 불가능**합니다. 이들의 상�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
Keras 전처리 층으로, 완전히 종단하는 모델들(가공되지 않은 이미지나 가공되지 않은 구조화된 데이터를 입력으로 받으며, 자체적인 특징 표준화나 특징 값 색인을 다루는 모델들)을 만들고 수출할 수 있습니다.



'2. og_source: \nKeras 전처리 층으로, 완전히 종단하는 모델들(가공되지 않은 이미지나 가공되지 않은 구조화된 데이터를 입력으로 받으며, 자체적인 특징 표준화나 특징 값 색인을 다루는 모델들)을 만들고 수출할 수 있습니다.\n'

3. encode: 
[0, 530, 254, 281, 46747]

4. decode: 
<s>Keras 전처리 층으로, 완전히 종단하는 모델들(가공되�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 텍스트를 토큰 색인들의 순열로 인코딩



'2. og_source: \n### 텍스트를 토큰 색인들의 순열로 인코딩\n'

3. encode: 
[0, 48134, 1437, 47649, 5782]

4. decode: 
<s>### 텍스트를 토큰 색인들의 순열로 인코딩</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
* `TextVectorization`: 문자열 토큰과 정수형 색인 사이의 대응을 가집니다.
* `Normalization`: 특징의 평균과 표준 편차를 가집니다.
* `StringLookup`과 `IntegerLookup`: 입력 값과 출력 색인 사이의 대응을 가집니다.
* `CategoryEncoding`: 입력 값의 색인을 가집니다.
* `Discretization`: 바구니 경계 값에 대한 정보를 가집니다.



'2. og_source: \n* `TextVectorization`: 문자열 토큰과 정수형 색인 사이의 대응을 가집니다.\n* `Normalization`: 특징의 평균과 표준 편차를 가집니다.\n* `StringLookup`과 `IntegerLookup`: 입력 값과 출력 색인 사이의 대응을 가집니다.\n* `CategoryEncoding`: 입력 값의 색인을 가집니다.\n* `Discretization`: 바구니 경계 값에 대한 정보를 가집니다.\n'

3. encode: 
[0, 3226, 22209, 39645, 48417]

4. decode: 
<s>* `TextVectorization`: 문자열 토큰과 정수형 색인 사이의 대응을 가집니</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### TF-IDF 가중치로 ngram의 밀집 행렬로써 텍스트 인코딩



'2. og_source: \n### TF-IDF 가중치로 ngram의 밀집 행렬로써 텍스트 인코딩\n'

3. encode: 
[0, 48134, 35690, 12, 2688]

4. decode: 
<s>### TF-IDF 가중치로 ngram의 밀집 행렬로써 텍스트 인코딩</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
vocab = ["a", "b", "c", "d"]
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer = preprocessing.StringLookup(vocabulary=vocab)
vectorized_data = layer(data)
print(vectorized_data)



'2. og_source: \nvocab = ["a", "b", "c", "d"]\ndata = tf.constant([["a", "c", "d"], ["d", "z", "b"]])\nlayer = preprocessing.StringLookup(vocabulary=vocab)\nvectorized_data = layer(data)\nprint(vectorized_data)\n'

3. encode: 
[0, 31375, 873, 5457, 46679]

4. decode: 
<s>vocab = ["a", "b", "c", "d"]
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer = preprocessing.StringLookup(vocabulary=vocab)
vectorized_</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
* `CategoryEncoding` 층: 정수형 범주형 특징들을 one-hot, multi-hot, 또는 TF-IDF 밀집 표현으로 바꿉니다.
* `Hashing` 층: "해싱 기법"으로도 알려진, 범주형 특징 해싱을 수행합니다.
* `Discretization` 층: 연속형 수치형 특징들을 정수형 범주형 특징들로 바꿉니다.
* `StringLookup` 층: 문자열 범주형 값들을 정수형 색인으로 바꿉니다.
* `IntegerLookup` 층: 정수형 범주형 값들을 정수형 색인으로 바꿉니다.
* `CategoryCrossing` 층: 범주형 특징들을 동시 발현 특징들로 결합합니다. 예를 들어, 만약 특징 값 "a"와 "b"를 가지고 있다면, 결합 특징 "a와 b가 동시에 존재한다"를 공급할 수 있습니다.



'2. og_source: \n* `CategoryEncoding` 층: 정수형 범주형 특징들을 one-hot, multi-hot, 또는 TF-IDF 밀집 표현으로 바꿉니다.\n* `Hashing` 층: "해싱 기법"으로도 알려진, 범주형 특징 해싱을 수행합니다.\n* `Discretization` 층: 연속형 수치형 특징들을 정수형 범주형 특징들로 바꿉니다.\n* `StringLookup` 층: 문자열 범주형 값들을 정수형 색인으로 바꿉니다.\n* `IntegerLookup` 층: 정수형 범주형 값들을 정수형 색인으로 바꿉니다.\n* `CategoryCrossing` 층: 범주형 특징들을 동시 발현 특징들로 결합합니다. 예를 들어, 만약 특징 값 "a"와 "b"를 가지고 있다면, 결합 특징 "a와 b가 동시에 존재한다"를 공급할 수 있습니다.\n'

3. encode: 
[0, 3226, 22209, 46308, 45780]

4. decode: 
<s>* `CategoryEncoding` 층: 정수형 범주형 특징들을 one-hot, multi-hot, 또는 TF-IDF 밀�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이것은 `TextVectorization`과 모든 구조화된 데이터 전처리 층들에 대해 최선의 선택입니다. 이것은 CPU 위에서 학습을 시키고 이미지 전처리 층들을 사용하는 경우에도 좋은 선택지입니다.



'2. og_source: \n이것은 `TextVectorization`과 모든 구조화된 데이터 전처리 층들에 대해 최선의 선택입니다. 이것은 CPU 위에서 학습을 시키고 이미지 전처리 층들을 사용하는 경우에도 좋은 선택지입니다.\n'

3. encode: 
[0, 48280, 20024, 46873, 14292]

4. decode: 
<s>이것은 `TextVectorization`과 모든 구조화된 데이터 전처리 층들에 �</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## 사용 가능한 전처리 층



'2. og_source: \n## 사용 가능한 전처리 층\n'

3. encode: 
[0, 48342, 46747, 49171, 43998]

4. decode: 
<s>## 사용 가능한 전처리 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
```python
dataset = dataset.map(
    lambda x, y: (preprocessing_layer(x), y))
```



'2. og_source: \n```python\ndataset = dataset.map(\n    lambda x, y: (preprocessing_layer(x), y))\n```\n'

3. encode: 
[0, 49519, 12905, 49119, 50118]

4. decode: 
<s>```python
dataset = dataset.map(
    lambda x, y: (preprocessing_layer(x), y))
```</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### multi-hot 인코딩으로 ngram의 밀집 행렬로써 텍스트 인코딩



'2. og_source: \n### multi-hot 인코딩으로 ngram의 밀집 행렬로써 텍스트 인코딩\n'

3. encode: 
[0, 48134, 3228, 12, 10120]

4. decode: 
<s>### multi-hot 인코딩으로 ngram의 밀집 행렬로써 텍스트 인코딩</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### one-hot 인코딩을 통한 정수형 범주형 특징 인코딩



'2. og_source: \n### one-hot 인코딩을 통한 정수형 범주형 특징 인코딩\n'

3. encode: 
[0, 48134, 65, 12, 10120]

4. decode: 
<s>### one-hot 인코딩을 통한 정수형 범주형 특징 인코딩</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 정수형 범주형 특징에 해싱 기법 적용



'2. og_source: \n### 정수형 범주형 특징에 해싱 기법 적용\n'

3. encode: 
[0, 48134, 46747, 21402, 15722]

4. decode: 
<s>### 정수형 범주형 특징에 해싱 기법 적용</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### one-hot 인코딩을 통한 문자열 범주형 특징 인코딩



'2. og_source: \n### one-hot 인코딩을 통한 문자열 범주형 특징 인코딩\n'

3. encode: 
[0, 48134, 65, 12, 10120]

4. decode: 
<s>### one-hot 인코딩을 통한 문자열 범주형 특징 인코딩</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
from tensorflow import keras
from tensorflow.keras import layers

# 수평 뒤집기, 회전, 확대로 데이터 증강 단계를 생성합니다
data_augmentation = keras.Sequential(
    [
        preprocessing.RandomFlip("horizontal"),
        preprocessing.RandomRotation(0.1),
        preprocessing.RandomZoom(0.1),
    ]
)

# 증강 단계를 포함하는 모델을 생성합니다
input_shape = (32, 32, 3)
classes = 10
inputs = keras.Input(shape=input_shape)
# 이미지를 증강합니다
x = data_augmentation(inputs)
# 이미지 값들을 [0, 1]로 재조정합니다
x = preprocessing.Rescaling(1.0 / 255)(x)
# 모델의 나머지를 추가합니다
outputs = keras.applications.ResNet50(
    weights=None, input_shape=input_shape, classes=classes
)(x)
model = k

'2. og_source: \nfrom tensorflow import keras\nfrom tensorflow.keras import layers\n\n# 수평 뒤집기, 회전, 확대로 데이터 증강 단계를 생성합니다\ndata_augmentation = keras.Sequential(\n    [\n        preprocessing.RandomFlip("horizontal"),\n        preprocessing.RandomRotation(0.1),\n        preprocessing.RandomZoom(0.1),\n    ]\n)\n\n# 증강 단계를 포함하는 모델을 생성합니다\ninput_shape = (32, 32, 3)\nclasses = 10\ninputs = keras.Input(shape=input_shape)\n# 이미지를 증강합니다\nx = data_augmentation(inputs)\n# 이미지 값들을 [0, 1]로 재조정합니다\nx = preprocessing.Rescaling(1.0 / 255)(x)\n# 모델의 나머지를 추가합니다\noutputs = keras.applications.ResNet50(\n    weights=None, input_shape=input_shape, classes=classes\n)(x)\nmodel = keras.Model(inputs, outputs)\n'

3. encode: 
[0, 7761, 7281, 368, 19322]

4. decode: 
<s>from tensorflow import keras
from tensorflow.keras import layers

# 수평 뒤집기, 회전, 확대로 데이터</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 구조화된 데이터 전처리 층



'2. og_source: \n### 구조화된 데이터 전처리 층\n'

3. encode: 
[0, 48134, 1437, 46873, 8906]

4. decode: 
<s>### 구조화된 데이터 전처리 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
만약 각 값이 데이터에서 오직 몇번만 나타나는 많은 다른 값들(대략 10e3 이상)을 가질 수 있는 범주형 특징을 가지고 있다면, 특징값들을 색인하고 one-hot 인코딩을 하는 것은 비실용적이고 비효율적이게 됩니다. 대신, "해싱 기법"을 적용하는 것이 좋은 발상이 될 수 있습니다: 값을 고정된 크기의 벡터로 해싱합니다. 이것은 특징 공간의 크기를 관리할 수 있게 유지하고 명백한 색인에 대한 필요성을 제거합니다.



'2. og_source: \n만약 각 값이 데이터에서 오직 몇번만 나타나는 많은 다른 값들(대략 10e3 이상)을 가질 수 있는 범주형 특징을 가지고 있다면, 특징값들을 색인하고 one-hot 인코딩을 하는 것은 비실용적이고 비효율적이게 됩니다. 대신, "해싱 기법"을 적용하는 것이 좋은 발상이 될 수 있습니다: 값을 고정된 크기의 벡터로 해싱합니다. 이것은 특징 공간의 크기를 관리할 수 있게 유지하고 명백한 색인에 대한 필요성을 제거합니다.\n'

3. encode: 
[0, 45209, 6248, 14285, 43998]

4. decode: 
<s>만약 각 값이 데이터에서 오직 몇번만 나타나는 많은</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
여기 사전 계산된 사전으로 `StringLookup` 층을 생성하는 예제입니다:



'2. og_source: \n여기 사전 계산된 사전으로 `StringLookup` 층을 생성하는 예제입니다:\n'

3. encode: 
[0, 43998, 6800, 11582, 46873]

4. decode: 
<s>여기 사전 계산된 사전으로 `StringLookup` 층을 생성하는 예제입</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## 빠른 사용법



'2. og_source: \n## 빠른 사용법\n'

3. encode: 
[0, 48342, 47672, 9253, 21402]

4. decode: 
<s>## 빠른 사용법</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
* `TextVectorization` 층: 가공되지 않은 문자열을 `Embedding` 층이나 `Dense` 층에 의해 읽힐 수 있는 인코딩된 표현으로 바꿉니다.
* `Normalization` 층: 입력 특징들의 특징별 표준화를 수행합니다.



'2. og_source: \n* `TextVectorization` 층: 가공되지 않은 문자열을 `Embedding` 층이나 `Dense` 층에 의해 읽힐 수 있는 인코딩된 표현으로 바꿉니다.\n* `Normalization` 층: 입력 특징들의 특징별 표준화를 수행합니다.\n'

3. encode: 
[0, 3226, 22209, 39645, 48417]

4. decode: 
<s>* `TextVectorization` 층: 가공되지 않은 문자열을 `Embedding` 층이나 `Dense` 층에</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
**선택지 2**: 전처리된 데이터의 묶음을 내놓을 수 있는 데이터 세트를 얻기 위하여, 다음과 같이, [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)에 적용합니다:



'2. og_source: \n**선택지 2**: 전처리된 데이터의 묶음을 내놓을 수 있는 데이터 세트를 얻기 위하여, 다음과 같이, [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)에 적용합니다:\n'

3. encode: 
[0, 12606, 43998, 11936, 21402]

4. decode: 
<s>**선택지 2**: 전처리된 데이터의 묶음을 내놓을 수 있는 데</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이 층들은 구조화된 데이터 인코딩과 특징 공학을 위한 것입니다.



'2. og_source: \n이 층들은 구조화된 데이터 인코딩과 특징 공학을 위한 것입니다.\n'

3. encode: 
[0, 48280, 20024, 46747, 18537]

4. decode: 
<s>이 층들은 구조화된 데이터 인코딩과 특징 공학을 위�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이것은 `Dense` 층에 전달되는 텍스트를 전처리해야 하는 방법입니다.



'2. og_source: \n이것은 `Dense` 층에 전달되는 텍스트를 전처리해야 하는 방법입니다.\n'

3. encode: 
[0, 48280, 20024, 46873, 14292]

4. decode: 
<s>이것은 `Dense` 층에 전달되는 텍스트를 전처리해야 하는 �</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이것은 `Dense` 층에 텍스트를 넘기기 전에 전처리하는 또다른 방법입니다.



'2. og_source: \n이것은 `Dense` 층에 텍스트를 넘기기 전에 전처리하는 또다른 방법입니다.\n'

3. encode: 
[0, 48280, 20024, 46873, 14292]

4. decode: 
<s>이것은 `Dense` 층에 텍스트를 넘기기 전에 전처리하는 또</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
선택지 2를 사용해 진행한다고 해도, 추후에 전처리 층들을 포함할 추론만 수행하는 종단 간 모델을 내보내고 싶을 수도 있습니다. 이것을 수행하는 핵심 장점은 **모델을 휴대 가능하게 만든다**는 것과 **[학습/제공 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)을 줄이게 도와준다**는 것입니다.



'2. og_source: \n선택지 2를 사용해 진행한다고 해도, 추후에 전처리 층들을 포함할 추론만 수행하는 종단 간 모델을 내보내고 싶을 수도 있습니다. 이것을 수행하는 핵심 장점은 **모델을 휴대 가능하게 만든다**는 것과 **[학습/제공 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)을 줄이게 도와준다**는 것입니다.\n'

3. encode: 
[0, 43998, 11936, 21402, 47649]

4. decode: 
<s>선택지 2를 사용해 진행한다고 해도, 추후에 전처리 층�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## 모델 이전 혹은 안에서 데이터 전처리



'2. og_source: \n## 모델 이전 혹은 안에서 데이터 전처리\n'

3. encode: 
[0, 48342, 47672, 10278, 11423]

4. decode: 
<s>## 모델 이전 혹은 안에서 데이터 전처리</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 몇개의 데이터를 불러옵니다
(x_train, y_train), _ = keras.datasets.cifar10.load_data()
x_train = x_train.reshape((len(x_train), -1))
input_shape = x_train.shape[1:]
classes = 10

# 학습 데이터를 사용해 정규화 층과 그 내부 상태를 생성합니다
normalizer = preprocessing.Normalization()
normalizer.adapt(x_train)

# 정규화 층을 포함하는 모델을 생성합니다
inputs = keras.Input(shape=input_shape)
x = normalizer(inputs)
outputs = layers.Dense(classes, activation="softmax")(x)
model = keras.Model(inputs, outputs)

# 모델을 학습시킵니다
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.fit(x_train, y_train)



'2. og_source: \n# 몇개의 데이터를 불러옵니다\n(x_train, y_train), _ = keras.datasets.cifar10.load_data()\nx_train = x_train.reshape((len(x_train), -1))\ninput_shape = x_train.shape[1:]\nclasses = 10\n\n# 학습 데이터를 사용해 정규화 층과 그 내부 상태를 생성합니다\nnormalizer = preprocessing.Normalization()\nnormalizer.adapt(x_train)\n\n# 정규화 층을 포함하는 모델을 생성합니다\ninputs = keras.Input(shape=input_shape)\nx = normalizer(inputs)\noutputs = layers.Dense(classes, activation="softmax")(x)\nmodel = keras.Model(inputs, outputs)\n\n# 모델을 학습시킵니다\nmodel.compile(optimizer="adam", loss="sparse_categorical_crossentropy")\nmodel.fit(x_train, y_train)\n'

3. encode: 
[0, 10431, 47672, 10278, 6382]

4. decode: 
<s># 몇개의 데이터를 불러옵니다
(x_train, y_train), _ = keras.datasets.cifar10.load_data()
</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 수치형 특징 정규화



'2. og_source: \n### 수치형 특징 정규화\n'

3. encode: 
[0, 48134, 46747, 23133, 711]

4. decode: 
<s>### 수치형 특징 정규화</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
* `Resizing` 층: 이미지 묶음을 목표 크기로 조절합니다.
* `Rescaling` 층: 이미지 묶음의 값을 재조정하고 옮깁니다(예를 들어, `[0, 255]` 범위의 입력에서 `[0, 1]` 범위의 입력으로 갑니다).
* `CentorCrop` 층: 이미지 묶음의 중심 조각을 반환합니다.



'2. og_source: \n* `Resizing` 층: 이미지 묶음을 목표 크기로 조절합니다.\n* `Rescaling` 층: 이미지 묶음의 값을 재조정하고 옮깁니다(예를 들어, `[0, 255]` 범위의 입력에서 `[0, 1]` 범위의 입력으로 갑니다).\n* `CentorCrop` 층: 이미지 묶음의 중심 조각을 반환합니다.\n'

3. encode: 
[0, 3226, 22209, 20028, 2787]

4. decode: 
<s>* `Resizing` 층: 이미지 묶음을 목표 크기로 조절합니다.
* `Rescaling` �</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
전처리 층들을 사용할 수 있는 두가지 방법이 있습니다:



'2. og_source: \n전처리 층들을 사용할 수 있는 두가지 방법이 있습니다:\n'

3. encode: 
[0, 43998, 21402, 11936, 43998]

4. decode: 
<s>전처리 층들을 사용할 수 있는 두가지 방법이 있습니다:</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이것은 `Embedding` 층에 넘겨지는 텍스트를 어떻게 전처리해야 하는지 입니다.



'2. og_source: \n이것은 `Embedding` 층에 넘겨지는 텍스트를 어떻게 전처리해야 하는지 입니다.\n'

3. encode: 
[0, 48280, 20024, 46873, 14292]

4. decode: 
<s>이것은 `Embedding` 층에 넘겨지는 텍스트를 어떻게 전처리�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
모든 데이터 전처리가 모델의 일부라면, 다른 사람들은 각 특징들이 어떻게 인코딩 & 표준화되기를 기대하는지에 대해 알고 있지 않아도 모델을 불러오고 사용할 수 있을 것입니다. 추론 모델은 원본 이미지나 원본 구조화된 데이터를 처리할 수 있을 것이고, 모델의 사용자들이 텍스트에 대해 사용된 토큰화 계획, 범주형 특징들에 대해 사용된 색인 계획, 이미지 픽셀값들이 `[-1, +1]`이나 `[0, 1]`로 표준화되었는지 여부 등 자세한 내용을 알도록 요구하지 않을 것입니다. 이것은 TensorFlow.js와 같은 다른 런타임에 모델을 내보낼 때 특별하게 강력합니다: JavaScript에서 전처리 파이프라인을 재구형하지 않아도 됩니다.



'2. og_source: \n모든 데이터 전처리가 모델의 일부라면, 다른 사람들은 각 특징들이 어떻게 인코딩 & 표준화되기를 기대하는지에 대해 알고 있지 않아도 모델을 불러오고 사용할 수 있을 것입니다. 추론 모델은 원본 이미지나 원본 구조화된 데이터를 처리할 수 있을 것이고, 모델의 사용자들이 텍스트에 대해 사용된 토큰화 계획, 범주형 특징들에 대해 사용된 색인 계획, 이미지 픽셀값들이 `[-1, +1]`이나 `[0, 1]`로 표준화되었는지 여부 등 자세한 내용을 알도록 요구하지 않을 것입니다. 이것은 TensorFlow.js와 같은 다른 런타임에 모델을 내보낼 때 특별하게 강력합니다: JavaScript에서 전처리 파이프라인을 재구형하지 않아도 됩니다.\n'

3. encode: 
[0, 45209, 10278, 11423, 45209]

4. decode: 
<s>모든 데이터 전처리가 모델의 일부라면, 다른 사람들�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
**선택지 1**: 다음과 같이, 모델의 일부로 만듭니다:



'2. og_source: \n**선택지 1**: 다음과 같이, 모델의 일부로 만듭니다:\n'

3. encode: 
[0, 12606, 43998, 11936, 21402]

4. decode: 
<s>**선택지 1**: 다음과 같이, 모델의 일부로 만듭니다:</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
몇개의 전처리 층들은 학습 데이터의 표본을 기반으로 계산되야 하는 내부 상태를 가집니다. 상태를 가지는 전처리 층들의 목록은:



'2. og_source: \n몇개의 전처리 층들은 학습 데이터의 표본을 기반으로 계산되야 하는 내부 상태를 가집니다. 상태를 가지는 전처리 층들의 목록은:\n'

3. encode: 
[0, 45209, 10278, 6382, 46873]

4. decode: 
<s>몇개의 전처리 층들은 학습 데이터의 표본을 기반으�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 이미지 데이터 증강 층



'2. og_source: \n### 이미지 데이터 증강 층\n'

3. encode: 
[0, 48134, 46747, 46, 20024]

4. decode: 
<s>### 이미지 데이터 증강 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 이미지 데이터 증강 (기기 위에서)



'2. og_source: \n### 이미지 데이터 증강 (기기 위에서)\n'

3. encode: 
[0, 48134, 46747, 46, 20024]

4. decode: 
<s>### 이미지 데이터 증강 (기기 위에서)</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = rest_of_the_model(x)
model = keras.Model(inputs, outputs)
```



'2. og_source: \n```python\ninputs = keras.Input(shape=input_shape)\nx = preprocessing_layer(inputs)\noutputs = rest_of_the_model(x)\nmodel = keras.Model(inputs, outputs)\n```\n'

3. encode: 
[0, 49519, 12905, 49119, 50118]

4. decode: 
<s>```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = rest_of_the_model(x)
model = keras.Model(inputs, outputs)
```</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 몇가지 연습용 데이터를 정의합니다
data = tf.constant(["a", "b", "c", "b", "c", "a"])

# 특징값들의 색인을 만들기 위해 StringLookup을 사용합니다
indexer = preprocessing.StringLookup()
indexer.adapt(data)

# 정수형 색인들을 one-hot 벡터로 인코딩하기 위해 CategoryEncoding을 사용합니다
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# (알 수 없는 특징값을 포함하는) 새로운 평가 데이터를 변환합니다
test_data = tf.constant(["a", "b", "c", "d", "e", ""])
encoded_data = encoder(indexer(test_data))
print(encoded_data)



'2. og_source: \n# 몇가지 연습용 데이터를 정의합니다\ndata = tf.constant(["a", "b", "c", "b", "c", "a"])\n\n# 특징값들의 색인을 만들기 위해 StringLookup을 사용합니다\nindexer = preprocessing.StringLookup()\nindexer.adapt(data)\n\n# 정수형 색인들을 one-hot 벡터로 인코딩하기 위해 CategoryEncoding을 사용합니다\nencoder = preprocessing.CategoryEncoding(output_mode="binary")\nencoder.adapt(indexer(data))\n\n# (알 수 없는 특징값을 포함하는) 새로운 평가 데이터를 변환합니다\ntest_data = tf.constant(["a", "b", "c", "d", "e", ""])\nencoded_data = encoder(indexer(test_data))\nprint(encoded_data)\n'

3. encode: 
[0, 10431, 47672, 10278, 6382]

4. decode: 
<s># 몇가지 연습용 데이터를 정의합니다
data = tf.constant(["a", "b", "c", "b", "</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
추가적으로, 적응 가능한 층들은 생성자 인자나 가중치 대입을 통해 상태를 직접적으로 설정할 수 있는 선택지를 항상 제공합니다. 만약 대상 상태값이 층 생성 기간에 알려져 있거나, `adapt()` 호출의 밖에서 계산된다면, 그들은 층의 내부 계산에 기대지 않고 설정될 수 있습니다. 예를 들어, `TextVectorization`, `StringLookup`, 또는 `IntegerLookup` 층들에 대한 외부 사전 파일들이 이미 존재한다면, 그것들은 층의 생성자 인자에 사전 파일에 대한 경로를 넘겨줌으로써 검색표에 직접적으로 불러와질 수 있습니다.



'2. og_source: \n추가적으로, 적응 가능한 층들은 생성자 인자나 가중치 대입을 통해 상태를 직접적으로 설정할 수 있는 선택지를 항상 제공합니다. 만약 대상 상태값이 층 생성 기간에 알려져 있거나, `adapt()` 호출의 밖에서 계산된다면, 그들은 층의 내부 계산에 기대지 않고 설정될 수 있습니다. 예를 들어, `TextVectorization`, `StringLookup`, 또는 `IntegerLookup` 층들에 대한 외부 사전 파일들이 이미 존재한다면, 그것들은 층의 생성자 인자에 사전 파일에 대한 경로를 넘겨줌으로써 검색표에 직접적으로 불러와질 수 있습니다.\n'

3. encode: 
[0, 43998, 19002, 10674, 46873]

4. decode: 
<s>추가적으로, 적응 가능한 층들은 생성자 인자나 가�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 이미지 전처리 층



'2. og_source: \n### 이미지 전처리 층\n'

3. encode: 
[0, 48134, 46747, 46, 20024]

4. decode: 
<s>### 이미지 전처리 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이 층들은 이미지 묶음에 무작위 증강 변환을 적용합니다. 오직 학습 동안에만 활성화됩니다.



'2. og_source: \n이 층들은 이미지 묶음에 무작위 증강 변환을 적용합니다. 오직 학습 동안에만 활성화됩니다.\n'

3. encode: 
[0, 48280, 20024, 46747, 18537]

4. decode: 
<s>이 층들은 이미지 묶음에 무작위 증강 변환을 적용합니다</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = training_model(x)
inference_model = keras.Model(inputs, outputs)
```



'2. og_source: \n```python\ninputs = keras.Input(shape=input_shape)\nx = preprocessing_layer(inputs)\noutputs = training_model(x)\ninference_model = keras.Model(inputs, outputs)\n```\n'

3. encode: 
[0, 49519, 12905, 49119, 50118]

4. decode: 
<s>```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = training_model(x)
inference_model = keras.Model(inputs, outputs)
```</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 몇가지 연습용 데이터를 정의합니다
data = tf.constant([10, 20, 20, 10, 30, 0])

# 특징값들의 색인을 만들기 위해 IntegerLookup을 사용합니다
indexer = preprocessing.IntegerLookup()
indexer.adapt(data)

# 정수형 색인들을 one-hot 벡터로 인코딩하기 위해 CategoryEncoding을 사용합니다
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# (알 수 없는 특징값을 포함하는) 새로운 평가 데이터를 변환합니다
test_data = tf.constant([10, 10, 20, 50, 60, 0])
encoded_data = encoder(indexer(test_data))
print(encoded_data)



'2. og_source: \n# 몇가지 연습용 데이터를 정의합니다\ndata = tf.constant([10, 20, 20, 10, 30, 0])\n\n# 특징값들의 색인을 만들기 위해 IntegerLookup을 사용합니다\nindexer = preprocessing.IntegerLookup()\nindexer.adapt(data)\n\n# 정수형 색인들을 one-hot 벡터로 인코딩하기 위해 CategoryEncoding을 사용합니다\nencoder = preprocessing.CategoryEncoding(output_mode="binary")\nencoder.adapt(indexer(data))\n\n# (알 수 없는 특징값을 포함하는) 새로운 평가 데이터를 변환합니다\ntest_data = tf.constant([10, 10, 20, 50, 60, 0])\nencoded_data = encoder(indexer(test_data))\nprint(encoded_data)\n'

3. encode: 
[0, 10431, 47672, 10278, 6382]

4. decode: 
<s># 몇가지 연습용 데이터를 정의합니다
data = tf.constant([10, 20, 20, 10, 30, 0])
</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이미지 증강 층들은 학습 동안에만 (`Dropout` 층과 유사하게) 활성화된다는 것을 주목하세요.



'2. og_source: \n이미지 증강 층들은 학습 동안에만 (`Dropout` 층과 유사하게) 활성화된다는 것을 주목하세요.\n'

3. encode: 
[0, 48280, 20024, 45209, 10965]

4. decode: 
<s>이미지 증강 층들은 학습 동안에만 (`Dropout` 층과 유사하게)</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이 방법으로는, 전처리가 기기 위에서, 나머지 모델 실행과 동기화되어 일어날 것이며, 이는 GPU 가속에 이익이 될 것이라는 뜻입니다. GPU 위에서 학습시키고 있다면, 이것은 `Normalization` 층에 대해, 그리고 모든 이미지 전처리와 데이터 증강 층들에 대해 최선의 선택입니다.



'2. og_source: \n이 방법으로는, 전처리가 기기 위에서, 나머지 모델 실행과 동기화되어 일어날 것이며, 이는 GPU 가속에 이익이 될 것이라는 뜻입니다. GPU 위에서 학습시키고 있다면, 이것은 `Normalization` 층에 대해, 그리고 모든 이미지 전처리와 데이터 증강 층들에 대해 최선의 선택입니다.\n'

3. encode: 
[0, 48280, 20024, 47672, 7487]

4. decode: 
<s>이 방법으로는, 전처리가 기기 위에서, 나머지 모델</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이 층들은 이미지 모델의 입력들을 표준화하기 위한 것입니다.



'2. og_source: \n이 층들은 이미지 모델의 입력들을 표준화하기 위한 것입니다.\n'

3. encode: 
[0, 48280, 20024, 46747, 18537]

4. decode: 
<s>이 층들은 이미지 모델의 입력들을 표준화하기 위한 것</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이 방법으로는, 전처리가 CPU 위에서, 비동기적으로 일어날 것이며, 모델에 들어가기 전에 임시 저장될 것입니다.



'2. og_source: \n이 방법으로는, 전처리가 CPU 위에서, 비동기적으로 일어날 것이며, 모델에 들어가기 전에 임시 저장될 것입니다.\n'

3. encode: 
[0, 48280, 20024, 47672, 7487]

4. decode: 
<s>이 방법으로는, 전처리가 CPU 위에서, 비동기적으로 일</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
## `adapt()` 메소드



'2. og_source: \n## `adapt()` 메소드\n'

3. encode: 
[0, 48342, 22209, 43199, 43048]

4. decode: 
<s>## `adapt()` 메소드</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
### 핵심 전처리 층



'2. og_source: \n### 핵심 전처리 층\n'

3. encode: 
[0, 48134, 1437, 48589, 8906]

4. decode: 
<s>### 핵심 전처리 층</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측값들을 위해 보존되며, 1번째 색인은 사전 외 값들(`adapt()` 동안 보이지 않은 값들)을 위해 보존됩니다. 이것은 `IntegerLookup`의 `mask_value`와 `oov_value` 생성자 인자들을 사용함으로써 설정할 수 있습니다.



'2. og_source: \n0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측값들을 위해 보존되며, 1번째 색인은 사전 외 값들(`adapt()` 동안 보이지 않은 값들)을 위해 보존됩니다. 이것은 `IntegerLookup`의 `mask_value`와 `oov_value` 생성자 인자들을 사용함으로써 설정할 수 있습니다.\n'

3. encode: 
[0, 288, 45209, 14292, 23133]

4. decode: 
<s>0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측값들을 위해 보존되며, 1번째 색인은 사전 외 값들(`adapt()` 동안 보이지 않은 값들)을 위해 보존됩니다. 이것은 `StringLookup`의 `mask_token`과 `oov_token` 생성자 인자들을 사용함으로써 설정할 수 있습니다.



'2. og_source: \n0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측값들을 위해 보존되며, 1번째 색인은 사전 외 값들(`adapt()` 동안 보이지 않은 값들)을 위해 보존됩니다. 이것은 `StringLookup`의 `mask_token`과 `oov_token` 생성자 인자들을 사용함으로써 설정할 수 있습니다.\n'

3. encode: 
[0, 288, 45209, 14292, 23133]

4. decode: 
<s>0번째 색인은 (빈 문자열 `""`로써 특정해야 하는)결측�</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
이런 모델을 학습시킬 때에는, 최고의 성능을 위해, (위의 텍스트 분류 예제에서 했던 것인) 입력 파이프라인의 부분으로 `TextVectorization` 층을 사용해야 합니다.



'2. og_source: \n이런 모델을 학습시킬 때에는, 최고의 성능을 위해, (위의 텍스트 분류 예제에서 했던 것인) 입력 파이프라인의 부분으로 `TextVectorization` 층을 사용해야 합니다.\n'

3. encode: 
[0, 48280, 20024, 45209, 4333]

4. decode: 
<s>이런 모델을 학습시킬 때에는, 최고의 성능을 위해, (위의</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 표본 데이터: 0과 100,000 사이의 10,000개의 무작위 정수
data = np.random.randint(0, 100000, size=(10000, 1))

# [0, 64] 범위로 값들을 해싱하기 위해 Hashing 층을 사용합니다
hasher = preprocessing.Hashing(num_bins=64, salt=1337)

# 해싱된 값들을 one-hot 인코딩하기 위해 CategoryEncoding 층을 사용합니다
encoder = preprocessing.CategoryEncoding(max_tokens=64, output_mode="binary")
encoded_data = encoder(hasher(data))
print(encoded_data.shape)



'2. og_source: \n# 표본 데이터: 0과 100,000 사이의 10,000개의 무작위 정수\ndata = np.random.randint(0, 100000, size=(10000, 1))\n\n# [0, 64] 범위로 값들을 해싱하기 위해 Hashing 층을 사용합니다\nhasher = preprocessing.Hashing(num_bins=64, salt=1337)\n\n# 해싱된 값들을 one-hot 인코딩하기 위해 CategoryEncoding 층을 사용합니다\nencoder = preprocessing.CategoryEncoding(max_tokens=64, output_mode="binary")\nencoded_data = encoder(hasher(data))\nprint(encoded_data.shape)\n'

3. encode: 
[0, 10431, 1437, 47649, 3602]

4. decode: 
<s># 표본 데이터: 0과 100,000 사이의 10,000개의 무작위 정수
data = np.random.randint</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
`adapt()` 메소드는 Numpy 배열이나 [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) 객체 모두 받습니다. `StringLookup`과 `TextVectorization`의 경우에, 문자열들의 목록을 넘겨줄 수 있습니다:



'2. og_source: \n`adapt()` 메소드는 Numpy 배열이나 [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) 객체 모두 받습니다. `StringLookup`과 `TextVectorization`의 경우에, 문자열들의 목록을 넘겨줄 수 있습니다:\n'

3. encode: 
[0, 12905, 43199, 43048, 12905]

4. decode: 
<s>`adapt()` 메소드는 Numpy 배열이나 [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Datas</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
`adapt()` 메소드를 통해 학습 데이터에 전처리 층을 노출시킴으로써 상태를 설정할 수 있습니다:



'2. og_source: \n`adapt()` 메소드를 통해 학습 데이터에 전처리 층을 노출시킴으로써 상태를 설정할 수 있습니다:\n'

3. encode: 
[0, 12905, 43199, 43048, 12905]

4. decode: 
<s>`adapt()` 메소드를 통해 학습 데이터에 전처리 층을 노출시</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: markdown

2. og_source: 
* `RandomCrop` 층
* `RandomFlip` 층
* `RandomTranslation` 층
* `RandomRotation` 층
* `RandomZoom` 층
* `RandomHeight` 층
* `RandomWidth` 층



'2. og_source: \n* `RandomCrop` 층\n* `RandomFlip` 층\n* `RandomTranslation` 층\n* `RandomRotation` 층\n* `RandomZoom` 층\n* `RandomHeight` 층\n* `RandomWidth` 층\n'

3. encode: 
[0, 3226, 22209, 45134, 347]

4. decode: 
<s>* `RandomCrop` 층
* `RandomFlip` 층
* `RandomTranslation` 층
* `RandomRotation` 층
* `RandomZoom` 층
* `RandomHeight` 층
* `RandomWidth</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# "정수" 출력 방식으로 TextVectorization을 생성합니다
text_vectorizer = preprocessing.TextVectorization(output_mode="int")
# `adapt()`를 통해 사전을 색인합니다
text_vectorizer.adapt(data)

# get_vocabulary()를 통해 색인한 사전을 받아올 수 있습니다
vocab = text_vectorizer.get_vocabulary()
print("Vocabulary:", vocab)

# Embedding + LSTM 모델을 생성

'2. og_source: \n# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다\ndata = tf.constant(\n    [\n        "The Brain is wider than the Sky",\n        "For put them side by side",\n        "The one the other will contain",\n        "With ease and You beside",\n    ]\n)\n# "정수" 출력 방식으로 TextVectorization을 생성합니다\ntext_vectorizer = preprocessing.TextVectorization(output_mode="int")\n# `adapt()`를 통해 사전을 색인합니다\ntext_vectorizer.adapt(data)\n\n# get_vocabulary()를 통해 색인한 사전을 받아올 수 있습니다\nvocab = text_vectorizer.get_vocabulary()\nprint("Vocabulary:", vocab)\n\n# Embedding + LSTM 모델을 생성합니다\ninputs = keras.Input(shape=(1,), dtype="string")\nx = text_vectorizer(inputs)\nx = layers.Embedding(input_dim=len(vocab), output_dim=64)(x)\noutputs = layers.LSTM(1)(x)\nmodel = keras.Model(inputs, outputs)\n\n# (알 수 없는 토큰을 포함하는) 평가 데이터 위에서 모델을 호출합니다\ntest_data = tf.constant(["The Brain is deeper than the sea"])\ntest_output = model(test_data)\n'

3. encode: 
[0, 10431, 46747, 18537, 8906]

4. decode: 
<s># 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data =</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# "binary" 출력 방식(multi-hot)과 ngram=2(모든 bigram 색인)으로
# TextVectorization을 생성합니다
text_vectorizer = preprocessing.TextVectorization(output_mode="binary", ngrams=2)
# `adapt()`를 통해 bigram들을 색인합니다
text_vectorizer.adapt(data)

print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)

# Dense 모델을 생성합니다
inputs = keras.Input(shape=(1,), dtyp

'2. og_source: \n# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다\ndata = tf.constant(\n    [\n        "The Brain is wider than the Sky",\n        "For put them side by side",\n        "The one the other will contain",\n        "With ease and You beside",\n    ]\n)\n# "binary" 출력 방식(multi-hot)과 ngram=2(모든 bigram 색인)으로\n# TextVectorization을 생성합니다\ntext_vectorizer = preprocessing.TextVectorization(output_mode="binary", ngrams=2)\n# `adapt()`를 통해 bigram들을 색인합니다\ntext_vectorizer.adapt(data)\n\nprint(\n    "Encoded text:\\n",\n    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),\n    "\\n",\n)\n\n# Dense 모델을 생성합니다\ninputs = keras.Input(shape=(1,), dtype="string")\nx = text_vectorizer(inputs)\noutputs = layers.Dense(1)(x)\nmodel = keras.Model(inputs, outputs)\n\n# (알 수 없는 토큰을 포함하는) 평가 데이터 위에서 모델을 호출합니다\ntest_data = tf.constant(["The Brain is deeper than the sea"])\ntest_output = model(test_data)\n\nprint("Model output:", test_output)\n'

3. encode: 
[0, 10431, 46747, 18537, 8906]

4. decode: 
<s># 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data =</s>
------------------------- code start--------------------------------------------------------------------------------------
------------------------- code start--------------------------------------------------------------------------------------
1. Code Type: code

2. og_source: 
# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# "tf-idf" 출력 방식(TF-IDF 가중치를 사용하는 multi-hot)과 ngram=2(모든 bigram 색인)으로
# TextVectorization을 생성합니다
text_vectorizer = preprocessing.TextVectorization(output_mode="tf-idf", ngrams=2)
# `adapt()`를 통해 bigram들을 색인합니다
text_vectorizer.adapt(data)

print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)

# Dense 모델을 생성합니다
inputs = keras.Input

'2. og_source: \n# 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다\ndata = tf.constant(\n    [\n        "The Brain is wider than the Sky",\n        "For put them side by side",\n        "The one the other will contain",\n        "With ease and You beside",\n    ]\n)\n# "tf-idf" 출력 방식(TF-IDF 가중치를 사용하는 multi-hot)과 ngram=2(모든 bigram 색인)으로\n# TextVectorization을 생성합니다\ntext_vectorizer = preprocessing.TextVectorization(output_mode="tf-idf", ngrams=2)\n# `adapt()`를 통해 bigram들을 색인합니다\ntext_vectorizer.adapt(data)\n\nprint(\n    "Encoded text:\\n",\n    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),\n    "\\n",\n)\n\n# Dense 모델을 생성합니다\ninputs = keras.Input(shape=(1,), dtype="string")\nx = text_vectorizer(inputs)\noutputs = layers.Dense(1)(x)\nmodel = keras.Model(inputs, outputs)\n\n# (알 수 없는 토큰을 포함하는) 평가 데이터 위에서 모델을 호출합니다\ntest_data = tf.constant(["The Brain is deeper than the sea"])\ntest_output = model(test_data)\n\nprint("Model output:", test_output)\n'

3. encode: 
[0, 10431, 46747, 18537, 8906]

4. decode: 
<s># 층에 adapt하기 위한 몇가지 텍스트 데이터를 정의합니다
data =</s>
------------------------- code start--------------------------------------------------------------------------------------






In [None]:
idx = 3
row = low_score_sample.iloc[idx]
n_id = row.id
label_order = row.label_order
pred_order = row.pred_order
print_incoding_result(new_val_df, n_id, label_order, pred_order)
print('\n\n\n')