In [62]:
# Install a global "ignore" filter so that warnings raised by later cells
# are not shown in the cell outputs.
import warnings
warnings.filterwarnings("ignore")

In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deberta-v3-base/deberta-v3-base/spm.model
/kaggle/input/deberta-v3-base/deberta-v3-base/config.json
/kaggle/input/deberta-v3-base/deberta-v3-base/README.md
/kaggle/input/deberta-v3-base/deberta-v3-base/tokenizer_config.json
/kaggle/input/deberta-v3-base/deberta-v3-base/pytorch_model.bin
/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv


In [16]:
import pandas as pd
#pd.set_option('display.max_colwidth', None)  
# Location of the pretrained DeBERTa-v3-base files attached as a Kaggle input.
model_path = '/kaggle/input/deberta-v3-base/deberta-v3-base'
# Competition tables: labelled training rows and the rows to predict.
train = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
test = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')

In [3]:
# Print the column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2029 entries, 0 to 2028
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              2029 non-null   int64 
 1   body                2029 non-null   object
 2   rule                2029 non-null   object
 3   subreddit           2029 non-null   object
 4   positive_example_1  2029 non-null   object
 5   positive_example_2  2029 non-null   object
 6   negative_example_1  2029 non-null   object
 7   negative_example_2  2029 non-null   object
 8   rule_violation      2029 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 142.8+ KB


In [4]:
# Same summary for the test frame (it has no `rule_violation` target column).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              10 non-null     int64 
 1   body                10 non-null     object
 2   rule                10 non-null     object
 3   subreddit           10 non-null     object
 4   positive_example_1  10 non-null     object
 5   positive_example_2  10 non-null     object
 6   negative_example_1  10 non-null     object
 7   negative_example_2  10 non-null     object
dtypes: int64(1), object(7)
memory usage: 772.0+ bytes


In [5]:
# List the distinct rule texts that occur in the training data.
train['rule'].unique()

array(['No Advertising: Spam, referral links, unsolicited advertising, and promotional content are not allowed.',
       'No legal advice: Do not offer or request legal advice.'],
      dtype=object)

In [7]:
# Number of training rows per subreddit (value_counts orders them most frequent first).
counts = train['subreddit'].value_counts()

# Re-shape the counts into a plain two-column table for display.
subreddit_count = pd.DataFrame(
    {'Subreddit': counts.index, 'Count': counts.values}
)

subreddit_count.head(10)


Unnamed: 0,Subreddit,Count
0,legaladvice,213
1,AskReddit,152
2,soccerstreams,139
3,personalfinance,125
4,relationships,106
5,The_Donald,94
6,TwoXChromosomes,87
7,news,65
8,movies,56
9,videos,50


In [8]:
# Class balance of the binary target column.
train['rule_violation'].value_counts()

rule_violation
1    1031
0     998
Name: count, dtype: int64

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary from the rule texts of the training rows ...
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train['rule'])
# ... and encode every training row's rule with it (one sparse row per training row).
rule_embeddings = tfidf_vectorizer.transform(train['rule'])
# rule_embeddings.shape -> (2029, 18)
print(tfidf_vectorizer.get_feature_names_out())

['advertising' 'advice' 'allowed' 'and' 'are' 'content' 'do' 'legal'
 'links' 'no' 'not' 'offer' 'or' 'promotional' 'referral' 'request' 'spam'
 'unsolicited']


In [33]:
from sklearn.metrics.pairwise import cosine_similarity
# Dynamically retrieve few-shot examples
def retrieve_examples(current_rule, k=2):
    """Return training rows whose rule text is TF-IDF-similar to ``current_rule``.

    The rule is encoded with the fitted ``tfidf_vectorizer`` and compared, by
    cosine similarity, with ``rule_embeddings`` (one vector per training row).
    The positions are ranked by similarity and the matching rows of ``train``
    are returned.

    Args:
        current_rule: Rule text used as the query.
        k: Number of rows to retrieve.

    Returns:
        Rows of ``train`` selected with ``iloc``, ordered from higher to lower
        similarity. The slice ``[-k-1:-1]`` leaves out the single top-ranked
        position, so the result covers ranks 2 through k+1 (at most ``k`` rows).
    """
    rule_vector = tfidf_vectorizer.transform([current_rule])  # one query row, e.g. (1, 18)
    similarity = cosine_similarity(rule_vector, rule_embeddings)[0]
    ascending_order = np.argsort(similarity)
    # The k positions just below the best match, still in ascending order.
    runner_up_positions = ascending_order[-k-1:-1]
    # Flip so the most similar of the selected rows comes first.
    return train.iloc[runner_up_positions[::-1]]

In [34]:
# Combine all text fields
def create_input_text_with_retrieval(row, k=2):
    """Build the model input string for one row.

    Looks up example rows for the row's rule with ``retrieve_examples`` and
    merges their ``positive_example_1`` / ``negative_example_1`` texts with
    the row's own comment body, rule and subreddit.

    Args:
        row: A row providing ``'body'``, ``'rule'`` and ``'subreddit'``.
        k: Forwarded to ``retrieve_examples``.

    Returns:
        A multi-line string with the comment, rule, subreddit, and the
        retrieved positive and negative examples joined by ``' | '``.
    """
    neighbours = retrieve_examples(row['rule'], k)

    def _joined(column):
        # Missing values become empty strings before the texts are joined.
        return ' | '.join(neighbours[column].fillna('').tolist())

    positives = _joined('positive_example_1')
    negatives = _joined('negative_example_1')
    return f"""
    Comment:{row['body']}
    Rule:{row['rule']}
    Subreddit: {row['subreddit']}
    Positive Examples: {positives}
    Negative Examples: {negatives}
    """
# Build the combined input text for every training row.
train['input_text'] = train.apply(create_input_text_with_retrieval, axis=1)
# Copy the target into a column named `labels` (a column with that name is
# often expected downstream), e.g. y = train['labels'].
train['labels'] = train['rule_violation'].values



In [63]:
from transformers import AutoTokenizer

# Load the tokenizer stored alongside the pretrained model files.
tokenizer = AutoTokenizer.from_pretrained(model_path)


def tokenize(sample):
    """Tokenize ``sample['input_text']``, truncating to at most 256 tokens.

    No padding is applied here, so the returned sequences keep their own
    (truncated) length.
    """
    text = sample['input_text']
    return tokenizer(text, truncation=True, max_length=256)
    

In [55]:
from sklearn.model_selection import StratifiedKFold

# Split the training data into 10 stratified folds. For each fold we look up
# which rows form its validation part and write the fold number on those rows.

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Add a new 'fold' column (initial value -1 means "not assigned yet")
train['fold'] = -1

# Assign the folds
for fold_num, (_, val_idx) in enumerate(skf.split(train, train['rule_violation'])):
    # skf.split yields positional indices, while .loc selects by index label.
    # Translate positions to labels first so the assignment stays correct even
    # if `train` no longer carries the default 0..n-1 index (assumes the index
    # labels are unique).
    train.loc[train.index[val_idx], 'fold'] = fold_num

# Every row must have been placed in the validation part of some fold.
assert (train['fold'] >= 0).all(), "some rows were not assigned to a fold"

# Kept so training/evaluation can later be repeated fold by fold
a = sorted(train['fold'].unique())
a  # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [47]:
# Common patterns when working with the Hugging Face `datasets` library:
#ds_train = Dataset.from_pandas(df_train)
#ds_train = Dataset.from_dict(my_dict)
from datasets import Dataset

# Fold 0 is held out for evaluation; all other folds are used for training.
# Filtering rows leaves a non-default index on each frame, which
# Dataset.from_pandas would otherwise store as an extra `__index_level_0__`
# column; preserve_index=False keeps that stray column out of the datasets.
ds_train = Dataset.from_pandas(train[train.fold != 0].copy(), preserve_index=False)
ds_eval = Dataset.from_pandas(train[train.fold == 0].copy(), preserve_index=False)

In [64]:
# Raw columns that are dropped once the text has been tokenized
# (`labels` is not listed, so it stays in the datasets).
remove_columns = [
    'row_id', 'body', 'rule', 'subreddit',
    'positive_example_1', 'positive_example_2',
    'negative_example_1', 'negative_example_2',
    'rule_violation', 'input_text', 'fold',
]

# Tokenize each split, then strip the raw columns from it.
ds_train = ds_train.map(tokenize, num_proc=1)
ds_train = ds_train.remove_columns(remove_columns)
ds_eval = ds_eval.map(tokenize, num_proc=1)
ds_eval = ds_eval.remove_columns(remove_columns)

Map:   0%|          | 0/1826 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [69]:
import numpy as np
import torch
from transformers import TrainingArguments
from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    """Compute ROC AUC from evaluation logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the logits are indexed as
            ``[row, class]``.

    Returns:
        ``{"auc": value}`` where the score is computed from the softmax
        probability of class index 1.
    """
    logits, labels = eval_pred
    # Softmax over the class axis, then keep the probability of class 1.
    class_probs = torch.softmax(torch.tensor(logits), dim=1)
    positive_probs = class_probs[:, 1].numpy()
    return {"auc": roc_auc_score(labels, positive_probs)}
output_path = '/kaggle/working/deberta-v3-base-finetuned'

# Log, evaluate and checkpoint on the same cadence.
# The previous save_steps=345 was larger than the whole run: with the
# 1,826-row training split, the default per-device train batch size and
# gradient_accumulation_steps=4, three epochs amount to roughly 170 optimizer
# steps on a single device (fewer on several) -- TODO confirm on the target
# hardware. No checkpoint was therefore ever written and
# load_best_model_at_end had nothing to restore. Saving at every evaluation
# step lets the best-AUC checkpoint be kept and reloaded.
eval_every_n_steps = 69

train_args = TrainingArguments(
    report_to='none',
    output_dir=output_path,
    optim='adamw_torch',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    warmup_ratio=0.03,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=eval_every_n_steps,
    eval_steps=eval_every_n_steps,
    save_steps=eval_every_n_steps,  # must line up with eval_steps for best-model tracking
    save_total_limit=1,
    metric_for_best_model='auc',
    greater_is_better=True,
    load_best_model_at_end=True
)

In [72]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import DataCollatorWithPadding

# Load the pretrained model with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Wire model, data splits, padding collator and the AUC metric into one Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer),
    tokenizer=tokenizer,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model; training and validation loss per logged step are
# reported in the cell output.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Checkpoint directory to load for inference. It only exists if training
# actually saved a checkpoint at this step, so its presence is checked below
# instead of being assumed.
checkpoint_path = output_path + '/checkpoint-' + str(345)

# Build the same combined input text for the test rows as for training.
test['input_text'] = test.apply(create_input_text_with_retrieval, axis=1)

# Tokenize the test rows and drop the raw columns.
ds_test = Dataset.from_pandas(test).map(tokenize).remove_columns(['row_id',
    'body', 'rule', 'subreddit',
    'positive_example_1', 'positive_example_2',
    'negative_example_1', 'negative_example_2',
    'input_text']
)

test_args = TrainingArguments(".", per_device_eval_batch_size=8, report_to='none')
if os.path.isdir(checkpoint_path):
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
else:
    # No such checkpoint on disk: keep using the in-memory `model` that
    # trainer.train() fine-tuned rather than failing on a missing directory.
    print(f"{checkpoint_path} not found; using the in-memory fine-tuned model")
trainer = Trainer(model=model, args=test_args, tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer))

In [None]:
# Raw logits for the test rows, converted to the probability of class 1.
preds = trainer.predict(ds_test).predictions
probs = torch.softmax(torch.tensor(preds), dim=1)[:, 1].numpy()

test['rule_violation'] = probs

# DataFrame.to_csv returns None when given a path, so keep the frame itself in
# `submission` and write it in a separate step; the cell then displays the
# actual submission table instead of nothing.
submission = test[['row_id', 'rule_violation']]
submission.to_csv('submission.csv', index=False)
submission