In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import gc 
import sys
import lightgbm as lgb
import shutil
import spacy
import optuna
    
#the basics
import pandas as pd, numpy as np, seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm

# this code is essential for applying progress_apply() in pandas series object
tqdm.pandas()

#for model evaluation
from sklearn.model_selection import train_test_split, KFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.chdir('C:\\Users\\MINSEOK\\Desktop\\대학생활\\대외활동\\kaggle\\commonlit-evaluate-student-summaries')

In [2]:
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera, [6](https://stackoverflow.com/a/1094933/1870254), modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

# memory usage of all global variables

# global_vars = list(globals().items())
# variables = [(var, (sys.getsizeof(obj))) for var, obj in global_vars]
# variables = sorted(((var, size_value) for var, size_value in variables), key=lambda x: -x[1])
# variables = [(var, sizeof_fmt(size_value)) for var, size_value in variables]

# for var, size_fmt in variables:
#     print(" {:>30}: {:>8}".format(var, size_fmt))

In [3]:
prompts_train=pd.read_csv('prompts_train.csv')
prompts_test=pd.read_csv('prompts_test.csv')
summaries_train=pd.read_csv('summaries_train.csv')
summaries_test=pd.read_csv('summaries_test.csv')

sample_submission=pd.read_csv('sample_submission.csv')


In [4]:
prompts_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   prompt_id        4 non-null      object
 1   prompt_question  4 non-null      object
 2   prompt_title     4 non-null      object
 3   prompt_text      4 non-null      object
dtypes: object(4)
memory usage: 256.0+ bytes


In [5]:
prompts_train.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [6]:
for val in prompts_train.prompt_text.values:
    print(val.split('\n'))

['Chapter 13 \r', 'As the sequel to what has already been said, we must proceed to consider what the poet should aim at, and what he should avoid, in constructing his plots; and by what means the specific effect of Tragedy will be produced. \r', 'A perfect tragedy should, as we have seen, be arranged not on the simple but on the complex plan. It should, moreover, imitate actions which excite pity and fear, this being the distinctive mark of tragic imitation. It follows plainly, in the first place, that the change of fortune presented must not be the spectacle of a virtuous man brought from prosperity to adversity: for this moves neither pity nor fear; it merely shocks us. Nor, again, that of a bad man passing from adversity to prosperity: for nothing can be more alien to the spirit of Tragedy; it possesses no single tragic quality; it neither satisfies the moral sense nor calls forth pity or fear. Nor, again, should the downfall of the utter villain be exhibited. A plot of this kind wo

In [8]:
prompts_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   prompt_id        2 non-null      object
 1   prompt_question  2 non-null      object
 2   prompt_title     2 non-null      object
 3   prompt_text      2 non-null      object
dtypes: object(4)
memory usage: 192.0+ bytes


In [9]:
prompts_test.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,abc123,Summarize...,Example Title 1,Heading\nText...
1,def789,Summarize...,Example Title 2,Heading\nText...


In [10]:
summaries_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7165 entries, 0 to 7164
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   student_id  7165 non-null   object 
 1   prompt_id   7165 non-null   object 
 2   text        7165 non-null   object 
 3   content     7165 non-null   float64
 4   wording     7165 non-null   float64
dtypes: float64(2), object(3)
memory usage: 280.0+ KB


In [11]:
summaries_train.prompt_id.unique()

array(['814d6b', 'ebad26', '3b9047', '39c16e'], dtype=object)

In [12]:
summaries_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [13]:
summaries_test.head()

Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


Strategy
==========

1. *base model* : deberta v3, lgbm, lstm, randomforest 
2. *training feature* : prompt_question, prompt_text (based on prompt id)
3. *tokenizer* : autotokenizer
4. *divide model for two objective* : content / wording
5. *training feature for wording* : spell_wrong_num, spell_wrong_words(fixed version)
6. *training feature for content* : prompt_question, prompt_text (based on prompt id)
7. *additional idea* : 
- create diff model for four diff prompt_title
> impossible. test data will be diffrent, use groupkfold  
>  ~~since each prompt topic is diff and indep, model for specific title seems effective~~
>> ~~but wording model should not be seperated.~~
- stacking / bagging last pred from diff models

Pipeline
===========

## preprocessor 
+ collect & create useful feature for wording/content scoring

## train & validate function for each model

+ assign each embedding & shape calibration differently
+ depend on model name
+ assign different feature columns for training, depends on target value(content/wording) 
+ use optuna for further improvement

## stacking ensemble

+ final layer model : linear regression
+ ??

## predict function


In [14]:
import transformers

from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords

import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker

from datasets import Dataset, load_dataset, load_from_disk


In [15]:
class cfg:
    model_name=['debertav3base', 'lgbm', 'lstm']
    learning_rate=0.001
    weight_decay=0.005
    hidden_dropout_prob=0.005
    attention_probs_dropout_prob=0.005
    num_train_epochs=5
    n_splits=4
    folds=[0,1,2,3]
    batch_size=30 # maybe need adjustment
    save_steps=100
    max_length=512 # maybe need adjustment

In [16]:
class Preprocessor:
    def __init__(self):
        self.speller=Speller(lang='en')
        self.spellckecker=SpellChecker()
        self.STOP_WORDS=set(stopwords.words('english'))
        
    def word_overlap_count(self,row):
        """count the words that are used in both prompt text and summary text.
        
        If the student use same words appeared in prompt text frequently, it would be minus factor to context score."""
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        
        # Filter out stopwords if they are defined
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        
        # Calculate the count of overlapping words
        return len(set(prompt_words).intersection(set(summary_words)))
    
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int):
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)

        # # Optionally, you can get the frequency of common n-grams for a more nuanced analysis
        # original_ngram_freq = Counter(ngrams(original_words, n))
        # summary_ngram_freq = Counter(ngrams(summary_words, n))
        # common_ngram_freq = {ngram: min(original_ngram_freq[ngram], summary_ngram_freq[ngram]) for ngram in common_ngrams}

        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}
    
    def spell_right(self, text):
        wordlist=text.split()
        wrong_word_cnt=len(list(self.spellckecker.unknown(wordlist)))
        return wrong_word_cnt
    
    def quotes_count(self,row):
        summary=row['text']
        text=row['prompt_text']
        quotes_from_summary=re.findall(r'"([^"]*)"',summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0
        

    def add_train_worddict(self, tokens):
        self.spellckecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def run(self, prompt_df, summary_df):
        prompt_df['prompt_length']=prompt_df['prompt_text'].progress_apply(lambda x: len(word_tokenize(x)))
        print('-----------prompt length col added!-----------')
        prompt_df['prompt_tokens']=prompt_df['prompt_text'].progress_apply(lambda x: word_tokenize(x))
        print('-----------prompt tokens col added!-----------')
        prompt_df['prompt_tokens'].progress_apply(lambda x: self.add_train_worddict(x))
        print('-----------prompt tokens added to worddict!-----------')
        print(prompt_df.columns)
        
        summary_df['summary_length']=summary_df['text'].progress_apply(lambda x: len(word_tokenize(x)))
        print('-----------summary length col added!-----------')
        summary_df['summary_tokens']=summary_df['text'].progress_apply(lambda x: word_tokenize(x))        
        print('-----------summary tokens col added!-----------')
        summary_df['wrong_word_cnt']=summary_df['text'].progress_apply(self.spell_right)
        print('-----------wrong_word_cnt col added!-----------')
        summary_df['fixed_summary_text']=summary_df['text'].progress_apply(self.speller)
        print('-----------fixed_summary_text col added!-----------')
        print(summary_df.columns)
        
        input_df=pd.merge(prompt_df, summary_df, on='prompt_id')
        print(input_df.columns)
        
        input_df['word_overlap_cnt']=input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])



In [17]:
preprocessor=Preprocessor()

train=preprocessor.run(prompts_train, summaries_train)
print(sizeof_fmt(sys.getsizeof(train)))
test=preprocessor.run(prompts_test, summaries_test)
print(sizeof_fmt(sys.getsizeof(test)))

100%|██████████| 4/4 [00:00<00:00, 266.61it/s]


-----------prompt length col added!-----------


100%|██████████| 4/4 [00:00<00:00, 363.53it/s]


-----------prompt tokens col added!-----------


100%|██████████| 4/4 [00:00<00:00, 68.95it/s]


-----------prompt tokens added to worddict!-----------
Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text',
       'prompt_length', 'prompt_tokens'],
      dtype='object')


100%|██████████| 7165/7165 [00:02<00:00, 3172.34it/s]


-----------summary length col added!-----------


100%|██████████| 7165/7165 [00:02<00:00, 3166.94it/s]


-----------summary tokens col added!-----------


100%|██████████| 7165/7165 [00:00<00:00, 10472.81it/s]


-----------wrong_word_cnt col added!-----------


100%|██████████| 7165/7165 [04:54<00:00, 24.34it/s]


-----------fixed_summary_text col added!-----------
Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'summary_length', 'summary_tokens', 'wrong_word_cnt',
       'fixed_summary_text'],
      dtype='object')
Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text',
       'prompt_length', 'prompt_tokens', 'student_id', 'text', 'content',
       'wording', 'summary_length', 'summary_tokens', 'wrong_word_cnt',
       'fixed_summary_text'],
      dtype='object')


100%|██████████| 7165/7165 [00:00<00:00, 13492.07it/s]
100%|██████████| 7165/7165 [00:00<00:00, 132656.14it/s]
100%|██████████| 7165/7165 [00:00<00:00, 7643.56it/s]
100%|██████████| 7165/7165 [00:01<00:00, 6747.06it/s]


64.0 MiB


100%|██████████| 2/2 [00:00<?, ?it/s]


-----------prompt length col added!-----------


100%|██████████| 2/2 [00:00<?, ?it/s]


-----------prompt tokens col added!-----------


100%|██████████| 2/2 [00:00<00:00, 68.95it/s]


-----------prompt tokens added to worddict!-----------
Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text',
       'prompt_length', 'prompt_tokens'],
      dtype='object')


100%|██████████| 4/4 [00:00<00:00, 4000.29it/s]


-----------summary length col added!-----------


100%|██████████| 4/4 [00:00<00:00, 3996.48it/s]


-----------summary tokens col added!-----------


100%|██████████| 4/4 [00:00<?, ?it/s]


-----------wrong_word_cnt col added!-----------


100%|██████████| 4/4 [00:00<?, ?it/s]


-----------fixed_summary_text col added!-----------
Index(['student_id', 'prompt_id', 'text', 'summary_length', 'summary_tokens',
       'wrong_word_cnt', 'fixed_summary_text'],
      dtype='object')
Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text',
       'prompt_length', 'prompt_tokens', 'student_id', 'text',
       'summary_length', 'summary_tokens', 'wrong_word_cnt',
       'fixed_summary_text'],
      dtype='object')


100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 3998.38it/s]

2.3 KiB





In [18]:
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=cfg.n_splits)

# Iterate through the splits and assign fold numbers to validation data
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

In [19]:
train.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,prompt_length,student_id,text,content,wording,summary_length,wrong_word_cnt,fixed_summary_text,word_overlap_cnt,quotes_count,length_ratio,bigram_overlap_count,trigram_overlap_count,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,59,3,1 element of an ideal tragedy is that it shoul...,12,0,0.084406,13,4,0.0
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,30,4,The three elements of an ideal tragedy are: H...,10,0,0.042918,2,0,0.0
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,74,17,Aristotle states that an ideal tragedy should ...,13,4,0.105866,14,7,0.0
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,61,2,One element of an Ideal tragedy is having a co...,13,0,0.087268,12,3,0.0
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,63,9,The 3 ideal of tragedy is how complex you need...,11,1,0.090129,6,0,0.0


In [23]:
train.columns

Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text',
       'prompt_length', 'student_id', 'text', 'content', 'wording',
       'summary_length', 'wrong_word_cnt', 'fixed_summary_text',
       'word_overlap_cnt', 'quotes_count', 'fold'],
      dtype='object')

In [20]:
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Calculate RMSE
    rmse = mean_squared_error(labels, predictions, squared=False)
    
    # Return RMSE as a dictionary
    return {"rmse": rmse}

DebertaV3 Regression Model
==========================

In [39]:
class DebertaReg:
    def __init__(self, model_name, target_cols, max_length, model_dir,
                 hidden_dropout_prob, attention_probs_dropout_prob
                 ):
        self.input_col='train_text'
        self.model_name=model_name
        self.model_dir=model_dir
        self.target_cols=target_cols
        self.max_length=cfg.max_length
        
        self.tokenizer=AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
        self.model_config=AutoConfig.from_pretrained("microsoft/deberta-v3-base")
        
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })

        self.data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer)
        
    def embedding_train(self, train_df):
        labels=[train_df['content'], train_df['wording']]
        tokenized=self.tokenizer(train_df[self.input_col],
                                    padding="longest",
                                    truncation=True,
                                    max_length=self.max_length)
        
        return {
        **tokenized,
        "labels": labels,
    }
            
    def embedding_test(self, test_df):
        labels=[test_df['content'], test_df['wording']]
        tokenized=self.tokenizer(test_df[self.input_col],
                                    padding="longest",
                                    truncation=True,
                                    max_length=self.max_length)
        
        return tokenized
    
    
    def train(self, 
    fold: int,
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    batch_size: int,
    learning_rate: float,
    weight_decay: float,
    num_train_epochs: float,
    save_steps: int
    ):
        
        sep=self.tokenizer.sep_token
        
        train_df[self.input_col] = (
                train_df["prompt_title"] + sep 
                + train_df["prompt_question"] + sep 
                + train_df["fixed_summary_text"]
                )

        valid_df[self.input_col] = (
                valid_df["prompt_title"] + sep 
                + valid_df["prompt_question"] + sep 
                + valid_df["fixed_summary_text"]
                )
        
        train_df=train_df[[self.input_col]+self.target_cols]
        valid_df=valid_df[[self.input_col]+self.target_cols]
        print(train_df.shape, valid_df.shape)
        print(train_df.columns, valid_df.columns)
        
        model=AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base",
                                                                            config=self.model_config)
        
        train_df=Dataset.from_pandas(train_df, preserve_index=False)
        valid_df=Dataset.from_pandas(valid_df, preserve_index=False)
        
        train_df=train_df.map(self.embedding_train, batched=False)
        valid_df=valid_df.map(self.embedding_train, batched=False)
        
        print("----embedding complete----")
        print(sizeof_fmt(sys.getsizeof(train_df)))
        print(sizeof_fmt(sys.getsizeof(valid_df)))
        
        
        gc.collect()
        torch.cuda.empty_cache()
        
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        # Create a trainer for model training
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_df,
            eval_dataset=valid_df,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        # Train the model
        trainer.train()
        
        print('----trainer complete----')
        
        # Save the trained model and tokenizer
        model.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)
        
        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()

    def predict(self, 
    test_df: pd.DataFrame,
    batch_size: int,
    fold: int,
    ):

        sep = self.tokenizer.sep_token
        
        # Create input text for test data
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["fixed_summary_text"]
                )
        test_df[self.input_col] = in_text

        # Select the relevant columns
        test_df = test_df[[self.input_col]]
    
        # Create a dataset from the test data
        test_df = Dataset.from_pandas(test_df, preserve_index=False) 
        test_df = test_df.map(self.embedding_test, batched=False)

        # Load the trained content score prediction model
        
        ## NEED ADJUSTMENT -> FROM PRETRAINED : model_dir
        model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base")
        model.eval()
        
        # Define model prediction arguments
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size =batch_size,
            dataloader_drop_last = False,
        )

        # Initialize a trainer for inference
        infer_content = Trainer(
                    model = model, 
                    tokenizer=self.tokenizer,
                    data_collator=self.data_collator,
                    args = test_args)

        # Perform predictions
        preds = infer_content.predict(test_df)[0]
        pred_df=pd.DataFrame(preds, columns=["content_pred", "wording_pred"]
        )
        
        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()
        
        return pred_df
        

In [34]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        targets: str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length: int
    ):

    # Delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]

        model_dir =  f"{model_name}/fold_{fold}"

        csr = DebertaReg(
            model_name=model_name,
            target_cols=targets,
            model_dir=model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

# Define a function for validation (predicting oof data)
def validate(
    train_df: pd.DataFrame,
    targets: str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
    batch_size: int
    ) -> pd.DataFrame:
    """Predict out-of-fold (oof) data"""
    for fold in range(cfg.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        model_dir =  f"{model_name}/fold_{fold}"
        
        csr = DebertaReg(
            model_name=model_name,
            target_cols=targets,
            model_dir=model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        
        pred_df = csr.predict(
            test_df=valid_data, 
            batch_size=batch_size,
            fold=fold
        )
        
        train_df.loc[valid_data.index, "content_pred"] = pred_df["content_pred"].values
        train_df.loc[valid_data.index, "wording_pred"] = pred_df["wording_pred"].values

    return train_df
    
# Define a function for prediction (using mean folds)
def predict(
    test_df: pd.DataFrame,
    targets: str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
    batch_size: int
    ):
    """Predict using mean of folds"""

    for fold in range(cfg.n_splits):
        print(f"fold {fold}:")
        
        model_dir =  f"{model_name}/fold_{fold}"

        csr = DebertaReg(
            model_name=model_name,
            target_cols=targets,
            model_dir=model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        
        pred_df = csr.predict(
            test_df=test_df, 
            batch_size=batch_size,
            fold=fold
        )
        
        test_df[f"content_pred_{fold}"] = pred_df['content_pred'].values
        test_df[f"wording_pred_{fold}"] = pred_df['wording_pred'].values
    
    test_df["content_pred"] = test_df[[f"content_pred_{fold}" for fold in range(cfg.n_splits)]].mean(axis=1)
    test_df["wording_pred"] = test_df[[f"wording_pred_{fold}" for fold in range(cfg.n_splits)]].mean(axis=1)

    return test_df

In [40]:
t = ["wording", "content"]
    
train_by_fold(
    train,
    model_name="debertav3",
    save_each_model=False,
    targets=t,
    learning_rate=cfg.learning_rate,
    hidden_dropout_prob=cfg.hidden_dropout_prob,
    attention_probs_dropout_prob=cfg.attention_probs_dropout_prob,
    weight_decay=cfg.weight_decay,
    num_train_epochs=cfg.num_train_epochs,
    n_splits=cfg.n_splits,
    batch_size=cfg.batch_size,
    save_steps=cfg.save_steps,
    max_length=cfg.max_length
)


train = validate(
    train,
    targets=t,
    save_each_model=False,
    model_name=cfg.model_name,
    hidden_dropout_prob=cfg.hidden_dropout_prob,
    attention_probs_dropout_prob=cfg.attention_probs_dropout_prob,
    max_length=cfg.max_length
)

for target in ["content","wording"]:
    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

test = predict(
    test,
    targets=t,
    save_each_model=False,
    model_name=cfg.model_name,
    hidden_dropout_prob=cfg.hidden_dropout_prob,
    attention_probs_dropout_prob=cfg.attention_probs_dropout_prob,
    max_length=cfg.max_length
)



fold 0:
(5108, 3) (2057, 3)
Index(['train_text', 'wording', 'content'], dtype='object') Index(['train_text', 'wording', 'content'], dtype='object')


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 5108/5108 [00:02<00:00, 1822.60 examples/s]
Map: 100%|██████████| 2057/2057 [00:00<00:00, 2249.70 examples/s]


----embedding complete----
48.0 B
48.0 B


  0%|          | 0/855 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 8.00 GiB total capacity; 13.59 GiB already allocated; 0 bytes free; 13.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

LGBM model
============

In [36]:
lgbm_cfg={'num_leaves':31, 'min_data_in_leaf':20, 
        'early_stopping_round':10,
        'objective':'regression', 'metric':'rmse',
        'learning_rate':0.001, 'num_iteration':200, 'device':'gpu', 'verbosity':0}

In [37]:
import joblib

class LGBMReg:
    def __init__(self, input_cols, target_cols, model_dir, model_name, lgbm_cfg):
        self.input_cols=input_cols
        self.model_name=model_name
        self.model_dir=model_dir
        self.target_cols=target_cols
            
        self.lgbm_parameter=lgbm_cfg
        
    def train(self, 
              fold :int,
              train_df: pd.DataFrame,
              valid_df: pd.DataFrame,
              ):
        
        train_df=train_df[self.input_cols+self.target_cols]
        valid_df=valid_df[self.input_cols+self.target_cols]
  
        print(train_df.shape, valid_df.shape)
        
        for target in self.target_cols:
        
            train_dataset=lgb.Dataset(train_df[self.input_cols], label=train_df[target])
            valid_dataset=lgb.Dataset(valid_df[self.input_cols], label=valid_df[target])
            
            model=lgb.train(params=self.lgbm_parameter, train_set=train_dataset, valid_sets=valid_dataset)
            
            joblib.dump(model, f'{self.model_dir}/lgbm_{target}_{fold}.pkl')
            
            del model
            
            gc.collect()
            torch.cuda.empty_cache()
        
    def predict(self,
                test_df: pd.DataFrame,
                fold: int,
                ):
        
        pred_df=pd.DataFrame(columns=['content_pred', 'wording_pred'])
        
        test_df=test_df[self.input_cols]
        
        for target in self.target_cols:
        
            model_infer=joblib.load(f'{self.model_dir}/lgbm_{target}_{fold}.pkl')
            
            preds=model_infer.predict(test_df)
            
            pred_df[f'{target}_pred']=preds
            
        return pred_df

In [38]:
def train_by_fold(
    train_df: pd.DataFrame,
    model_name: str,
    targets: list[str],
    inputs: list[str],
    lgbm_cfg=lgbm_cfg
):

    if os.path.exists(model_name):
        shutil.rmtree(model_name)
        
    os.mkdir(model_name)
    
    for fold in cfg.folds:
        train_data=train_df[train_df['fold']!=fold]
        valid_data=train_df[train_df['fold']==fold]
        
        model_dir=f'{model_name}'
            
        model=LGBMReg(model_name=model_name,
                    target_cols=targets, 
                    input_cols=inputs,
                    model_dir=model_dir,
                    lgbm_cfg=lgbm_cfg)
        
        model.train(fold=fold,
                    train_df=train_data,
                    valid_df=valid_data)
    
def validate(train_df:pd.DataFrame,
                targets: list[str],
                inputs: list[str],
                model_name: str,
                lgbm_cfg=lgbm_cfg
                ):
    
    for fold in cfg.folds:
        
        valid_data=train_df[train_df["fold"]==fold]
        model_dir=f'{model_name}'
        
        model=LGBMReg(model_name=model_name,
                        target_cols=targets,
                        input_cols=inputs,
                        model_dir=model_dir,
                        lgbm_cfg=lgbm_cfg
                        )
        
        pred_df=model.predict(test_df=valid_data,
                                fold=fold)
        
        train_df.loc[valid_data.index, f'content_pred']=pred_df['content_pred'].values
        train_df.loc[valid_data.index, f'wording_pred']=pred_df['wording_pred'].values
        
    return train_df

def predict(test_df:pd.DataFrame,
                targets: list[str],
                inputs: list[str],
                n_splits: int,
                model_name: str,
                lgbm_cfg=lgbm_cfg):
    
    for fold in cfg.folds:
        
        model_dir=f'{model_name}'
        
        model=LGBMReg(model_name=model_name,
                        target_cols=targets,
                        input_cols=inputs,
                        model_dir=model_dir,
                        lgbm_cfg=lgbm_cfg
                        )
        
        pred_df=model.predict(test_df=test_df,
                                fold=fold)
        
        test_df[f'content_pred_{fold}']=pred_df[f'content_pred'].values
        test_df[f'wording_pred_{fold}']=pred_df[f'wording_pred'].values
        
    test_df[f'content_pred']=test_df[[f'content_pred_{fold}' for fold in range(n_splits)]].mean(axis=1)
    test_df[f'wording_pred']=test_df[[f'wording_pred_{fold}' for fold in range(n_splits)]].mean(axis=1)
    
    return test_df    

In [39]:
def lgbm_optuna(n_trials, train_df, test_df):

    targets=['wording', 'content']
    inputs=['summary_length', 'word_overlap_cnt', 'quotes_count','wrong_word_cnt',
            'length_ratio', 'bigram_overlap_count', 'trigram_overlap_count']
    
    
    def objective(trial):
        num_leaves=trial.suggest_int("num_leaves", 20, 50, 5)
        num_iteration=trial.suggest_int("num_iteration", 100, 1000, 100)
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 10, 100, 10)
        early_stopping_round=trial.suggest_int("early_stopping_round", 10, 100, 10)
        learning_rate=trial.suggest_float("learning_rate", 0.0001, 0.1)
        
        lgbm_cfg={'num_leaves' : num_leaves,
                'min_data_in_leaf':min_data_in_leaf, 
                'early_stopping_round':early_stopping_round,
                'objective':'regression', 
                'metric': 'rmse',
                'learning_rate':learning_rate, 
                'num_iteration': num_iteration,
                'device':'gpu',
                'verbosity': 0
                }


        train_by_fold(train_df=train_df, model_name="lgbm", 
                    targets=targets, inputs=inputs, lgbm_cfg=lgbm_cfg)

        train=validate(train_df=train_df, targets=targets,
                    inputs=inputs, model_name='lgbm', lgbm_cfg=lgbm_cfg)

        rmse=0
        
        for target in ['content', 'wording']:
            rmse+=mean_squared_error(train[target], train[f'{target}_pred'], squared=False)
    
        return rmse/2
 

    def print_progress(study, trial):
        print(f"Number of completed trials: {len(study.trials)}")
    
    study=optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, callbacks=[print_progress])
    
    lgbm_cfg=study.best_trial.params
    
    train_by_fold(train_df=train, model_name="lgbm", 
                    targets=targets, inputs=inputs, lgbm_cfg=lgbm_cfg)

    train=validate(train_df=train, targets=targets,
                    inputs=inputs, model_name='lgbm', lgbm_cfg=lgbm_cfg)
    
    for target in ['content', 'wording']:
            rmse=mean_squared_error(train[target], train[f'{target}_pred'], squared=False)
            print(f'cv {target} rmse : {rmse}')
    
    fig = optuna.visualization.plot_param_importances(study)
    fig.show()
    
    test_df=predict(test_df=test_df, targets=targets, inputs=inputs,
                n_splits=cfg.n_splits, model_name='lgbm', lgbm_cfg=lgbm_cfg)
        

In [40]:
lgbm_optuna(n_trials=100, train_df=train, test_df=test)

[I 2023-11-05 01:54:45,800] A new study created in memory with name: no-name-4d361ad5-c360-4fb9-9cd7-11491a37a599


(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:54:59,483] Trial 0 finished with value: 0.6083415523587224 and parameters: {'num_leaves': 40, 'num_iteration': 800, 'min_data_in_leaf': 10, 'early_stopping_round': 80, 'learning_rate': 0.04964071300582693}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 1
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:06,718] Trial 1 finished with value: 0.6091007246455553 and parameters: {'num_leaves': 40, 'num_iteration': 200, 'min_data_in_leaf': 20, 'early_stopping_round': 50, 'learning_rate': 0.07261919152103168}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 2
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:13,924] Trial 2 finished with value: 0.6120861706246754 and parameters: {'num_leaves': 30, 'num_iteration': 200, 'min_data_in_leaf': 60, 'early_stopping_round': 90, 'learning_rate': 0.09696662986978291}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 3
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:31,788] Trial 3 finished with value: 0.6178361463164226 and parameters: {'num_leaves': 50, 'num_iteration': 700, 'min_data_in_leaf': 60, 'early_stopping_round': 70, 'learning_rate': 0.016136136825667103}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 4
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:39,369] Trial 4 finished with value: 0.6084465064059227 and parameters: {'num_leaves': 40, 'num_iteration': 100, 'min_data_in_leaf': 20, 'early_stopping_round': 90, 'learning_rate': 0.09412801513082038}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 5
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:46,367] Trial 5 finished with value: 0.6103264482701063 and parameters: {'num_leaves': 30, 'num_iteration': 500, 'min_data_in_leaf': 40, 'early_stopping_round': 60, 'learning_rate': 0.051395947466641466}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 6
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:55:53,867] Trial 6 finished with value: 0.6153891032629555 and parameters: {'num_leaves': 35, 'num_iteration': 300, 'min_data_in_leaf': 70, 'early_stopping_round': 10, 'learning_rate': 0.02910420471893114}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 7
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:56:02,157] Trial 7 finished with value: 0.6106416904808605 and parameters: {'num_leaves': 35, 'num_iteration': 400, 'min_data_in_leaf': 40, 'early_stopping_round': 90, 'learning_rate': 0.07587365608952189}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 8
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:56:09,051] Trial 8 finished with value: 0.6091857727701145 and parameters: {'num_leaves': 30, 'num_iteration': 900, 'min_data_in_leaf': 30, 'early_stopping_round': 80, 'learning_rate': 0.0956840905928222}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 9
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:56:13,937] Trial 9 finished with value: 0.6116285532070724 and parameters: {'num_leaves': 30, 'num_iteration': 800, 'min_data_in_leaf': 40, 'early_stopping_round': 30, 'learning_rate': 0.09591426789522102}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 10
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[I 2023-11-05 01:56:20,709] Trial 10 finished with value: 0.6153230537229862 and parameters: {'num_leaves': 20, 'num_iteration': 1000, 'min_data_in_leaf': 100, 'early_stopping_round': 40, 'learning_rate': 0.04112997764811223}. Best is trial 0 with value: 0.6083415523587224.


Number of completed trials: 11
(5108, 9) (2057, 9)
(5156, 9) (2009, 9)
(5169, 9) (1996, 9)
(6062, 9) (1103, 9)


[W 2023-11-05 01:56:49,439] Trial 11 failed with parameters: {'num_leaves': 45, 'num_iteration': 600, 'min_data_in_leaf': 10, 'early_stopping_round': 100, 'learning_rate': 0.0012285994276365897} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\MINSEOK\anaconda3\envs\torch\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\MINSEOK\AppData\Local\Temp\ipykernel_21276\4177901319.py", line 27, in objective
    train_by_fold(train_df=train_df, model_name="lgbm",
  File "C:\Users\MINSEOK\AppData\Local\Temp\ipykernel_21276\1305135552.py", line 26, in train_by_fold
    model.train(fold=fold,
  File "C:\Users\MINSEOK\AppData\Local\Temp\ipykernel_21276\368387694.py", line 28, in train
    model=lgb.train(params=self.lgbm_parameter, train_set=train_dataset, valid_sets=valid_dataset)
  File "c:\Users\MINSEOK\anaconda3\envs\torch\lib\site-packages\lightgbm\engine.py", line 27

KeyboardInterrupt: 

In [94]:
targets=['wording', 'content']
inputs=['summary_length', 'word_overlap_cnt', 'quotes_count','wrong_word_cnt',
        'length_ratio', 'bigram_overlap_count', 'trigram_overlap_count']

train_by_fold(train_df=train, model_name="lgbm", 
              targets=targets, inputs=inputs, lgbm_cfg=lgbm_cfg)

train=validate(train_df=train, targets=targets,
               inputs=inputs, model_name='lgbm', lgbm_cfg=lgbm_cfg)

for target in ['content', 'wording']:
    rmse=mean_squared_error(train[target], train[f'{target}_pred'], squared=False)
    print(f'cv {target} rmse : {rmse}')
    
test=predict(test_df=test, targets=targets, inputs=inputs,
             n_splits=cfg.n_splits, model_name='lgbm', lgbm_cfg=lgbm_cfg)

fold 0:


KeyError: 'fold'

In [74]:
train.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,prompt_length,student_id,text,content,wording,summary_length,wrong_word_cnt,fixed_summary_text,word_overlap_cnt,quotes_count,fold,content_pred,wording_pred
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,59,3,1 element of an ideal tragedy is that it shoul...,12,0,0.0,-0.06927,-0.062762
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,30,4,The three elements of an ideal tragedy are: H...,10,0,0.0,-0.222421,-0.196068
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,74,17,Aristotle states that an ideal tragedy should ...,13,4,0.0,0.019454,-0.069977
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,61,2,One element of an Ideal tragedy is having a co...,13,0,0.0,-0.01296,-0.014932
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,699,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,63,9,The 3 ideal of tragedy is how complex you need...,11,1,0.0,-0.01296,-0.069977


In [None]:
class LSTMReg:
    