# All Models

#### This notebook originates from the [work](https://www.kaggle.com/code/nhttinnguynbch/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned) of [@nhttinnguynbch](https://www.kaggle.com/nhttinnguynbch)<br> Which originates from the [work](https://www.kaggle.com/code/siddhvr/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned) of [@siddhvr](https://www.kaggle.com/siddhvr)

On this notebook, I tried to try on all the other models and compare them with `debertav3base`.<br>
However, the performance of other models were not as good as `debertav3base`.<br>
Therefore, I had to move back to `debertav3base` and added symspellpy for preprocessing part (based on my research on this [notebook](https://www.kaggle.com/code/jasonheesanglee/spellcheck-tool-comparison)) and optuna for LGBM.

# Install, Import

In [1]:
!pip install /kaggle/input/autocorrect/autocorrect-2.6.1.tar
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
!pip install /kaggle/input/symspell-677/editdistpy-0.1.3-cp310-cp310-linux_x86_64.whl
!pip install /kaggle/input/symspell-677/symspellpy-6.7.7-py3-none-any.whl

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=3e9bf84dac1038066f4f5f521064dac850c772b46f9c960667dbd8f865a9e613
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Processing /kaggle/input/symspell-677/editdistpy-0.1.3-cp310-cp310-linux_x86_64.whl
Installing collected packages: editdistpy
Successfully installed editdistpy-0.1.3
Processing /kaggle/input/symspell-677/symspellpy-6.7.7

In [2]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import logging
logging.disable(logging.ERROR)

from typing import List
import numpy as np
import pandas as pd
import shutil
import json
import transformers
import sentencepiece
import pkg_resources

from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
# from transformers import T5ForConditionalGeneration, T5TokenizerFast, T5Config
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
from symspellpy import SymSpell, Verbosity
import lightgbm as lgb
import optuna


warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

# Configuration

In [3]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [4]:
files = ['debertav3base', # files[0]
         'albert-large-v2', # files[1]
         'bert-base-uncased', # files[2]
         'bert-large-uncased', # files[3]
         'distilroberta-base', # files[4]
         'distilbert-base-uncased', # files[5]
         'google-electra-base-discriminator', # files[6]
         'facebook-bart-base', # files[7] # Not working
         'facebook-bart-large', # files[8]
         'funnel-transformer-small', # files[9]
         'funnel-transformer-large', # files[10]
         'roberta-base', # files[11]
         'roberta-large', # files[12]
         't5-base', # files[13] # don't use
         't5-large', # files[14] # don't use
         'xlnet-base-cased', # files[15]
         'xlnet-large-cased' # files[16]
         ]

In [5]:
IS_DEBUG = True
OPTUNA = True
FOLD = 'G_FOLD' # 'G_FOLD' or 'S_FOLD'

SEP_TKN = ' #### ' # ' [SEP]' or ' #### '

# if SEP_TKN == ' #### ':
CLS_TKN = ''
# else:
#     CLS_TKN = ' [CLS] '
    
class CFG:
    model_name=files[0]
    learning_rate=0.000016   #0.000015
    weight_decay=0.007        #0.02
    hidden_dropout_prob=0.007
    attention_probs_dropout_mprob=0.007
    num_train_epochs=1 if IS_DEBUG else 4
    n_splits= 2 if IS_DEBUG else 4
    batch_size= 2 if IS_DEBUG else 8
    random_seed=42
    save_steps=1000 if IS_DEBUG else 100
    max_length= 10 if IS_DEBUG else 512

In [6]:
method = ['py_and_sym', 'pyspell_only', 'symspell_only']

freq_dict_list = ["/content/symspell-677/symspell_freq_dict.txt",
            "/content/symspell-677/frequency_dictionary_en_82_765.txt",
            "/content/symspell-677/frequency_bigramdictionary_en_243_342.txt"]

yes = True
no = False

manage_misspelled_words = yes
misspelled_word_method = method[0]
freq_dict = freq_dict_list[1].split('/')[-1]
freq_dict


'frequency_dictionary_en_82_765.txt'

In [7]:
if (manage_misspelled_words==yes) & (misspelled_word_method == 'py_and_sym'):
    pyspell_detector = yes
    symspell_corrector = yes
    misspell_counter = 0

if (manage_misspelled_words==yes) & (misspelled_word_method == 'pyspell_only'):
    pyspell_detector = yes
    symspell_corrector = no
    misspell_counter = 1
    
if (manage_misspelled_words==yes) & (misspelled_word_method == 'symspell_only'):
    pyspell_detector = no
    symspell_corrector = yes
    freq_dict = freq_dict_list[1]
    misspell_counter = 2

## Dataload

In [8]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Preprocess

[Using features]

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker


In [9]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        if model_name == files[0]:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/{model_name}')
        elif model_name == (files[13] or files[14]):
            self.tokenizer = T5TokenizerFast.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        # input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [10]:
train = pd.read_csv('/kaggle/input/datapreprocess/original_train.csv')
# train = preprocessor.run(prompts_train, summaries_train, mode="train")

test = preprocessor.run(prompts_test, summaries_test, mode="test")

train.head()

100%|██████████| 4/4 [00:00<00:00, 6058.94it/s]
100%|██████████| 4/4 [00:00<00:00, 6125.31it/s]
100%|██████████| 4/4 [00:00<00:00, 3568.86it/s]
100%|██████████| 4/4 [00:00<00:00, 3885.41it/s]
100%|██████████| 4/4 [00:00<00:00, 4032.02it/s]
100%|██████████| 4/4 [00:00<00:00, 3128.33it/s]


Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4


# SymSpell & Pyspellcheck

In [11]:
def pyspellchecker_detector(sentence):
    sentence = re.sub(r'[^\w\s]','',sentence)
    spell = SpellChecker()
    tokens = sentence.split(' ')
    mis_tokens = []
    for token in spell.unknown(tokens):
        if token.isalpha():
            mis_tokens.append(token)
    return mis_tokens

def symspellpy_corrector(mis_tokens, freq_dict_):
    try:
        sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
        freq_dict = pkg_resources.resource_filename("symspellpy", freq_dict_)
        sym_spell.load_dictionary(freq_dict, term_index=0, count_index=1)
        corrected_token = {}
        for token in tqdm(mis_tokens):
            terms = sym_spell.lookup_compound(token, 
                                              max_edit_distance=2) 
            if token not in corrected_token.keys():
                corrected_token[token] = terms[0].term
        return corrected_token

    except UnicodeDecodeError:
        return mis_tokens

def py_sym_checker(df, column, new_col, freq_dict_):
    try:
        mis_tokens = []
        for row_num in tqdm(range(df.shape[0])):
            sentence = df[column][row_num]
            for word in pyspellchecker_detector(sentence):
                mis_tokens.append(word)
        
        mis_token_rep = symspellpy_corrector(mis_tokens, freq_dict_)
        
        temp = []
        for row_num in tqdm(range(df.shape[0])):
            sentence = df[column][row_num]
            tokens = sentence.split(' ')
            temp_str = ''
            for token in tokens:
                if token in mis_token_rep.keys():
                    temp_str = temp_str + " " + mis_token_rep.get(token)
                else:
                    temp_str = temp_str + " " + token
            temp.append(temp_str)
            
        # df[column] = pd.Series(temp)
        return pd.Series(temp)
    
    except UnicodeDecodeError:
        return df[new_col]
    
def symspellpy_correction(df, column, new_col, freq_dict=freq_dict):
    try:        
        temp = []
        for row_num in tqdm(range(df.shape[0])):
            sentence = df[column][row_num]
            sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
            freq_dict = freq_dict
            sym_spell.load_dictionary(freq_dict, term_index=0, count_index=1)
            terms = sym_spell.lookup_compound(sentence, 
                                              max_edit_distance=2) 

            corrected_sentence = terms[0].term
            temp.append(corrected_sentence)
            
        # df[column] = pd.Series(temp)
        return pd.Series(temp)
    
    except UnicodeDecodeError:
        return df[new_col]
    
def pyspell_correction(df, column, new_col):
    try:        
        temp_total = []
        for row_num in tqdm(range(df.shape[0])):
            sentence = df[column][row_num]
            
            spell = SpellChecker()
            tokens = nltk.word_tokenize(sentence)
            text_length = len(tokens)

            mis_tokens = [token for token in spell.unknown(tokens) if token.isalpha()]
            temp = []
            corrected_words = []
            for word in mis_tokens:
                corrected_word = spell.correction(word)
                temp.append({word : corrected_word})
                corrected_words.append(corrected_word)

            temp_1 = []
            for word in tokens:
                for set_ in temp:
                    if list(set_.keys())[0] == word:
                        word = list(set_.values())[0]
                        if word in temp_1:
                            continue
                        else:
                            temp_1.append(word)
                if word in temp_1:
                    continue
                else:
                    temp_1.append(word)

            corrected_sentence = ''
            for word in temp_1:
                try:
                    if (word.isalpha() or word.isnumeric()) == True:
                        corrected_sentence = corrected_sentence + word + ' '
                    elif word in [',', '.', '"', "'", '(', ')', '[', ']', '{', '}']:
                        corrected_sentence = corrected_sentence + word
                    else:
                        corrected_sentence = corrected_sentence + word
                except:
                    continue
            corrected_sentence = corrected_sentence.replace('  ', ' ').strip()
            temp_total.append(corrected_sentence)
            
        # df[column] = pd.Series(temp_total)
        return pd.Series(temp_total)
    except UnicodeDecodeError:
        return df[new_col]

In [12]:
if manage_misspelled_words:
    train = pd.read_csv('/kaggle/input/datapreprocess/py_sym_train.csv')
    if misspell_counter == 0:
        test['symspell_corr'] = py_sym_checker(test, 'text', 'symspell_corr',freq_dict)
        print(f"py & sym test_processed['text'][0] =\n{test['symspell_corr'][0]}")
    elif misspell_counter == 1:
        test['symspell_corr'] = pyspell_correction(test, 'text', 'symspell_corr')
        print(f"py only test_processed['text'][0] =\n{test['symspell_corr'][0]}")
    elif misspell_counter == 2:
        test['symspell_corr'] = symspellpy_correction(test, 'text', 'symspell_corr', freq_dict)
        print(f"sym only test_processed['text'][0] =\n{test['symspell_corr'][0]}")
else:
    
    print('no change')

100%|██████████| 4/4 [00:00<00:00, 17.72it/s]
0it [00:00, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 9742.87it/s]

py & sym test_processed['text'][0] =
 Example text 1





# Fold

In [13]:
if FOLD == 'S_FOLD':
    S_kfold = StratifiedKFold(n_splits=CFG.n_splits)
    for i, (_, val_index) in enumerate(S_kfold.split(train, y=train["prompt_id"])):
        train.loc[val_index, "fold"] = i
    print("It's StratifiedKFold")
    
if FOLD == 'G_FOLD':
    gkf = GroupKFold(n_splits=CFG.n_splits)
    for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
        train.loc[val_index, "fold"] = i
    print("It's GroupKFold")
    
train.head(5)

It's GroupKFold


Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,symspell_corr,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0,The third wave was an experiment to see how p...,0.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0,They would rub it up with soda to make the sm...,1.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2,"In Egypt, there were many occupations and soc...",1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0,The highest class was Pharaohs these people w...,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4,The Third Wave developed rapidly because the...,0.0


In [14]:
# train.text = train.prompt_question + ' #### ' + train.fixed_summary_text
# train = train.drop(columns=['prompt_question', 'fixed_summary_text'])
# display(train.head(5))

# test.text = test.prompt_question + ' #### '  + test.fixed_summary_text
# test = test.drop(columns=['prompt_question', 'fixed_summary_text'])
# display(test.head(5))



train.text = train.prompt_question + ' #### ' + train.fixed_summary_text + ' #### ' + train.symspell_corr
train = train.drop(columns=['prompt_question', 'fixed_summary_text', 'symspell_corr'])
display(train.head(5))

test.text = test.prompt_question + ' #### '  + test.fixed_summary_text + ' #### ' + test.symspell_corr
test = test.drop(columns=['prompt_question', 'fixed_summary_text', 'symspell_corr'])
display(test.head(5))



# train.text = train.prompt_question + ' #### ' + train.symspell_corr
# train = train.drop(columns=['prompt_question', 'fixed_summary_text', 'symspell_corr'])
# display(train.head(5))

# test.text = test.prompt_question + ' #### '  + test.symspell_corr
# test = test.drop(columns=['prompt_question', 'fixed_summary_text', 'symspell_corr'])
# display(test.head(5))



Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold
0,000e8c3c7ddb,814d6b,Summarize how the Third Wave developed over su...,0.205683,0.380538,64,5,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0,0.0
1,0020ae56ffbf,ebad26,Summarize the various ways the factory would u...,-0.548304,0.506755,54,2,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0,1.0
2,004e978e639e,3b9047,"In complete sentences, summarize the structure...",3.128928,4.231226,269,32,Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2,1.0
3,005ab0199905,3b9047,"In complete sentences, summarize the structure...",-0.210614,-0.471415,28,5,Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0,1.0
4,0070c9e7af47,814d6b,Summarize how the Third Wave developed over su...,3.272894,3.219757,232,29,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4,0.0


Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,000000ffffff,abc123,Summarize... #### Example text 1 #### Example...,3,0,Example Title 1,Heading\nText...,3,0,0,0.0,0,0.0,0
1,111111eeeeee,def789,Summarize... #### Example text 2 #### Example...,3,0,Example Title 2,Heading\nText...,3,0,0,0.0,0,0.0,0
2,222222cccccc,abc123,Summarize... #### Example text 3 #### Example...,3,0,Example Title 1,Heading\nText...,3,0,0,0.0,0,0.0,0
3,333333dddddd,def789,Summarize... #### Example text 4 #### Example...,3,0,Example Title 2,Heading\nText...,3,0,0,0.0,0,0.0,0


In [15]:
train.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'summary_length', 'splling_err_num', 'prompt_title', 'prompt_text',
       'prompt_length', 'word_overlap_count', 'bigram_overlap_count',
       'bigram_overlap_ratio', 'trigram_overlap_count',
       'trigram_overlap_ratio', 'quotes_count', 'fold'],
      dtype='object')

## Model Function Definition

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

## Deberta Regressor
*Edited some codes to fit my need.*

In [17]:
class ContentScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        # Tokenizer

        if model_name == files[0]:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/{model_name}')
        elif model_name == (files[13] or files[14]):
            self.tokenizer = T5TokenizerFast.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/transformers/{model_name}')

        # Config

        if model_name == files[0]:
            self.model_config = AutoConfig.from_pretrained(f'/kaggle/input/{model_name}')
        elif model_name == (files[13] or files[14]):
            self.model_config = T5Config.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        else:
            self.model_config = AutoConfig.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        token_dict = {"debertav3base": [' [CLS] ', ' [SEP]'],
                      "albert-large-v2": [' [CLS] ', ' [SEP]'],
                      "bert-base-uncased": [' [CLS] ', ' [SEP]'],
                      "bert-large-uncased": [' [CLS] ', ' [SEP]'],
                      "distilroberta-base": [' <s> ', ' </s>'],
                      "distilbert-base-uncased": [' [CLS] ', ' [SEP]'],
                      "google-electra-base-discriminator": [' [CLS] ', ' [SEP]'],
                      "facebook-bart-base": [' <s> ', ' </s>'],
                      "facebook-bart-large": [' <s> ', ' </s>'], 
                      "funnel-transformer-small": [' <cls> ', ' <sep>'],
                      "funnel-transformer-large": [' <cls> ', ' <sep>'],
                      "roberta-base": [' <s> ', ' </s>'],
                      "roberta-large": ['<s> ', ' </s>'],
                      "t5-base": [' <s> ', ' </s>'],
                      "t5-large": [' <s> ', ' </s>'],
                      "xlnet-base-cased": ['<s> ', ' </s>'],
                      "xlnet-large-cased": [' <s> ', ' </s>']
                      }
#         if SEP_TKN != " #### ":
#             cls_ = token_dict[self.model_name][0]
#             sep = token_dict[self.model_name][1]
#         else:
#             cls_ = ''
#             sep = SEP_TKN

        
        train_df[self.input_col] = (
                    train_df["prompt_title"] + " #### "
                    + train_df["text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + " #### "
                    + valid_df["text"]
                  )
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        if self.model_name == files[0]:
            model_content = AutoModelForSequenceClassification.from_pretrained(
                f"/kaggle/input/{self.model_name}",
                config=self.model_config
            )
        elif self.model_name == (files[13] or files[14]):
            model_content = T5ForConditionalGeneration.from_pretrained(
                f"/kaggle/input/transformers/{self.model_name}",
                config=self.model_config
            )
        else:
            model_content = AutoModelForSequenceClassification.from_pretrained(
                f"/kaggle/input/transformers/{self.model_name}",
                config=self.model_config
            )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        token_dict = {"debertav3base": [' [CLS] ', ' [SEP]'],
                      "albert-large-v2": [' [CLS] ', ' [SEP]'],
                      "bert-base-uncased": [' [CLS] ', ' [SEP]'],
                      "bert-large-uncased": [' [CLS] ', ' [SEP]'],
                      "distilroberta-base": [' <s> ', ' </s>'],
                      "distilbert-base-uncased": [' [CLS] ', ' [SEP]'],
                      "google-electra-base-discriminator": [' [CLS] ', ' [SEP]'],
                      "facebook-bart-base": [' <s> ', ' </s>'],
                      "facebook-bart-large": [' <s> ', ' </s>'], 
                      "funnel-transformer-small": [' <cls> ', ' <sep>'],
                      "funnel-transformer-large": [' <cls> ', ' <sep>'],
                      "roberta-base": [' <s> ', ' </s>'],
                      "roberta-large": ['<s> ', ' </s>'],
                      "t5-base": [' <s> ', ' </s>'],
                      "t5-large": [' <s> ', ' </s>'],
                      "xlnet-base-cased": ['<s> ', ' </s>'],
                      "xlnet-large-cased": [' <s> ', ' </s>']
                      }
        
#         if SEP_TKN != ' #### ':
#             cls_ = token_dict[self.model_name][0]
#             sep = token_dict[self.model_name][1]
#         else:
#             cls_ = ''
#             sep = SEP_TKN

        in_text = (
                   test_df["prompt_title"] + " #### "
                    + test_df["text"]
                  )
        
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [18]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
#     if '/' in model_name:
#         model_name_ = model_name.split('/')[1]
    os.mkdir(model_name)
#     else:
#         model_name_ = model_name.copy()
        
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [19]:
for target in ["content", "wording"]:
    train_by_fold(
        train,
        model_name=CFG.model_name,
        save_each_model=False,
        target=target,
        learning_rate=CFG.learning_rate,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_mprob,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        max_length=CFG.max_length
    )
    
    
    train = validate(
        train,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_mprob,
        max_length=CFG.max_length
    )

    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

    test = predict(
        test,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_mprob,
        max_length=CFG.max_length
    )

fold 0:


Step,Training Loss,Validation Loss,Rmse
1000,1.1026,1.099696,1.048664
2000,1.0952,1.082463,1.040415


fold 1:


Step,Training Loss,Validation Loss,Rmse
1000,1.0816,1.111071,1.054073


fold 0:


fold 1:


cv content rmse: 1.048071407403576
fold 0:


fold 1:


fold 0:


Step,Training Loss,Validation Loss,Rmse
1000,0.8672,1.412668,1.188557
2000,0.9096,1.286071,1.134051


fold 1:


Step,Training Loss,Validation Loss,Rmse
1000,1.1344,0.898658,0.947976


fold 0:


fold 1:


cv wording rmse: 1.0341759862490243
fold 0:


fold 1:


In [20]:
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold,content_pred,wording_pred
0,000e8c3c7ddb,814d6b,Summarize how the Third Wave developed over su...,0.205683,0.380538,64,5,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0,0.0,-0.136062,-0.091997
1,0020ae56ffbf,ebad26,Summarize the various ways the factory would u...,-0.548304,0.506755,54,2,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0,1.0,0.046871,-0.187534
2,004e978e639e,3b9047,"In complete sentences, summarize the structure...",3.128928,4.231226,269,32,Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2,1.0,0.06334,-0.180703
3,005ab0199905,3b9047,"In complete sentences, summarize the structure...",-0.210614,-0.471415,28,5,Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0,1.0,0.06334,-0.180703
4,0070c9e7af47,814d6b,Summarize how the Third Wave developed over su...,3.272894,3.219757,232,29,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4,0.0,-0.136062,-0.091997


## LGBM model

In [21]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text",
                "prompt_title", 
                "prompt_text"
               ] + targets

In [22]:
model_dict = {}
if OPTUNA:
    for target in targets:
        models = []

        for fold in range(CFG.n_splits):

            X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
            y_train_cv = train[train["fold"] != fold][target]

            X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
            y_eval_cv = train[train["fold"] == fold][target]

            dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
            dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)


            def objective(trial):
                param_space = {'boosting_type': 'gbdt',
                    'random_state': 42,
                    'objective': 'regression',
                    'metric': 'rmse',
                    'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 0.1, log=True),
                    'max_depth' : 2 if IS_DEBUG else trial.suggest_int('max_depth', 3, 10),
                    'num_leaves': 2 if IS_DEBUG else trial.suggest_int('num_leaves', 2, 1024),
                    'lambda_l1' : trial.suggest_float('lambda_l1', 1e-7, 0.1, log=True),
                    'lambda_l2' : trial.suggest_float('lambda_l2', 1e-7, 0.1, log=True)
                }
                params = param_space.copy()
                for param, value in trial.params.items():
                    params[param] = value

                evaluation_results = {}
                model = lgb.train(params,
                                  num_boost_round= 2 if IS_DEBUG else 10000,
                                    #categorical_feature = categorical_features,
                                  valid_names=['train', 'valid'],
                                  train_set=dtrain,
                                  valid_sets=dval,
                                  callbacks=[
                                      lgb.early_stopping(stopping_rounds=30, verbose=True),
                                       lgb.log_evaluation(100),
                                      lgb.callback.record_evaluation(evaluation_results)
                                    ],
                                  )
                y_pred = model.predict(X_eval_cv)
                rmse = np.sqrt(mean_squared_error(y_eval_cv, y_pred))
                return rmse

            study = optuna.create_study(direction='minimize')
            study.optimize(objective, n_trials= 2 if IS_DEBUG else 500)

            best_params = study.best_trial.params
            evaluation_results = {}
            best_model = lgb.train(best_params,
                                  num_boost_round = 2 if IS_DEBUG else 10000,
                                  train_set=dtrain,
                                  valid_sets=dval,
                                  callbacks=[
                                  lgb.early_stopping(stopping_rounds=30, verbose=True),
                                   lgb.log_evaluation(100),
                                  lgb.callback.record_evaluation(evaluation_results)
                                  ])


            models.append(best_model)

        model_dict[target] = models

else:
    for target in targets:
        models = []

        for fold in range(CFG.n_splits):

            X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
            y_train_cv = train[train["fold"] != fold][target]

            X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
            y_eval_cv = train[train["fold"] == fold][target]

            dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
            dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

            params = {
                'boosting_type': 'gbdt',
                'random_state': 42,
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.048,
                'max_depth': 4,  #3
                'lambda_l1': 0.0,
                'lambda_l2': 0.011
            }

            evaluation_results = {}
            model = lgb.train(params,
                              num_boost_round=10000,
                                #categorical_feature = categorical_features,
                              valid_names=['train', 'valid'],
                              train_set=dtrain,
                              valid_sets=dval,
                              callbacks=[
                                  lgb.early_stopping(stopping_rounds=30, verbose=True),
                                   lgb.log_evaluation(100),
                                  lgb.callback.record_evaluation(evaluation_results)
                                ],
                              )
            models.append(model)

        model_dict[target] = models

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1173
[LightGBM] [Info] Number of data points in the train set: 4005, number of used features: 11
[LightGBM] [Info] Start training from score -0.018940
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[2]	train's rmse: 1.03267
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1173
[LightGBM] [Info] Number of data points in the train set: 4005, number of used features: 11
[LightGBM] [Info] Start training from score -0.018940
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[2]	train's rmse: 0.979184
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[Li

## CV Score

In [23]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 1.0084781411901365
wording_rmse : 1.0550894788208747
mcrmse : 1.0317838100055057


## Predict

In [24]:
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text",
                "prompt_title", 
                "prompt_text",
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [25]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [26]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [27]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,...,trigram_overlap_count,trigram_overlap_ratio,quotes_count,input,content_pred_0,content_pred_1,content,wording_pred_0,wording_pred_1,wording
0,000000ffffff,abc123,Summarize... #### Example text 1 #### Example...,3,0,Example Title 1,Heading\nText...,3,0,0,...,0,0.0,0,Example Title 1 #### Summarize... #### Example...,-0.154804,-0.009674,-0.082239,-0.21171,0.085454,-0.063128
1,111111eeeeee,def789,Summarize... #### Example text 2 #### Example...,3,0,Example Title 2,Heading\nText...,3,0,0,...,0,0.0,0,Example Title 2 #### Summarize... #### Example...,-0.154804,-0.009674,-0.082239,-0.21171,0.085454,-0.063128
2,222222cccccc,abc123,Summarize... #### Example text 3 #### Example...,3,0,Example Title 1,Heading\nText...,3,0,0,...,0,0.0,0,Example Title 1 #### Summarize... #### Example...,-0.154804,-0.009674,-0.082239,-0.21171,0.085454,-0.063128
3,333333dddddd,def789,Summarize... #### Example text 4 #### Example...,3,0,Example Title 2,Heading\nText...,3,0,0,...,0,0.0,0,Example Title 2 #### Summarize... #### Example...,-0.154804,-0.009674,-0.082239,-0.21171,0.085454,-0.063128


## Create Submission file

In [28]:
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [29]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

In [30]:
test[["student_id", "content", "wording"]]

Unnamed: 0,student_id,content,wording
0,000000ffffff,-0.082239,-0.063128
1,111111eeeeee,-0.082239,-0.063128
2,222222cccccc,-0.082239,-0.063128
3,333333dddddd,-0.082239,-0.063128
