# Deberta inference

テストデータにて特徴量として利用

In [1]:
import pandas as pd 
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)
from datasets import Dataset
from glob import glob
import gc
import torch
from scipy.special import softmax

# Inference configuration.
MAX_LENGTH = 1024  # passed to the tokenizer as max_length (with truncation)
TEST_DATA_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
MODEL_PATH = '/kaggle/input/aes2-400-20240419134941/*/*'  # glob pattern, one match per checkpoint
EVAL_BATCH_SIZE = 1

# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# that models[0] (the tokenizer source) is the same on every run.
models = sorted(glob(MODEL_PATH))
if not models:
    # Fail here with a clear message instead of an IndexError on models[0].
    raise FileNotFoundError(f"No model checkpoints matched {MODEL_PATH!r}")
tokenizer = AutoTokenizer.from_pretrained(models[0])

def tokenize(sample):
    """Tokenize one row's `full_text`, truncating the encoding to MAX_LENGTH."""
    essay_text = sample['full_text']
    encoded = tokenizer(essay_text, truncation=True, max_length=MAX_LENGTH)
    return encoded

# Read the test essays, tokenize every row, then drop the raw id/text columns
# before the dataset is handed to the Trainer.
df_test = pd.read_csv(TEST_DATA_PATH)
ds = Dataset.from_pandas(df_test)
ds = ds.map(tokenize)
ds = ds.remove_columns(['essay_id', 'full_text'])

# Prediction-only Trainer settings: output directory is the working directory
# and experiment-tracker reporting is switched off.
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    report_to="none",
)

# Run every checkpoint over the test set and keep its softmax-normalised outputs.
predictions = []
for model_dir in models:
    # Use a separate name for the checkpoint path so the loop variable is not
    # rebound from a path string to a model object.
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
    )

    preds = trainer.predict(ds).predictions
    # Normalise over the last axis so the checkpoints can be averaged on one scale.
    predictions.append(softmax(preds, axis=-1))

    # Drop the references and collect garbage *before* emptying the CUDA cache:
    # tensors that are only reachable through reference cycles are not returned
    # to the caching allocator until the collector has run, so emptying the
    # cache first would leave that memory reserved for the next checkpoint.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    
# Element-wise mean of the per-checkpoint probability arrays.
predicted_score = sum(predictions) / len(predictions)

# The arg-max column index is zero-based; adding one turns it into the score.
df_test['score'] = predicted_score.argmax(-1) + 1
df_test.head()

2024-05-31 10:18:15.044716: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 10:18:15.044835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 10:18:15.187184: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


  0%|          | 0/3 [00:00<?, ?ex/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,5


## Import

In [2]:
import re
import copy
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
from tqdm.auto import tqdm,trange
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import cohen_kappa_score, accuracy_score, f1_score

## Load Data

In [3]:
# Expression that adds a "paragraph" column: full_text split on blank lines ("\n\n").
columns = [
    pl.col("full_text").str.split(by="\n\n").alias("paragraph"),
]
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"

# Load the training and test sets and attach the list-valued "paragraph" column to each.
train = pl.read_csv(f"{PATH}train.csv").with_columns(columns)
test = pl.read_csv(f"{PATH}test.csv").with_columns(columns)

train.head(5)

essay_id,full_text,score,paragraph
str,str,i64,list[str]
"""000d118""","""Many people ha…",3,"[""Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won't see a car in Vauban's streets because they are completely ""car free"" but If some that lives in VAUBAN that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $40,000 along with a home. The vauban people completed this in 2006 ,they said that this an example of a growing trend in Europe,The untile states and some where else are suburban life from auto use this is called ""smart planning"". The current efforts to drastically reduce greenhouse gas emissions from tailes the passengee cars are responsible for 12 percent of greenhouse gas emissions in Europe and up to 50 percent in some car intensive in the United States. I honeslty think that good idea that they did that is Vaudan because that makes cities denser and better for walking and in VAUBAN there are 5,500 residents within a rectangular square mile. 
In the artical David Gold berg said that ""All of our development since World war 2 has been centered on the cars,and that will have to change"" and i think that was very true what David Gold said because alot thing we need cars to do we can go anyway were with out cars beacuse some people are a very lazy to walk to place thats why they alot of people use car and i think that it was a good idea that that they did that in VAUBAN so people can see how we really don't need car to go to place from place because we can walk from were we need to go or we can ride bycles with out the use of a car. It good that they are doing that if you thik about your help the earth in way and thats a very good thing to. In the United states ,the Environmental protection Agency is promoting what is called ""car reduced""communtunties,and the legislators are starting to act,if cautiously. Maany experts expect pubic transport serving suburbs to play a much larger role in a new six years federal transportation bill to approved this year. In previous bill,80 percent of appropriations have by law gone to highways and only 20 percent to other transports. There many good reason why they should do this. ""]"
"""000fe60""","""I am a scienti…",3,"[""I am a scientist at NASA that is discussing the ""face"" on mars. I will be explaining how the ""face"" is a land form. By sharing my information about this isue i will tell you just that."", ""First off, how could it be a martions drawing. There is no plant life on mars as of rite now that we know of, which means so far as we know it is not possible for any type of life. That explains how it could not be made by martians. Also why and how would a martion build a face so big. It just does not make any since that a martian did this."", … ""To sum all this up the ""face"" on mars is a landform but others would like to beleive it's a martian sculpture. Which every one that works at NASA says it's a landform and they are all the ones working on the planet and taking pictures.""]"
"""001ab80""","""People always …",4,"[""People always wish they had the same technology that they have seen in movies, or the best new piece of technology that is all over social media. However, nobody seems to think of the risks that these kinds of new technologies may have. Cars have been around for many decades, and now manufacturers are starting to get on the bandwagon and come up with the new and improved technology that they hope will appeal to everyone. As of right now, it seems as though the negative characteristics of these cars consume the positive idea that these manufacturers have tried to convey."", ""Currently, this new technology in cars has a very long way to go before being completely ""driverless"". Drivers still need to be on alert when they are driving, as well as control the car near any accidents or complicated traffic situations. This seems to totally defeat the purpose of the ""driverless"" car. Eventually the technology may improve, but nobody can be certain that the driverless car will eventually become completely ""driverless"". This idea just seems like a lot of hard work and money for something that is not very neccessary. If someone does not want to drive their car they can just take a city bus or a subway. There are so many options of transportation that can already solve this problem. Even if masnufacturers are trying to make driving more ""fun"", driving is not meant to be ""fun"" it is meant to get people where they need to go. Playing around in a car just to have ""fun"" is just a recipe for disaster."", … ""The technology car manufacturers are trying to develope may just be a diasaster in the making. There are many alternative options of transportations if you do not feel like driving yourself, and these options are way less expensive than buying a brand new car. Although this technology is relatively new, we can not be certain that this new idea will even pay off in the end, it may just be a waste of money and time. 
Sometimes the newest technology is not the most benefical. ""]"
"""001bdc0""","""We all heard a…",4,"[""We all heard about Venus, the planet without almost oxygen with earthquakes, erupting volcanoes and temperatures average over 800 degrees Fahrenheit but what if scientist project the futur into this planet ? Through this article, the author uses evidences appealing to reason and concession to make us realize why we should care about studying this planet so that people must give a chance to Venus."", ""Venus is the closest planet to Earth in terms density and size but has a really different climate. As it is evoked by the author:"", … ""In conclusion, despite of Venus hostility put in advance by the concession, the author makes the audience realize that there's a solution but that we can find it only if we study the planet. He make us find out that challenge and curiosity is part of human life. But also that danger and fear should not stop us from discovering new things. After all, we are Humans.""]"
"""002ba53""","""Dear, State Se…",3,"[""Dear, State Senator"", ""This is a letter to argue in favor of keeping the Electoral College.""There are many reasons to keep the Electoral College"" one reason is because it is widely regarded as an anachronism, a dispute over the outcome of an Electoral College vote is possible, but it is less likely than a dispute over the popular vote, and the Electoral College restores some of the weight in the political balance that large states (by population) lose by virue of the mal apportionment of the Senate decreed in the Constitution."", … ""From, PROPER_NAME ""]"


## Features engineering

### 1.Preprocessing

In [4]:
# Contraction -> expansion lookup used by expandContractions().
cList = {
  "ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because",  "could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not",
  "haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will","he'll've": "he will have","he's": "he is",
  "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is","I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have",
  "isn't": "is not","it'd": "it had","it'd've": "it would have","it'll": "it will", "it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
  "might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
  "shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
  "should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have","so's": "so is","that'd": "that would","that'd've": "that would have","that's": "that is","there'd": "there had","there'd've": "there would have","there's": "there is","they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we had",
  "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
  "weren't": "were not","what'll": "what will","what'll've": "what will have",
  "what're": "what are","what's": "what is","what've": "what have","when's": "when is","when've": "when have",
  "where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is",
  "why've": "why have","will've": "will have","won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not",
  "wouldn't've": "would not have","y'all": "you all","y'alls": "you alls","y'all'd": "you all would",
  "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you will","you'll've": "you will have","you're": "you are",  "you've": "you have"
   }

# Regex alternation picks the first alternative that matches, not the longest,
# so the keys are ordered longest-first: otherwise "can't" would win over
# "can't've" and leave a dangling "'ve". Keys are escaped so they are always
# matched literally.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(cList, key=len, reverse=True))
)

def expandContractions(text, c_re=c_re):
    """Replace every contraction listed in `cList` with its expansion.

    Args:
        text (str): Input text. Matching is case-sensitive, exactly as the keys
            are spelled in `cList`.
        c_re (re.Pattern): Compiled pattern whose matches are all keys of
            `cList`; defaults to the module-level pattern.

    Returns:
        str: `text` with each matched contraction substituted.
    """
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

# Compiled once at import time instead of on every removeHTML() call.
_HTML_TAG_RE = re.compile(r'<.*?>')

def removeHTML(x):
    """Return `x` with every `<...>` span (shortest match) removed."""
    return _HTML_TAG_RE.sub(r'', x)

def dataPreprocessing(x):
    """Normalise raw essay text before feature extraction.

    Steps, in order: lowercase; strip `<...>` tags; drop @mentions; drop digit
    runs (including ones preceded by an apostrophe); drop URLs; collapse
    whitespace; collapse repeated periods and commas; trim both ends.

    Args:
        x (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    # (raw string: "\w" in a plain literal is an invalid escape sequence)
    x = re.sub(r"@\w+", '', x)
    # Delete Numbers
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Delete URL
    # "\S+" instead of "\w+": "\w" stops at the ":" right after the scheme, so
    # the previous pattern left "://host/path" (or the whole URL) in the text.
    x = re.sub(r"http\S+", '', x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

### 2.Paragraph Features（段落の処理）

In [5]:
# Count spelling errors

import spacy
import re

# spaCy pipeline used by count_spelling_errors() to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

# Reference vocabulary: one entry per line of the word list, trimmed and lower-cased.
with open('/kaggle/input/english-word-hx/words.txt', 'r') as vocab_file:
    english_vocab = {line.strip().lower() for line in vocab_file}
    
def count_spelling_errors(text):
    """Count the tokens of `text` whose lower-cased lemma is not in `english_vocab`."""
    unknown_count = 0
    for token in nlp(text):
        if token.lemma_.lower() not in english_vocab:
            unknown_count += 1
    return unknown_count

In [6]:
# Remove punctuation

import string
def remove_punctuation(text):
    """
    Strip every character listed in `string.punctuation` from a string.

    Args:
    - text (str): The string to clean.

    Returns:
    - str: `text` with those punctuation characters dropped; all other
      characters are kept in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation_chars)

In [7]:
def Paragraph_Preprocess(tmp):
    """Turn essays into one row per paragraph and add per-paragraph measurements.

    Expects `tmp` to carry a list-valued 'paragraph' column. Each paragraph is
    cleaned with dataPreprocessing(); the added columns are:
      - paragraph_no_pinctuation: the cleaned paragraph without punctuation
      - paragraph_error_num: count_spelling_errors() of that stripped text
      - paragraph_len: character count of the cleaned paragraph
      - paragraph_sentence_cnt: number of pieces after splitting on '.'
      - paragraph_word_cnt: number of pieces after splitting on ' '
    """
    def char_count(text):
        return len(text)

    def sentence_count(text):
        return len(text.split('.'))

    def word_count(text):
        return len(text.split(' '))

    paragraph = pl.col('paragraph')

    # One row per paragraph, cleaned in place.
    tmp = tmp.explode('paragraph').with_columns(paragraph.map_elements(dataPreprocessing))
    # The spelling check runs on the punctuation-free copy of the paragraph.
    tmp = tmp.with_columns(
        paragraph.map_elements(remove_punctuation).alias('paragraph_no_pinctuation')
    )
    tmp = tmp.with_columns(
        pl.col('paragraph_no_pinctuation')
        .map_elements(count_spelling_errors)
        .alias("paragraph_error_num")
    )
    # The remaining measurements depend only on the cleaned paragraph text.
    tmp = tmp.with_columns(
        paragraph.map_elements(char_count).alias("paragraph_len"),
        paragraph.map_elements(sentence_count).alias("paragraph_sentence_cnt"),
        paragraph.map_elements(word_count).alias("paragraph_word_cnt"),
    )
    return tmp

# feature_eng
paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
paragraph_fea2 = ['paragraph_error_num'] + paragraph_fea
def Paragraph_Eng(train_tmp):
    """Aggregate per-paragraph rows into one feature row per essay.

    Args:
        train_tmp: polars frame with one row per paragraph, carrying
            'essay_id', 'paragraph' and every column in `paragraph_fea2`.

    Returns:
        pandas.DataFrame sorted by essay_id containing, per essay:
          - counts of paragraphs whose length is >= / <= each threshold
            (the ">"/"<" in the column names denote these inclusive bounds)
          - max, mean, min, sum, first, last, kurtosis, 25% and 75% quantile
            of every column in `paragraph_fea2`
    """
    # Previously these thresholds were written inline while two unused local
    # lists (one of them missing 700) duplicated them; keep one source of truth.
    min_len_thresholds = [0, 50, 75, 100, 125, 150, 175, 200, 250, 300, 350, 400, 500, 600, 700]
    max_len_thresholds = [25, 49]

    # (column suffix, aggregation); order matters because it fixes column order.
    summary_stats = [
        ('max', lambda col: col.max()),
        ('mean', lambda col: col.mean()),
        ('min', lambda col: col.min()),
        ('sum', lambda col: col.sum()),
        ('first', lambda col: col.first()),
        ('last', lambda col: col.last()),
        ('kurtosis', lambda col: col.kurtosis()),
        ('q1', lambda col: col.quantile(0.25)),
        ('q3', lambda col: col.quantile(0.75)),
    ]

    aggs = [
        *[pl.col('paragraph').filter(pl.col('paragraph_len') >= i).count().alias(f"paragraph_>{i}_cnt") for i in min_len_thresholds],
        *[pl.col('paragraph').filter(pl.col('paragraph_len') <= i).count().alias(f"paragraph_<{i}_cnt") for i in max_len_thresholds],
        *[stat(pl.col(fea)).alias(f"{fea}_{suffix}") for suffix, stat in summary_stats for fea in paragraph_fea2],
        ]
    df = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
    df = df.to_pandas()
    return df
tmp = Paragraph_Preprocess(train)
train_feats = Paragraph_Eng(tmp)
# Paragraph_Eng returns its rows sorted by essay_id, so assigning train['score']
# by position is only correct when train.csv happens to be sorted the same way.
# Join on the key instead; `validate` guards against duplicated ids.
train_feats = train_feats.merge(
    train.select(['essay_id', 'score']).to_pandas(),
    on='essay_id',
    how='left',
    validate='one_to_one',
)

# Everything except the key and the label is a model feature.
feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  53


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,paragraph_word_cnt_kurtosis,paragraph_error_num_q1,paragraph_len_q1,paragraph_sentence_cnt_q1,paragraph_word_cnt_q1,paragraph_error_num_q3,paragraph_len_q3,paragraph_sentence_cnt_q3,paragraph_word_cnt_q3,score
0,000d118,1,1,1,1,1,1,1,1,1,...,,27.0,2640.0,14.0,491.0,27.0,2640.0,14.0,491.0,3
1,000fe60,5,5,5,5,5,5,5,4,3,...,-1.38846,1.0,235.0,4.0,46.0,1.0,398.0,5.0,77.0,3
2,001ab80,4,4,4,4,4,4,4,4,4,...,-1.696723,1.0,576.0,5.0,101.0,2.0,927.0,8.0,165.0,4


### 3.Sentence Features（文の処理）

In [8]:
def Sentence_Preprocess(tmp):
    """Split each cleaned essay on '.' and return one row per resulting piece.

    Adds 'sentence' (the piece), 'sentence_len' (its character count) and
    'sentence_word_cnt' (number of pieces after splitting it on ' ').
    """
    def char_count(text):
        return len(text)

    def word_count(text):
        return len(text.split(' '))

    sentences = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=".")
    tmp = tmp.with_columns(sentences.alias("sentence")).explode('sentence')

    sentence = pl.col('sentence')
    return tmp.with_columns(
        sentence.map_elements(char_count).alias("sentence_len"),
        sentence.map_elements(word_count).alias("sentence_word_cnt"),
    )

# feature_eng
sentence_fea = ['sentence_len','sentence_word_cnt']
def Sentence_Eng(train_tmp):
    """Aggregate per-sentence rows into one feature row per essay.

    Returns a pandas DataFrame sorted by essay_id with counts of sentences
    whose length is >= / <= each threshold, followed by max, mean, min, sum,
    first, last, kurtosis and the 25% / 75% quantiles of every column in
    `sentence_fea`.
    """
    sentence = pl.col('sentence')
    length = pl.col('sentence_len')

    # (column suffix, aggregation) in the order the columns must appear.
    summary_stats = (
        ('max', lambda col: col.max()),
        ('mean', lambda col: col.mean()),
        ('min', lambda col: col.min()),
        ('sum', lambda col: col.sum()),
        ('first', lambda col: col.first()),
        ('last', lambda col: col.last()),
        ('kurtosis', lambda col: col.kurtosis()),
        ('q1', lambda col: col.quantile(0.25)),
        ('q3', lambda col: col.quantile(0.75)),
    )

    aggs = []
    for lower in (0, 15, 50, 100, 150, 200, 250, 300):
        aggs.append(sentence.filter(length >= lower).count().alias(f"sentence_>{lower}_cnt"))
    for upper in (15, 50):
        aggs.append(sentence.filter(length <= upper).count().alias(f"sentence_<{upper}_cnt"))
    for suffix, stat in summary_stats:
        for fea in sentence_fea:
            aggs.append(stat(pl.col(fea)).alias(f"{fea}_{suffix}"))

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return grouped.sort("essay_id").to_pandas()

# Build the sentence-level features and left-join them onto the existing ones.
tmp = Sentence_Preprocess(train)
sentence_feats = Sentence_Eng(tmp)
train_feats = train_feats.merge(sentence_feats, how='left', on='essay_id')

# Everything except the key and the label is a model feature.
feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  81


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,sentence_len_first,sentence_word_cnt_first,sentence_len_last,sentence_word_cnt_last,sentence_len_kurtosis,sentence_word_cnt_kurtosis,sentence_len_q1,sentence_word_cnt_q1,sentence_len_q3,sentence_word_cnt_q3
0,000d118,1,1,1,1,1,1,1,1,1,...,36,7,0,1,1.438632,2.175806,109.0,19.0,225.0,37.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,62,13,0,1,0.917062,0.505776,51.0,12.0,124.0,25.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,144,27,0,1,-0.004393,0.270079,86.0,17.0,151.0,29.0


### 4.Word Features（単語の処理）

In [9]:
# word feature
def Word_Preprocess(tmp):
    """Split each cleaned essay on single spaces and return one row per word.

    Adds 'word' and 'word_len' (character count); rows whose word is empty
    (length 0) are removed.
    """
    def char_count(text):
        return len(text)

    # Clean full_text, then break it into space-separated words.
    words = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=" ")
    tmp = tmp.with_columns(words.alias("word")).explode('word')
    # Length of every word.
    tmp = tmp.with_columns(pl.col('word').map_elements(char_count).alias("word_len"))
    # Keep only non-empty words.
    return tmp.filter(pl.col('word_len') != 0)
# feature_eng
def Word_Eng(train_tmp):
    """Aggregate per-word rows into one feature row per essay.

    Returns a pandas DataFrame sorted by essay_id with, per essay, the number
    of words of length >= n for n = 1..15 ('word_{n}_cnt') and the max, mean,
    standard deviation and 25% / 50% / 75% quantiles of the word length.
    """
    word_len = pl.col('word_len')
    quantiles = (("q1", 0.25), ("q2", 0.50), ("q3", 0.75))

    aggs = [
        # Number of words whose length is at least n
        *[pl.col('word').filter(word_len >= n).count().alias(f"word_{n}_cnt") for n in range(1, 16)],
        # Summary statistics of the word length
        word_len.max().alias("word_len_max"),
        word_len.mean().alias("word_len_mean"),
        word_len.std().alias("word_len_std"),
        *[word_len.quantile(q).alias(f"word_len_{label}") for label, q in quantiles],
    ]
    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return grouped.sort("essay_id").to_pandas()

# Build the word-level features and left-join them onto the existing ones.
tmp = Word_Preprocess(train)
word_feats = Word_Eng(tmp)
train_feats = train_feats.merge(word_feats, how='left', on='essay_id')

# Everything except the key and the label is a model feature.
feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  102


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,word_12_cnt,word_13_cnt,word_14_cnt,word_15_cnt,word_len_max,word_len_mean,word_len_std,word_len_q1,word_len_q2,word_len_q3
0,000d118,1,1,1,1,1,1,1,1,1,...,6,6,5,2,25,4.378819,2.538495,3.0,4.0,5.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,0,0,0,0,11,4.012048,2.060968,2.0,4.0,5.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,14,10,5,2,15,4.574545,2.604621,3.0,4.0,5.0


### 5.Tf-idf features

In [10]:
# TfidfVectorizer parameter
# NOTE(review): tokenizer and preprocessor are identity functions and the input
# is the raw essay string, so the "tokens" the n-grams are built from appear to
# be single characters (i.e. 3-6 character n-grams) — confirm this is intended.
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(3,6),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)
# Fit all datasets into TfidfVector,this may cause leakage and overly optimistic CV scores
train_tfid = vectorizer.fit_transform(train['full_text'].to_list())
# Convert to array
dense_matrix = train_tfid.toarray()
# Convert to dataframe
df = pd.DataFrame(dense_matrix)
# rename features
tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns
# The matrix rows follow the row order of `train`, whereas train_feats has been
# sorted by essay_id; take the ids from `train` so each row is keyed by its own
# essay rather than by whatever essay sits at the same position in train_feats.
df['essay_id'] = train['essay_id'].to_list()
# Merge the newly generated feature data with the previously generated feature data
train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  19729


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,tfid_19617,tfid_19618,tfid_19619,tfid_19620,tfid_19621,tfid_19622,tfid_19623,tfid_19624,tfid_19625,tfid_19626
0,000d118,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 6.CountVectorizer Features

In [11]:
# CountVectorizer parameter (same identity tokenizer/preprocessor set-up as the
# TF-IDF vectorizer, with shorter n-grams and different document-frequency cut-offs)
vectorizer_cnt = CountVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(2,3),
            min_df=0.10,
            max_df=0.85,
)
# Fitted on the full training text, like the TF-IDF vectorizer.
train_tfid = vectorizer_cnt.fit_transform(train['full_text'].to_list())
dense_matrix = train_tfid.toarray()
df = pd.DataFrame(dense_matrix)
tfid_columns = [ f'tfid_cnt_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns
# The matrix rows follow the row order of `train`, whereas train_feats has been
# sorted by essay_id; take the ids from `train` so each row is keyed by its own
# essay rather than by whatever essay sits at the same position in train_feats.
df['essay_id'] = train['essay_id'].to_list()
train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  21899


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,tfid_cnt_2160,tfid_cnt_2161,tfid_cnt_2162,tfid_cnt_2163,tfid_cnt_2164,tfid_cnt_2165,tfid_cnt_2166,tfid_cnt_2167,tfid_cnt_2168,tfid_cnt_2169
0,000d118,1,1,1,1,1,1,1,1,1,...,3,0,0,0,0,0,0,0,0,0
1,000fe60,5,5,5,5,5,5,5,4,3,...,2,0,0,1,1,0,0,0,0,0
2,001ab80,4,4,4,4,4,4,4,4,4,...,1,0,2,0,0,0,0,0,0,0


# Deberta predictions to LGBM as features

In [12]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective

# Constants shared by the metric and the objective below: targets are stored
# shifted by `a`, and `b` is added inside the denominator term of the objective.
a = 2.948
b = 1.092

def quadratic_weighted_kappa(y_true, y_pred):
    """Evaluation metric: quadratic-weighted Cohen's kappa on un-shifted scores.

    Both inputs are shifted back by `a`; predictions are additionally clipped
    to [1, 6] and rounded. Returns the triple ('QWK', value, True).
    """
    true_scores = y_true + a
    pred_scores = (y_pred + a).clip(1, 6).round()
    kappa = cohen_kappa_score(true_scores, pred_scores, weights="quadratic")
    return 'QWK', kappa, True

def qwk_obj(y_true, y_pred):
    """Custom objective: gradient and (constant) hessian of a ratio-shaped loss.

    With preds clipped to [1, 6] after shifting by `a`, the loss is f / g where
    f = 1/2 * sum((preds - labels)^2) and g = 1/2 * sum((preds - a)^2 + b).
    Returns (grad, hess); grad is the derivative of f / g scaled by the number
    of samples and hess is an array of ones.
    """
    labels = y_true + a
    preds = (y_pred + a).clip(1, 6)
    n_samples = len(labels)

    residual = preds - labels   # derivative of f with respect to preds
    centered = preds - a        # derivative of g with respect to preds
    numerator = 1/2*np.sum(residual**2)
    denominator = 1/2*np.sum(centered**2+b)

    # Quotient rule, scaled by the sample count.
    grad = (residual/denominator - numerator*centered/denominator**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess

In [13]:
import joblib
# add Deberta predictions to LGBM as features
deberta_oof = joblib.load('/kaggle/input/aes2-400-20240419134941/oof.pkl')
print(deberta_oof.shape, train_feats.shape)

# One feature column per out-of-fold output column.
# NOTE(review): rows are matched by position — assumes oof.pkl is stored in the
# same row order as train_feats; confirm against the notebook that produced it.
for class_idx in range(6):
    train_feats[f'deberta_oof_{class_idx}'] = deberta_oof[:, class_idx]

# Everything except the key and the label is a model feature.
feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ', len(feature_names))    

train_feats.shape

(17307, 6) (17307, 21901)
Features Number:  21905


(17307, 21907)

In [14]:
# Feature matrix: every column in feature_names, cast to float32.
X = train_feats[feature_names].astype(np.float32).to_numpy()

# Integer scores (used as the stratification labels) and the float32
# regression target, which is the score shifted down by `a`.
score_column = train_feats['score']
y_split = score_column.astype(int).to_numpy()
y = score_column.astype(np.float32).to_numpy()-a

# Feature Selection

In [15]:
def feature_select_wrapper(n_select=13000, n_splits=5):
    """Rank features by summed LightGBM importance over CV folds; keep the top ones.

    For every stratified fold an ``LGBMRegressor`` with the custom QWK
    objective is fitted, its fold predictions are scored, and its
    ``feature_importances_`` are added to a running total. The features are
    then sorted by that total (descending) and the first ``n_select`` names
    are returned.

    The function works on notebook-level state instead of arguments:

    * reads ``X``, ``y``, ``y_split``, ``feature_names``, ``callbacks``, ``a``,
      ``qwk_obj`` and ``quadratic_weighted_kappa``;
    * appends to the lists ``models``, ``predictions``, ``f1_scores`` and
      ``kappa_scores``, which must exist before the call.

    Args:
        n_select: maximum number of feature names to return (default 13000).
        n_splits: number of stratified folds (default 5).

    Returns:
        list: feature names ordered from most to least important, truncated
        to ``n_select`` entries.
    """
    print('feature_select_wrapper...')
    features = feature_names

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    # Running total of per-fold importances, indexed by feature name.
    fse = pd.Series(0, index=features)

    for train_index, test_index in skf.split(X, y_split):

        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold, y_test_fold_int = y[train_index], y[test_index], y_split[test_index]

        model = lgb.LGBMRegressor(
                    objective = qwk_obj,
                    metrics = 'None',
                    learning_rate = 0.05,
                    max_depth = 5,
                    num_leaves = 10,
                    colsample_bytree=0.3,
                    reg_alpha = 0.7,
                    reg_lambda = 0.1,
                    n_estimators=700,
                    random_state=412,
                    extra_trees=True,
                    class_weight='balanced',
                    verbosity = - 1)

        predictor = model.fit(X_train_fold,
                              y_train_fold,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train_fold, y_train_fold), (X_test_fold, y_test_fold)],
                              eval_metric=quadratic_weighted_kappa,
                              callbacks=callbacks)
        models.append(predictor)

        # Undo the target shift, then snap to the valid integer score range.
        predictions_fold = predictor.predict(X_test_fold)
        predictions_fold = predictions_fold + a
        predictions_fold = predictions_fold.clip(1, 6).round()
        predictions.append(predictions_fold)

        f1_fold = f1_score(y_test_fold_int, predictions_fold, average='weighted')
        f1_scores.append(f1_fold)

        kappa_fold = cohen_kappa_score(y_test_fold_int, predictions_fold, weights='quadratic')
        kappa_scores.append(kappa_fold)

        print(f'F1 score across fold: {f1_fold}')
        print(f'Cohen kappa score across fold: {kappa_fold}')

        # X columns follow the order of `features`, so the importances line up by position.
        fse += pd.Series(predictor.feature_importances_, features)

    feature_select = fse.sort_values(ascending=False).index.tolist()[:n_select]
    print('done')
    return feature_select

In [16]:
# Fresh accumulators: feature_select_wrapper() appends to these as a side effect.
f1_scores, kappa_scores = [], []
models, predictions = [], []

# Log the metric every 25 rounds; stop after 75 rounds without improvement
# on the first metric.
callbacks = [
    log_evaluation(period=25),
    early_stopping(stopping_rounds=75, first_metric_only=True),
]
feature_select = feature_select_wrapper()

feature_select_wrapper...
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.780875	valid's QWK: 0.772165
[50]	train's QWK: 0.824297	valid's QWK: 0.80881
[75]	train's QWK: 0.833835	valid's QWK: 0.82052
[100]	train's QWK: 0.839835	valid's QWK: 0.827825
[125]	train's QWK: 0.843701	valid's QWK: 0.830386
[150]	train's QWK: 0.846801	valid's QWK: 0.832561
[175]	train's QWK: 0.849728	valid's QWK: 0.833696
[200]	train's QWK: 0.851845	valid's QWK: 0.832718
[225]	train's QWK: 0.854701	valid's QWK: 0.833161
Early stopping, best iteration is:
[155]	train's QWK: 0.847646	valid's QWK: 0.834323
Evaluated only: QWK
F1 score across fold: 0.650174023108636
Cohen kappa score across fold: 0.834323161397957
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.781597	valid's QWK: 0.769969
[50]	train's QWK: 0.822816	valid's QWK: 0.817215
[75

In [17]:
# Rebuild the training matrix from the selected features only (float32 array),
# with columns in the order of `feature_select`.
X = train_feats.loc[:, feature_select].astype(np.float32).to_numpy()
print('Features Select Number: ', len(feature_select))

Features Select Number:  13000


## Train
* The models have already been trained and saved.
* Leave `LOAD = True` to load the saved models, or set `LOAD = False` to retrain them.

In [18]:
# LOAD = True loads the boosters saved by an earlier run;
# set LOAD = False to re-train every fold and write fold_<k>.txt files.
LOAD = True
# Number of cross-validation folds (and of saved booster files)
n_splits = 15
models = []

if LOAD:
    # Saved files are numbered fold_1.txt ... fold_<n_splits>.txt
    models.extend(
        lgb.Booster(model_file=f'/kaggle/input/tfidf-lgbm-hisa/fold_{fold_no}.txt')
        for fold_no in range(1, n_splits + 1)
    )
else:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    # Per-fold results
    f1_scores, kappa_scores, predictions = [], [], []
    callbacks = [
        log_evaluation(period=25),
        early_stopping(stopping_rounds=75, first_metric_only=True),
    ]
    # Hyper-parameters shared by every fold's regressor
    lgbm_params = dict(
        objective=qwk_obj,
        metrics='None',
        learning_rate=0.05,
        max_depth=5,
        num_leaves=10,
        colsample_bytree=0.3,
        reg_alpha=0.7,
        reg_lambda=0.1,
        n_estimators=700,
        random_state=42,
        extra_trees=True,
        class_weight='balanced',
        device='cpu',
        verbosity=-1,
    )

    for i, (train_index, test_index) in enumerate(skf.split(X, y_split), start=1):
        print('fold', i)
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        y_test_fold_int = y_split[test_index]

        model = lgb.LGBMRegressor(**lgbm_params)
        predictor = model.fit(
            X_train_fold,
            y_train_fold,
            eval_names=['train', 'valid'],
            eval_set=[(X_train_fold, y_train_fold), (X_test_fold, y_test_fold)],
            eval_metric=quadratic_weighted_kappa,
            callbacks=callbacks,
        )
        models.append(predictor)

        # Held-out predictions: undo the target shift, clip to 1-6, round
        predictions_fold = (predictor.predict(X_test_fold) + a).clip(1, 6).round()
        predictions.append(predictions_fold)

        # Fold metrics are computed against the integer scores
        f1_fold = f1_score(y_test_fold_int, predictions_fold, average='weighted')
        f1_scores.append(f1_fold)

        kappa_fold = cohen_kappa_score(y_test_fold_int, predictions_fold, weights='quadratic')
        kappa_scores.append(kappa_fold)

        # Persist the booster so a later run can use LOAD = True
        predictor.booster_.save_model(f'fold_{i}.txt')

        print(f'F1 score across fold: {f1_fold}')
        print(f'Cohen kappa score across fold: {kappa_fold}')

In [19]:
# old code(2024-5-28)
# LOAD = False
# n_splits = 15
# models = []
# if LOAD:
#     for i in range(5):
#         models.append(lgb.Booster(model_file=f'../input/tfidf-lgbm-hisa/hisa_fold_{i}.txt'))
        
# else:
#     # OOF is used to store the prediction results of each model on the validation set
#     oof = []
#     x= train_feats
#     y= train_feats['score'].values
#     # 15 fold
#     kfold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
#     callbacks = [log_evaluation(period=25), early_stopping(stopping_rounds=75,first_metric_only=True)]
#     for fold_id, (trn_idx, val_idx) in tqdm(enumerate(kfold.split(x.copy(), y.copy().astype(str)))):
#             # create model
# #             model = lgb.LGBMRegressor(
# #                 objective = qwk_obj,
# #                 metrics = 'None',
# #                 learning_rate = 0.1,
# #                 max_depth = 5, 
# #                 num_leaves = 10,
# #                 colsample_bytree=0.5,
# #                 reg_alpha = 0.1, #L1
# #                 reg_lambda = 0.8, #L2
# #                 n_estimators=1024,
# #                 random_state=42,
# #                 verbosity = - 1)
#             model = lgb.LGBMRegressor(
#                 objective = qwk_obj,
#                 metrics = 'None',
#                 learning_rate = 0.05,
#                 max_depth = 5,
#                 num_leaves = 10,
#                 colsample_bytree=0.3,
#                 reg_alpha = 0.7,
#                 reg_lambda = 0.1,
#                 n_estimators=700,
#                 random_state=42,
#                 extra_trees=True,
#                 class_weight='balanced',
#                 verbosity = - 1)
#             # Take out the training and validation sets for 5 kfold segmentation separately
#             X_train = train_feats.iloc[trn_idx][feature_names]
#             Y_train = train_feats.iloc[trn_idx]['score'] - a

#             X_val = train_feats.iloc[val_idx][feature_names]
#             Y_val = train_feats.iloc[val_idx]['score'] - a
#             print('\nFold_{} Training ================================\n'.format(fold_id+1))
#             # Training model
#             lgb_model = model.fit(X_train,
#                                   Y_train,
#                                   eval_names=['train', 'valid'],
#                                   eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                                   eval_metric=quadratic_weighted_kappa,
#                                   callbacks=callbacks,)
#             # Use the trained model to predict the validation set
#             pred_val = lgb_model.predict(
#                 X_val, num_iteration=lgb_model.best_iteration_)
#             df_tmp = train_feats.iloc[val_idx][['essay_id', 'score']].copy()
#             df_tmp['pred'] = pred_val + a
#             oof.append(df_tmp)
#             # Save model parameters
#             models.append(model.booster_)
#             lgb_model.booster_.save_model(f'hisa_fold_{fold_id}.txt')
#     df_oof = pd.concat(oof)
#     print("train end")

### CV

In [20]:
# if LOAD:
#     print('acc: ',0.6326919743456405)
#     print('kappa: ',0.805136843120887)
# else:
#     acc = accuracy_score(df_oof['score'], df_oof['pred'].clip(1, 6).round())
#     kappa = cohen_kappa_score(df_oof['score'], df_oof['pred'].clip(1, 6).round(), weights="quadratic")
#     print('acc: ',acc)
#     print('kappa: ',kappa)
    
    
if not LOAD:
    # Models were trained in this session: average the per-fold metrics.
    mean_f1_score = np.mean(f1_scores)
    mean_kappa_score = np.mean(kappa_scores)
    print(f'Mean F1 score across {n_splits} folds: {mean_f1_score}')
    print(f'Mean Cohen kappa score across {n_splits} folds: {mean_kappa_score}')
else:
    # Models were loaded from disk, so the fold metrics are not recomputed here;
    # the hard-coded values are printed instead (presumably recorded from the
    # run that produced the saved models -- TODO confirm).
    print(f'Mean F1 score across {n_splits} folds: 0.6694070084827064')
    print(f'Mean Cohen kappa score across {n_splits} folds: 0.835342584985933')

Mean F1 score across 15 folds: 0.6694070084827064
Mean Cohen kappa score across 15 folds: 0.835342584985933


## Submission

In [21]:
# Preprocessing for the test data
# NOTE(review): the vectorizer outputs and the DeBERTa columns are attached by
# row position, which assumes test_feats keeps the row order of `test` (and of
# predicted_score) -- confirm.

# Paragraph-level features
tmp = Paragraph_Preprocess(test)
test_feats = Paragraph_Eng(tmp)

# Sentence-level features
tmp = Sentence_Preprocess(test)
sentence_feats = Sentence_Eng(tmp)
test_feats = test_feats.merge(sentence_feats, on='essay_id', how='left')

# Word-level features
tmp = Word_Preprocess(test)
word_feats = Word_Eng(tmp)
test_feats = test_feats.merge(word_feats, on='essay_id', how='left')

# TF-IDF features from the already-fitted vectorizer
test_tfid = vectorizer.transform(list(test['full_text']))
dense_matrix = test_tfid.toarray()
tfid_columns = [f'tfid_{col}' for col in range(dense_matrix.shape[1])]
df = pd.DataFrame(dense_matrix, columns=tfid_columns)
df['essay_id'] = test_feats['essay_id']
test_feats = test_feats.merge(df, on='essay_id', how='left')

# Count features from the already-fitted CountVectorizer
test_tfid = vectorizer_cnt.transform(list(test['full_text']))
dense_matrix = test_tfid.toarray()
tfid_columns = [f'tfid_cnt_{col}' for col in range(dense_matrix.shape[1])]
df = pd.DataFrame(dense_matrix, columns=tfid_columns)
df['essay_id'] = test_feats['essay_id']
test_feats = test_feats.merge(df, on='essay_id', how='left')

# DeBERTa ensemble outputs, stored under the column names used for the
# training-time OOF features
for class_idx in range(6):
    test_feats[f'deberta_oof_{class_idx}'] = predicted_score[:, class_idx]

# Number of feature columns (everything except the id and the target)
feature_names = [col for col in test_feats.columns if col not in ['essay_id', 'score']]
print('Features number: ',len(feature_names))
test_feats.head(3)

Features number:  21905


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,tfid_cnt_2166,tfid_cnt_2167,tfid_cnt_2168,tfid_cnt_2169,deberta_oof_0,deberta_oof_1,deberta_oof_2,deberta_oof_3,deberta_oof_4,deberta_oof_5
0,000d118,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0.005867,0.274048,0.693365,0.026059,0.000465,0.000196
1,000fe60,5,5,5,5,5,5,5,4,3,...,0,0,0,0,0.000463,0.034626,0.911631,0.052943,0.000277,6e-05
2,001ab80,4,4,4,4,4,4,4,4,4,...,0,0,0,0,0.000969,0.0016,0.018889,0.454451,0.515863,0.008228


In [22]:
# old code(2024-5-28)
# prediction = test_feats[['essay_id']].copy()
# prediction['score'] = 0
# pred_test = models[0].predict(test_feats[feature_names]) + a
# for i in range(4):
#     pred_now = models[i+1].predict(test_feats[feature_names]) + a
#     pred_test = np.add(pred_test,pred_now)
# # The final prediction result needs to be divided by 5 because the prediction results of 5 models were added together
# pred_test = pred_test/15
# print(pred_test)

# Build the inference matrix once instead of re-slicing 13000 columns out of
# test_feats for every model, and use the same float32 ndarray representation
# that the training matrix X is built with.
X_test = test_feats[feature_select].astype(np.float32).values

# Despite the name, these are regression outputs: each model predicts the
# score shifted by -a, so `a` is added back to return to the 1-6 scale.
probabilities = []
for model in models:
    proba = model.predict(X_test) + a
    probabilities.append(proba)

# Average the models' scores, then clip to the valid range and round to
# integer grades.
predictions = np.mean(probabilities, axis=0)
predictions = np.round(predictions.clip(1, 6))

# Print the predictions
print(predictions)

[2. 3. 5.]


In [23]:
# old code(2024-5-28)
# Round the prediction result to an integer and limit it to a range of 1-6 (score range)
# pred_test = pred_test.clip(1, 6).round()
# prediction['score'] = pred_test
# prediction.to_csv('submission.csv', index=False)
# prediction.head(3)

# `predictions` is row-aligned with test_feats, so key each score by its
# essay_id instead of relying on sample_submission.csv having the same row order.
score_by_id = pd.Series(predictions, index=test_feats['essay_id'].to_numpy())

submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
submission['score'] = submission['essay_id'].map(score_by_id)
# Fail loudly rather than writing a partially filled or misaligned file.
if submission['score'].isna().any():
    raise ValueError('sample_submission.csv contains essay_id values without a prediction')
submission['score'] = submission['score'].astype(int)
submission.to_csv("submission.csv", index=False)
display(submission.head())

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,5


## Reference Notebook
#### I would like to give thanks to the authors of these public notebooks. I have learned a lot from you.
* https://www.kaggle.com/code/davidjlochner/base-tfidf-lgbm
* https://www.kaggle.com/code/yunsuxiaozi/aes2-0-baseline-naivebayesclassifier
* https://www.kaggle.com/code/finlay/llm-detect-0-to-1
* https://www.kaggle.com/code/awqatak/silver-bullet-single-model-165-features
* https://www.kaggle.com/code/hiarsl/feature-engineering-sentence-paragraph-features
* https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective