


🌟 **@NOGAWANOGAWA**: Thank you for your generosity in sharing such valuable information! 🙏

🔥 **@OLEKSIY KONONENKO**: Immensely grateful for the script! I grabbed it immediately! 😂


🔍 **Context**: Due to GPU limitations, I've focused on optimizing a shallow model for best results.



🛠️ **My Setup**:
1. **LightGBM**: Utilized with nearly 30 handcrafted features to cover most scoring criteria. 📊
2. **4-Fold Cross-Validation**: Grouped by `prompt_id`. 🔄
3. **Optuna**: Conducted 100 trials for all folds. 🎯

📈 **Initial Result**: Achieved a score of approximately 0.51.



🔄 **Changes by OLEKSIY**:
- Scrapped the `prompt_quest` meta-information, leading to significant changes.
- Grouped by `grade` and used the median instead of the mean. 📝

📈 **Updated Result**: Score improved to 0.49. I'm contemplating strategies to push the limit further. 🚀



👍 If you find this insightful, please upvote! 🌟



# Initization

In [1]:
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"
#!pip install "/kaggle/input/pyphen-0100/Pyphen-0.10.0-py3-none-any.whl"

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=baa1756506121a9560a34bc5d1c0f719c963a68ef342cf0171299c689c66b55f
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


# Meta Data Cleansing

In [2]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

class FeatureEngineering:
    
    def __init__(self, df):
        self.df = df
        self.df['grade'].fillna(0, inplace=True)  # Fill NA values in 'grade' with 0

    def classify_author(self, author):
        doc = nlp(author)
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                return 'person'
        return 'org'

    def encode_author_type(self):
        self.df['author_type'] = self.df['author'].apply(self.classify_author)
        le = LabelEncoder()
        self.df['author_type'] = le.fit_transform(self.df['author_type'])

    def frequency_encoding(self):
        logging.info("Applying Frequency Encoding on 'author'")
        self.df['author_frequency'] = self.df['author'].map(self.df['author'].value_counts())

    def one_hot_encoding(self):
        logging.info("Applying One-Hot Encoding on 'genre'")
        onehot_encoder = OneHotEncoder(sparse=False)
        genre_onehot = onehot_encoder.fit_transform(self.df[['genre']])
        df_onehot = pd.DataFrame(genre_onehot, columns=onehot_encoder.get_feature_names_out(['genre']))
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def feature_scaling(self):
        logging.info("Applying Feature Scaling on 'lexile'")
        scaler = StandardScaler()
        self.df['lexile_scaled'] = scaler.fit_transform(self.df[['lexile']])

    def transform(self):
        self.encode_author_type()
        self.frequency_encoding()
#         self.one_hot_encoding()
        self.feature_scaling()
        return self.df

# Initialize FeatureEngineering class and apply transformations
prompt_grade = pd.read_csv(r'/kaggle/input/commonlit-texts/commonlit_texts.csv')
feature_engineer = FeatureEngineering(prompt_grade)
transformed_df = feature_engineer.transform()

# Display the transformed DataFrame
prompt_grade = transformed_df

In [3]:
keep_columns = ['title','author','description','grade','genre','lexile','lexile_scaled','is_prose','author_type','author_frequency']
prompt_grade = prompt_grade[keep_columns]

In [4]:
# prompt_grade = prompt_grade[['title','grade','lexile_md','genre_big_group_encode','author_type']]

In [5]:
# for _ in list(set(df.author.to_list())):
#     print(_)

# Import Data

In [6]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import optuna
import optuna.integration.lightgbm as lgb
#import pyphen
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
from nltk import ne_chunk, word_tokenize, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)



In [7]:
class CFG:
    model_name="debertav3base"
    learning_rate=1.5e-5
    weight_decay=0.02
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    n_splits=4
    batch_size=12
    random_seed=42
    save_steps=100
    max_length=512
    adjustment_factor= 0.5 

# Joining prmpt and meta data 

In [8]:
def preprocess_and_join(df1, df2, df1_title_col, df2_title_col, grade_col):
    # Copy dataframes to avoid modifying the originals
    df1 = df1.copy()
    df2 = df2.copy()

    # Preprocess titles
    df1[df1_title_col] = df1[df1_title_col].str.replace('"', '').str.strip()
    df2[df2_title_col] = df2[df2_title_col].str.replace('"', '').str.strip()

    # Remove duplicate grades
    df2 = df2.drop_duplicates(subset=df2_title_col, keep='first')

    # Join dataframes
    merged_df = df1.merge(df2, how='left', left_on=df1_title_col, right_on=df2_title_col)
    

    # Postprocess grades
    merged_df[grade_col] = merged_df[grade_col].fillna(0)
    merged_df[grade_col] = merged_df[grade_col].astype(int).astype('category')

 
    return merged_df

# Usage
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"
prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")
# prompt_grade = pd.read_csv(r'/kaggle/input/litess-titles/all_titles.csv')
prompts_train = preprocess_and_join(prompts_train, prompt_grade, 'prompt_title', 'title', 'grade')
prompts_test = preprocess_and_join(prompts_test, prompt_grade, 'prompt_title', 'title', 'grade')

In [9]:
prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,title,author,description,grade,genre,lexile,lexile_scaled,is_prose,author_type,author_frequency
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,On Tragedy,Aristotle,"This excerpt from Aristotle's famous work ""Poe...",9,Philosophy,1070.0,0.341991,1,1,2
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,Egyptian Social Structure,USHistory.org,This informational text describes the social s...,7,Informational Text,890.0,-0.387469,1,0,42
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,The Third Wave,CommonLit Staff,"In 1967, a history teacher's social experiment...",9,Informational Text,1260.0,1.111977,1,0,24
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",Excerpt from The Jungle,Upton Sinclair,"In this disturbing piece of political fiction,...",11,Fiction - General,1400.0,1.679335,1,0,1


In [10]:
prompts_test

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,title,author,description,grade,genre,lexile,lexile_scaled,is_prose,author_type,author_frequency
0,abc123,Summarize...,Example Title 1,Heading\nText...,,,,0,,,,,,
1,def789,Summarize...,Example Title 2,Heading\nText...,,,,0,,,,,,


# Documentation for Text Preprocessing Function `run`

## Overview

The `run` function is a comprehensive text preprocessing pipeline designed to prepare and enrich text data for further analysis or machine learning tasks. The function takes in two data frames, `prompts` and `summaries`, along with a `mode` parameter, and returns a processed data frame with various linguistic and statistical features.

---

## Parameters

- **prompts: pd.DataFrame**  
  - A DataFrame containing the prompts with a column named `prompt_text`.

- **summaries: pd.DataFrame**  
  - A DataFrame containing the summaries with a column named `text`.

- **mode: str**  
  - The mode in which the function operates, although its specific use is not detailed in the code snippet.

---

## Features Grouped by Cognitive or Exam Criteria

### Text Length and Tokenization

- `prompt_length`: Length of the prompt in terms of tokens.
- `summary_length`: Length of the summary in terms of tokens.
- `prompt_tokens`: Tokenized form of the prompt.
- `summary_tokens`: Tokenized form of the summary.

### Spelling and Grammar

- `splling_err_num`: Number of spelling errors in the summary.
- `gunning_fog`, `flesch_kincaid_grade_level`, `flesch_reading_ease`: Readability scores for both prompts and summaries.

### Linguistic Features

- `word_count`, `sentence_length`, `vocabulary_richness`: Basic text statistics.
- `avg_word_length`, `comma_count`, `semicolon_count`: Additional linguistic features.
- `pos_ratios`: Part-of-speech ratios in the text.
- `punctuation_ratios`: Punctuation ratios in the text.

### Text Similarity and Overlap

- `word_overlap_count`, `bigram_overlap_count`, `trigram_overlap_count`: N-gram overlaps between prompts and summaries.
- `jaccard_similarity`: Jaccard similarity between prompts and summaries.
- `text_similarity`: Custom text similarity metric.

### Sentiment Analysis

- `sentiment_polarity`, `sentiment_subjectivity`: Sentiment scores.
- `sentiment_scores`: Detailed sentiment scores, further decomposed into individual columns.



In [11]:
#dic = pyphen.Pyphen(lang='en')
sid = SentimentIntensityAnalyzer()

class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        #self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def calculate_text_similarity(self, row):
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([row['prompt_text'], row['text']])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
    
    def sentiment_analysis(self, text):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity, analysis.sentiment.subjectivity
    
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def calculate_unique_words(self,text):
        unique_words = set(text.split())
        return len(unique_words)
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def calculate_pos_ratios(self , text):
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        ratios = {tag: count / total_words for tag, count in pos_counts.items()}
        return ratios
    
    def calculate_punctuation_ratios(self,text):
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios
    
    def calculate_keyword_density(self,row):
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)
    
    def count_syllables(self,word):
        word = word.lower()
        vowels = "aeiouy"
        count = 0
        count += sum(1 for letter in word if letter in vowels)
        if word.endswith('e'):
            count -= 1
        count -= sum(word.count(diph) for diph in ['oi', 'oy', 'ou', 'ow', 'au', 'aw', 'oo', 'ee', 'ea', 'ie', 'ei', 'ai', 'ay', 'ey', 'ua', 'ue', 'ui'])
        for i in range(1, len(word) - 1):
            if word[i] not in vowels and word[i-1] in vowels and word[i+1] in vowels:
                count += 1
        count = max(1, count)
        return count

    def flesch_reading_ease_manual(self,text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        flesch_score = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
        return flesch_score
    
    def flesch_kincaid_grade_level(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        fk_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
        return fk_grade
    
    def gunning_fog(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        complex_words = sum(1 for word in TextBlob(text).words if self.count_syllables(word) > 2)

        if total_sentences == 0 or total_words == 0:
            return 0

        fog_index = 0.4 * ((total_words / total_sentences) + 100 * (complex_words / total_words))
        return fog_index
    
    def calculate_sentiment_scores(self,text):
        sentiment_scores = sid.polarity_scores(text)
        return sentiment_scores
    
    def count_difficult_words(self, text, syllable_threshold=3):
        words = TextBlob(text).words
        difficult_words_count = sum(1 for word in words if self.count_syllables(word) >= syllable_threshold)
        return difficult_words_count


    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].progress_apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].progress_apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].progress_apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].progress_apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].progress_apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
        prompts['gunning_fog_prompt'] = prompts['prompt_text'].progress_apply(self.gunning_fog)
        prompts['flesch_kincaid_grade_level_prompt'] = prompts['prompt_text'].progress_apply(self.flesch_kincaid_grade_level)
        prompts['flesch_reading_ease_prompt'] = prompts['prompt_text'].progress_apply(self.flesch_reading_ease_manual)

        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
#         summaries["fixed_summary_text"] = summaries["text"].progress_apply(
#             lambda x: self.speller(x)
#         )
        
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        input_df['flesch_reading_ease'] = input_df['text'].progress_apply(self.flesch_reading_ease_manual)
        input_df['word_count'] = input_df['text'].progress_apply(lambda x: len(x.split()))
        input_df['sentence_length'] = input_df['text'].progress_apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].progress_apply(lambda x: len(set(x.split())))

        input_df['word_count2'] = [len(t.split(' ')) for t in input_df.text]
        input_df['num_unq_words']=[len(list(set(x.lower().split(' ')))) for x in input_df.text]
        input_df['num_chars']= [len(x) for x in input_df.text]

        # Additional features
        input_df['avg_word_length'] = input_df['text'].progress_apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].progress_apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].progress_apply(lambda x: x.count(';'))

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        input_df['exclamation_count'] = input_df['text'].progress_apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].progress_apply(lambda x: x.count('?'))
        input_df['pos_ratios'] = input_df['text'].progress_apply(self.calculate_pos_ratios)

        # Convert the dictionary of POS ratios into a single value (mean)
        input_df['pos_mean'] = input_df['pos_ratios'].progress_apply(lambda x: np.mean(list(x.values())))
        input_df['punctuation_ratios'] = input_df['text'].progress_apply(self.calculate_punctuation_ratios)

        # Convert the dictionary of punctuation ratios into a single value (sum)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].progress_apply(lambda x: np.sum(list(x.values())))
        input_df['keyword_density'] = input_df.progress_apply(self.calculate_keyword_density, axis=1)
        input_df['jaccard_similarity'] = input_df.progress_apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)
        tqdm.pandas(desc="Performing Sentiment Analysis")
        input_df[['sentiment_polarity', 'sentiment_subjectivity']] = input_df['text'].progress_apply(
            lambda x: pd.Series(self.sentiment_analysis(x))
        )
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)
        #Calculate sentiment scores for each row
        input_df['sentiment_scores'] = input_df['text'].progress_apply(self.calculate_sentiment_scores)
        
        input_df['gunning_fog'] = input_df['text'].progress_apply(self.gunning_fog)
        input_df['flesch_kincaid_grade_level'] = input_df['text'].progress_apply(self.flesch_kincaid_grade_level)
        input_df['count_difficult_words'] = input_df['text'].progress_apply(self.count_difficult_words)

        # Convert sentiment_scores into individual columns
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].progress_apply(self.calculate_sentiment_scores)
        # Convert sentiment_scores_prompt into individual columns
        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]
        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)
        columns =  ['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']
        cols_to_drop = [col for col in columns if col in input_df.columns]
        if cols_to_drop:
            input_df = input_df.drop(columns=cols_to_drop)
        
        print(cols_to_drop)
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

# Group by grade instead of prompt_id

In [12]:
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

# Calculate the number of unique groups
n_unique_groups = train["grade"].nunique()

# Set n_splits to be the smaller of CFG.n_splits and the number of unique groups
n_splits = min(CFG.n_splits, n_unique_groups)
gkf = GroupKFold(n_splits=n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["grade"])):
    train.loc[val_index, "fold"] = i

100%|██████████| 4/4 [00:00<00:00, 119.87it/s]
100%|██████████| 4/4 [00:00<00:00, 197.30it/s]
100%|██████████| 7165/7165 [00:04<00:00, 1736.47it/s]
100%|██████████| 7165/7165 [00:04<00:00, 1695.51it/s]
100%|██████████| 4/4 [00:00<00:00, 38.82it/s]
100%|██████████| 4/4 [00:00<00:00, 41.33it/s]
100%|██████████| 4/4 [00:00<00:00, 48.60it/s]
100%|██████████| 4/4 [00:00<00:00, 47.99it/s]
100%|██████████| 7165/7165 [00:01<00:00, 7155.02it/s]
100%|██████████| 7165/7165 [00:17<00:00, 406.63it/s]
100%|██████████| 7165/7165 [00:00<00:00, 153396.36it/s]
100%|██████████| 7165/7165 [00:00<00:00, 434801.69it/s]
100%|██████████| 7165/7165 [00:00<00:00, 76498.06it/s]
100%|██████████| 7165/7165 [00:00<00:00, 38481.68it/s]
100%|██████████| 7165/7165 [00:00<00:00, 444618.19it/s]
100%|██████████| 7165/7165 [00:00<00:00, 565368.98it/s]
100%|██████████| 7165/7165 [00:00<00:00, 8309.58it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4402.90it/s]
100%|██████████| 7165/7165 [00:01<00:00, 3937.99it/s]
100%|██████

['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']


Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 2286.97it/s]
Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 2746.76it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 4549.14it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 4819.65it/s]
Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 38.74it/s]
Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 1367.11it/s]
Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 1522.43it/s]
Calculating Text Similarity: 100%|██████████| 2/2 [00:00<00:00, 1607.01it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 8991.01it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 2070.49it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 8616.96it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 6288.31it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 7096.96it/s]
C

['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']





In [13]:
# train = preprocessor.run(prompts_train, summaries_train, mode="train")
# test = preprocessor.run(prompts_test, summaries_test, mode="test")

# # Calculate the number of unique groups
# n_unique_groups = train["grade"].nunique()

# # Set n_splits to be the smaller of CFG.n_splits and the number of unique groups
# n_splits = min(CFG.n_splits, n_unique_groups)
# gkf = GroupKFold(n_splits=n_splits)

# for i, (_, val_index) in enumerate(gkf.split(train, groups=train["grade"])):
#     train.loc[val_index, "fold"] = i

In [14]:
CFG.n_splits = n_splits

In [15]:
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,count_difficult_words,neg,neu,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,13,0.033,0.832,0.135,0.7845,0.027,0.873,0.1,0.9915,0.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,3,0.0,0.946,0.054,0.431,0.086,0.879,0.035,-0.9949,2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,52,0.047,0.814,0.139,0.9725,0.063,0.845,0.092,0.9283,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,5,0.0,1.0,0.0,0.0,0.063,0.845,0.092,0.9283,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,60,0.0,0.896,0.104,0.9696,0.027,0.873,0.1,0.9915,0.0


In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

# 📄 LightGBM Hyperparameter Tuning with Optuna: Professional Documentation

## 🎯 Overview

The script performs hyperparameter optimization for LightGBM using Optuna. It aims to minimize the RMSE (Root Mean Square Error) for a regression task. The script employs k-fold cross-validation and saves the best models for each target.

---

## 🛠 Parameters

- **boosting_type**: Gradient Boosting Decision Tree (`gbdt`).
- **random_state**: Seed for reproducibility (`42`).
- **objective**: Task objective (`regression`).
- **metric**: Evaluation metric (`rmse`).
- **learning_rate**: Learning rate, optimized by Optuna.
- **max_depth**: Maximum depth of the trees.
- **lambda_l1, lambda_l2**: L1 and L2 regularization, optimized by Optuna.
- **num_leaves**: Number of leaves, optimized by Optuna.
- **verbosity**: Logging level (`-1` to suppress warnings).

---

## 📊 Model Training

1. **Initialization**: Create an empty dictionary `model_dict` to store the best models for each target.
2. **Cross-Validation**: Loop through each fold and split the data into training and validation sets.
3. **Optimization**: Use Optuna to optimize hyperparameters.
4. **Model Training**: Train LightGBM models with the optimized parameters.
5. **Evaluation**: Store the best models and their scores.

---

## 📈 Key Functions

- `lgb.train()`: Trains the LightGBM model.
- `optuna.create_study()`: Creates an Optuna study object.
- `study.optimize()`: Runs the optimization.

---

## 📋 Logging Insights

Structured logs can be incorporated to track the progress of each trial and the best parameters found.

```python
import logging
logging.info(f"Best trial: score {study.best_value}, params {study.best_params}")
```

---

In [17]:
# targets = ["content", "wording"]

# drop_columns = ["fold", "student_id", "prompt_id", "text",
#                 "prompt_question", "prompt_title", 
#                 "prompt_text","title", "author", "description", "genre"
#                ] + targets


# def identify_invalid_dtype_columns(df, valid_dtypes):
#     invalid_columns = [col for col in df.columns if df[col].dtype not in valid_dtypes]
#     if invalid_columns and invalid_columns not in drop_columns:
#         print(f"Columns with invalid data types: {invalid_columns}")
#     else:
#         print("All columns have valid data types.")

# # List of valid data types
# valid_dtypes = [int, float, bool]

# # Run the function to identify columns with unexpected data types
# identify_invalid_dtype_columns(train, valid_dtypes)

In [18]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title", 
                "prompt_text","title", "author", "description", "genre"
               ] + targets

N = 10  # Adjust based on preference or observations
import optuna
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
import xgboost as xgb
def objective(trial, X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
    dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)
    max_depth = trial.suggest_int('max_depth', 9, 20)
    params = {
        'boosting_type': 'gbdt',
        'random_state': 42,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': max_depth,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 10),
        'verbosity': -3  # Add this line to suppress warnings and info messages

    }

    evaluation_results = {}
    model = lgb.train(params,
                      num_boost_round=10000,
                      valid_names=['train', 'valid'],
                      train_set=dtrain,
                      valid_sets=dval,
                      verbose_eval=1000,
                      early_stopping_rounds=30,
                      callbacks=[lgb.record_evaluation(evaluation_results)])

    # Use the last metric for early stopping
    evals_result = model.best_score
    last_metric = list(evals_result.values())[-1]
    trial.set_user_attr('best_model', model)  # Save the model in the trial
    return last_metric[list(last_metric.keys())[-1]]

model_dict = {
    "content": {
        "main_models": [],
        "post_models": [],
        "top_features": []
    },
    "wording": {
        "main_models": [],
        "post_models": [],
        "top_features": []
    }
}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):
        print(f'For {target} and fold {fold}')
        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: objective(trial, X_train_cv, y_train_cv, X_eval_cv, y_eval_cv), n_trials=100)
        
        print('Best trial: score {}, params {}'.format(study.best_value, study.best_params))

        best_model = study.trials[study.best_trial.number].user_attrs['best_model']
        model_dict[target]["main_models"].append(best_model)
        
        y_pred_val = best_model.predict(X_eval_cv)
        residuals_val = y_eval_cv - y_pred_val
        
        correlations = X_eval_cv.corrwith(pd.Series(residuals_val))
        top_features = correlations.abs().sort_values(ascending=False).head(N).index
        
        xgb_reg = xgb.XGBRegressor(learning_rate=0.01, max_depth =3,min_child_weight=1, gamma = 0, subsample=0.6, n_estimators=1000,objective="reg:squarederror", booster= "gbtree"  )
        xgb_reg.fit(X_eval_cv[top_features],residuals_val)
        model_dict[target]["post_models"].append(xgb_reg)
        model_dict[target]["top_features"].append(top_features)
#     model_dict[target] = models

For content and fold 0
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[910]	train's rmse: 0.466484
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[312]	train's rmse: 0.465688
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[251]	train's rmse: 0.4682
Training until validation scores don't improve for 30 rounds
[1000]	train's rmse: 0.494725
[2000]	train's rmse: 0.479097
[3000]	train's rmse: 0.471811
[4000]	train's rmse: 0.468214
[5000]	train's rmse: 0.465933
[6000]	train's rmse: 0.464563
[7000]	train's rmse: 0.463653
Early stopping, best iteration is:
[7087]	train's rmse: 0.463564
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[165]	train's rmse: 0.465389
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[293]	train's rmse: 0.466204
Training until val

In [19]:
def adjust_predictions(X, main_model, post_process_model, top_features):
    main_pred = main_model.predict(X)
    #dtest = xgb.DMatrix(X[top_features])
    adjustment_values = post_process_model.predict(X[top_features])
    print(main_pred)
    print(adjustment_values)
    return main_pred + adjustment_values

In [20]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]['main_models']

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.473593153079758
wording_rmse : 0.6269464405628608
mcrmse : 0.5502697968213094


In [21]:
for target in targets:
    adj_preds = []
    adj_trues = []
    for fold in range(CFG.n_splits):
        print(f'For {target} and fold {fold}')
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]
        main_model = model_dict[target]["main_models"][fold]
        post_model = model_dict[target]["post_models"][fold]
        top_features_for_fold  =  model_dict[target]["top_features"][fold]
        adjusted_preds = adjust_predictions(X_eval_cv, main_model, post_model, top_features_for_fold)
        adj_trues.extend(y_eval_cv)
        adj_preds.extend(adjusted_preds)
        
    rmse = np.sqrt(mean_squared_error(adj_trues, adj_preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]
    
print(f"mcrmse : {sum(rmses) / len(rmses)}")

For content and fold 0
[-0.08591465  2.20268918 -0.12398932 ...  1.68067757 -0.27837908
 -0.37692698]
[0.05325427 0.78904307 0.04733507 ... 0.4629864  0.08407933 0.01785106]
For content and fold 1
[ 2.260857   -0.95026464  0.34885964 ...  0.53150487 -0.23976734
 -0.22056285]
[ 0.30741692 -0.09709336 -0.52047276 ... -0.07599487 -0.13985902
 -0.43074024]
For content and fold 2
[-0.42549884 -0.76803258 -0.63832177 ...  0.47517993 -0.30347078
  0.7730632 ]
[-0.03480146  0.35953164  0.08014546 ...  0.06225091  0.07976427
  0.3645098 ]
content_rmse : 0.37299035044478734
For wording and fold 0
[ 0.56316995  2.05459527  0.11974757 ...  0.98129833 -0.41767359
  0.08998563]
[3.8479808e-01 3.8479808e-01 2.1898717e-04 ... 3.8479808e-01 2.1898717e-04
 2.1898717e-04]
For wording and fold 1
[ 1.17222953 -0.39856105  0.43717492 ...  0.57256657  0.03778151
  0.08617045]
[ 1.5045826  -0.39562267 -0.03379241 ... -0.31459117 -0.01240441
 -0.01027669]
For wording and fold 2
[ 0.23524524  0.06230033 -0.0121

# Create Submission Infomration

In [22]:
# drop_columns = [
#                 #"fold", 
#                 "student_id", "prompt_id", "text", "fixed_summary_text",
#                 "prompt_question", "prompt_title", 
#                 "prompt_text",
#                 "input"
#                ] + [
#                 f"content_pred_{i}" for i in range(CFG.n_splits)
#                 ] + [
#                 f"wording_pred_{i}" for i in range(CFG.n_splits)
#                 ]


drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title", 
                "prompt_text","title", "author", "description", "genre"]

In [23]:
pred_dict = {}

for target in targets:
    models = model_dict[target]["main_models"]
    post_models = model_dict[target]["post_models"]
    top_features_list = model_dict[target]["top_features"]
    preds = []

    for fold, (model, post_model, top_features) in enumerate(zip(models, post_models, top_features_list)):
        X_eval_cv = test.drop(columns=drop_columns)
        #X_eval_cv.fillna(0,inplace=True)
        pred = adjust_predictions(X_eval_cv, model, post_model, top_features)
        preds.append(pred)
    
    pred_dict[target] = preds
    
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    # Calculate the median across the K-Fold predictions
    medians = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].median(axis=1)

    # Calculate the standard deviation across the K-Fold predictions
    std_devs = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].std(axis=1)

    # Adjust the median using the standard deviation
    adjusted_medians = medians + (CFG.adjustment_factor * std_devs)

    test[target] = adjusted_medians

    print(test)

[-1.48745238 -1.48745238 -1.48745238 -1.48745238]
[-0.40551642 -0.40551642 -0.40551642 -0.40551642]
[-0.83593238 -0.83593238 -0.83593238 -0.83593238]
[-1.7302157 -1.7302157 -1.7302157 -1.7302157]
[-1.36687625 -1.36687625 -1.36687625 -1.36687625]
[-0.07307541 -0.07307541 -0.07307541 -0.07307541]
[-1.24134365 -1.24134365 -1.24134365 -1.24134365]
[0.00021899 0.00021899 0.00021899 0.00021899]
[0.95691048 0.95691048 0.95691048 0.95691048]
[-0.4262491 -0.4262491 -0.4262491 -0.4262491]
[-1.7255097 -1.7255097 -1.7255097 -1.7255097]
[-0.38556734 -0.38556734 -0.38556734 -0.38556734]
     student_id prompt_id            text  summary_length  splling_err_num  \
0  000000ffffff    abc123  Example text 1               3                0   
1  111111eeeeee    def789  Example text 2               3                0   
2  222222cccccc    abc123  Example text 3               3                0   
3  333333dddddd    def789  Example text 4               3                0   

  prompt_question     prompt_

In [24]:
# pred_dict = {}
# for target in targets:
#     models = model_dict[target]
#     preds = []

#     for fold, model in enumerate(models):
#         X_eval_cv = test.drop(columns=drop_columns)

#         pred = model.predict(X_eval_cv)
#         preds.append(pred)
    
#     pred_dict[target] = preds
    
    
# for target in targets:
#     preds = pred_dict[target]
#     for i, pred in enumerate(preds):
#         test[f"{target}_pred_{i}"] = pred

#     # Calculate the median across the K-Fold predictions
#     medians = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].median(axis=1)

#     # Calculate the standard deviation across the K-Fold predictions
#     std_devs = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].std(axis=1)

#     # Adjust the median using the standard deviation
#     adjusted_medians = medians + (CFG.adjustment_factor * std_devs)

#     test[target] = adjusted_medians

#     print(test)

In [25]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

In [26]:
!touch submission.csv