In [None]:
from pathlib import Path
import re

INPUT_DIR = Path("../input/")

In [None]:
!pip install -q /kaggle/input/language-tool-python-2-7-1/language_tool_python-2.7.1-py3-none-any.whl

In [None]:
import os
import zipfile
from zipfile import ZipFile
import shutil

# create download path
def get_language_tool_cache_path():
    """Return the LanguageTool cache directory, creating it when missing.

    The directory is read from the ``LTP_PATH`` environment variable; when
    that variable is unset it falls back to
    ``~/.cache/language_tool_python``.

    Returns:
    - str: Path of the (now existing) cache directory.
    """
    fallback_dir = os.path.join(os.path.expanduser("~"), ".cache", "language_tool_python")
    cache_dir = os.environ.get('LTP_PATH', fallback_dir)
    # exist_ok keeps repeated calls harmless
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir

# Resolve (and create if needed) the LanguageTool cache directory; the bare
# expression on the last line displays the path as the cell output.
lt_path = get_language_tool_cache_path()
lt_path

'/root/.cache/language_tool_python'

In [None]:
def get_all_file_paths(directory):
    """Collect the path of every file found under ``directory``.

    Args:
    - directory (str): Root folder to walk recursively.

    Returns:
    - list[str]: One path per file, each built by joining the walked folder
      with the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(directory)
        for filename in filenames
    ]

def main():
    """Zip the bundled LanguageTool-5.7 folder into ``./lt.zip``.

    Every file found under the input folder is written to the archive under
    the same path it was discovered with, then a confirmation is printed.
    """
    # folder whose contents get archived
    directory = '../input/language-tool-python-2-7-1/LanguageTool-5.7/LanguageTool-5.7'

    file_paths = get_all_file_paths(directory)

    # the archive is closed automatically when the with-block exits
    with ZipFile('./lt.zip','w') as archive:
        for path in file_paths:
            archive.write(path)

    print('All files zipped successfully!')

main()

zip_file = "./lt.zip"

try:
    with zipfile.ZipFile(zip_file) as z:
        z.extractall()
        print("Extracted all")
except:
    print("Invalid file")

#move to cache
!mv {'./input/language-tool-python-2-7-1/LanguageTool-5.7/LanguageTool-5.7'} {lt_path}
print(os.listdir('/root/.cache/language_tool_python/'))

#remove files from output

shutil.rmtree('./input')
os.remove("./lt.zip")

All files zipped successfully!
Extracted all
['LanguageTool-5.7']


In [None]:
import language_tool_python
# Start a LanguageTool checker for American English; grammar() later calls
# tool.check() on each essay to count reported mistakes.
tool = language_tool_python.LanguageTool('en-US')

In [None]:
import pandas as pd  # For data manipulation
import gc  # To manage memory manually
import pickle  # For object serialization
import torch  # For GPU computation


# Load the test dataset only to find out how many rows it has
_test = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")
ENABLE_DONT_WASTE_YOUR_RUN_TIME = len(_test) < 10  # True when the test set has fewer than 10 rows; later cells use it to load cached features

# On the tiny-dataset path, drop the temporary frame right away.
# NOTE: when the test set has 10 or more rows, _test is left in memory.
if ENABLE_DONT_WASTE_YOUR_RUN_TIME:
    import shutil
    del _test
    gc.collect()

# Check and display whether a CUDA-capable GPU is available
CUDA_AVAILABLE = torch.cuda.is_available()
print(f"{CUDA_AVAILABLE = }")

CUDA_AVAILABLE = True


In [None]:
import xgboost as xgb
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from glob import glob
from scipy.special import softmax

MAX_LENGTH = 1024  # maximum token count per essay; longer inputs are truncated by tokenize()
TEST_DATA_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
MODEL_PATH = '/kaggle/input/aes2-400-20240419134941/*/*'  # glob pattern matching the saved model directories
EVAL_BATCH_SIZE = 1 # per-device batch size used for prediction

2024-06-13 21:47:00.740260: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 21:47:00.740406: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 21:47:00.848358: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Deberta Model

This code loads multiple pretrained DeBERTa models, tokenizes a test dataset, and uses the models to generate predictions. The predictions are averaged to obtain a final predicted score. Memory is managed by deleting models and clearing the CUDA cache after each iteration.

In [None]:
# Get the list of model paths. glob() returns matches in arbitrary order, so
# sort them to make the tokenizer choice and the averaging order reproducible.
models = sorted(glob(MODEL_PATH))
if not models:
    # Fail with a clear message instead of an IndexError on models[0] below
    raise FileNotFoundError(f"No model checkpoints found matching {MODEL_PATH!r}")

# Load the tokenizer from the first model; the same tokenizer is used for every model
tokenizer = AutoTokenizer.from_pretrained(models[0])

# Function to tokenize input text
def tokenize(sample):
    """Tokenize one record's 'full_text', truncating to MAX_LENGTH tokens."""
    return tokenizer(sample['full_text'], max_length=MAX_LENGTH, truncation=True)

# Load the test dataset
df_test = pd.read_csv(TEST_DATA_PATH)

# Convert dataset to Hugging Face's Dataset format, tokenize it, and remove unnecessary columns
ds = Dataset.from_pandas(df_test).map(tokenize).remove_columns(['essay_id', 'full_text'])

# Define evaluation arguments for the Trainer
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    report_to="none"  # Disable logging to external tracking tools
)

predictions = []

# Loop through each model for inference
for model_path in models:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)  # Load the model

    # Initialize the Trainer for evaluation
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer
    )

    # Get model predictions
    preds = trainer.predict(ds).predictions
    predictions.append(softmax(preds, axis=-1))  # Apply softmax to get probabilities

    # Free memory after processing each model
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

# Aggregate predictions by averaging the per-model probabilities
predicted_score = 0.

for p in predictions:
    predicted_score += p

predicted_score /= len(predictions)  # Compute final averaged prediction


  0%|          | 0/3 [00:00<?, ?ex/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Turn the averaged class probabilities into a score:
# argmax picks the most probable class index and 1 is added to shift it into the score range
df_test['score'] = predicted_score.argmax(-1) + 1

# Display the first few rows of the updated dataframe
df_test.head()


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,5


In [None]:
# Save the DeBERTa-only predictions (essay_id, score) without the index column
df_test[['essay_id', 'score']].to_csv('submission1.csv', index=False)

In [None]:
# Importing necessary libraries
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor
import numpy as np
import pandas as pd
import re
import spacy
import language_tool_python
import string
import random
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier,BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.metrics import cohen_kappa_score
from lightgbm import log_evaluation, early_stopping
from sklearn.linear_model import SGDClassifier
import polars as pl
import joblib



In [None]:
# Expression that splits each essay on "\n\n" into a list column named "paragraph"
columns = [
    (
        pl.col("full_text").str.split(by="\n\n").alias("paragraph")
    ),
]
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"

# Load the training and test sets and add the "paragraph" list column to each
train = pl.read_csv(PATH + "train.csv").with_columns(columns)
test = pl.read_csv(PATH + "test.csv").with_columns(columns)

# spaCy pipeline; count_spelling_errors uses it to lemmatize tokens
nlp = spacy.load("en_core_web_sm")
# Reference vocabulary: each line of words.txt, stripped and lowercased
with open('/kaggle/input/english-word-hx/words.txt', 'r') as file:
    english_vocab = set(word.strip().lower() for word in file)

# Display the first sample data in the training set
train.head(1)

essay_id,full_text,score,paragraph
str,str,i64,list[str]
"""000d118""","""Many people ha…",3,"[""Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won't see a car in Vauban's streets because they are completely ""car free"" but If some that lives in VAUBAN that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $40,000 along with a home. The vauban people completed this in 2006 ,they said that this an example of a growing trend in Europe,The untile states and some where else are suburban life from auto use this is called ""smart planning"". The current efforts to drastically reduce greenhouse gas emissions from tailes the passengee cars are responsible for 12 percent of greenhouse gas emissions in Europe and up to 50 percent in some car intensive in the United States. I honeslty think that good idea that they did that is Vaudan because that makes cities denser and better for walking and in VAUBAN there are 5,500 residents within a rectangular square mile. 
In the artical David Gold berg said that ""All of our development since World war 2 has been centered on the cars,and that will have to change"" and i think that was very true what David Gold said because alot thing we need cars to do we can go anyway were with out cars beacuse some people are a very lazy to walk to place thats why they alot of people use car and i think that it was a good idea that that they did that in VAUBAN so people can see how we really don't need car to go to place from place because we can walk from were we need to go or we can ride bycles with out the use of a car. It good that they are doing that if you thik about your help the earth in way and thats a very good thing to. In the United states ,the Environmental protection Agency is promoting what is called ""car reduced""communtunties,and the legislators are starting to act,if cautiously. Maany experts expect pubic transport serving suburbs to play a much larger role in a new six years federal transportation bill to approved this year. In previous bill,80 percent of appropriations have by law gone to highways and only 20 percent to other transports. There many good reason why they should do this. ""]"


# Data Preprocessing

The dataPreprocessing function cleans and standardizes text data for further processing. It first converts all text to lowercase to ensure uniformity. Then, it removes unwanted elements such as HTML tags, mentions (words starting with @), numbers, and URLs. The function also ensures text readability by replacing consecutive spaces, commas, and periods with single instances of each. Finally, it trims any leading or trailing whitespace, ensuring a clean and structured output.

In [None]:
def dataPreprocessing(x):
    """
    Normalize a piece of essay text before feature extraction.

    Steps, in order: lowercase, strip HTML tags, drop @mentions, drop digits
    (including an apostrophe directly in front of them), drop ``http...``
    tokens, collapse whitespace runs to one space, collapse repeated periods
    and commas, and trim leading/trailing whitespace.

    Args:
    - x (str): The raw text.

    Returns:
    - str: The cleaned text.
    """
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    # (patterns are raw strings so "\w" / "\d" are not invalid string escapes)
    x = re.sub(r"@\w+", '',x)
    # Delete Numbers
    x = re.sub(r"'\d+", '',x)
    x = re.sub(r"\d+", '',x)
    # Delete URL
    # NOTE(review): this only removes "http" plus the word characters that
    # directly follow it, so "://host/path" is left behind. The pattern is kept
    # unchanged so the produced features stay the same as before.
    x = re.sub(r"http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

# Feature Engineering
The feature engineering process involves extracting meaningful insights from essays by analyzing different textual aspects at the paragraph, sentence, and word levels. Below is a detailed breakdown of what has been done.

All extracted features (paragraph, sentence, and word-level) are merged into a single dataset for training, ensuring a comprehensive feature set for model training.

The final dataset includes a variety of linguistic and structural features, enhancing model performance in essay scoring tasks.

## Spelling and Grammar Features
The count_spelling_errors function uses lemmatization to check for spelling mistakes by comparing words against an English vocabulary set.
The grammar function calculates key grammatical features for each essay: the number of adjectives, the number of adverbs, and the number of grammatical mistakes. It uses NLTK for part-of-speech tagging and language_tool_python for grammar checking.

In [None]:
def count_spelling_errors(text):
    """
    Count tokens whose lowercased lemma is missing from the English vocabulary.

    Args:
    - text (str): The input text.

    Returns:
    - int: Number of tokens in the text whose lowercased lemma is not found
      in ``english_vocab``.
    """
    # Each unknown lemma contributes True (== 1) to the total
    return sum(
        token.lemma_.lower() not in english_vocab
        for token in nlp(text)
    )

def removeHTML(x):
    """
    Strip HTML-like tags from a string.

    Anything from a ``<`` up to the nearest following ``>`` on the same line
    is deleted; all other characters are left as they are.

    Args:
    - x (str): Text that may contain tags.

    Returns:
    - str: The text without the matched tags.
    """
    return re.sub(r'<.*?>', r'', x)

def remove_punctuation(text):
    """
    Delete every character listed in ``string.punctuation`` from the text.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with those punctuation characters removed.
    """
    # Mapping a character to None in a translation table deletes it
    drop_table = str.maketrans({mark: None for mark in string.punctuation})
    return text.translate(drop_table)

def grammar(text):
    """
    Extract grammatical features from each essay:
    - Number of adjectives (POS tags containing 'JJ')
    - Number of adverbs (POS tags containing 'RB')
    - Number of matches reported by the LanguageTool checker

    Args:
    - text (pl.DataFrame): A polars DataFrame containing an essay column named 'full_text'.

    Returns:
    - pl.DataFrame: The input frame with the additional columns
      'no_adjectives', 'no_adverbs' and 'no_mistakes'.
    """
    adj_list = []  # List to store the count of adjectives per essay
    adv_list = []  # List to store the count of adverbs per essay
    mist_list = []  # List to store the number of grammar mistakes per essay

    for essay in text['full_text']:
        adj = 0  # Counter for adjectives
        adv = 0  # Counter for adverbs

        # Tokenize the essay into sentences
        for sent in sent_tokenize(essay):
            # Tag each sentence once and reuse the tags for both counts
            # (the tagger call is the expensive part of this loop)
            tags = [pos for _, pos in nltk.pos_tag(word_tokenize(sent))]

            adj += sum(1 for pos in tags if 'JJ' in pos)
            adv += sum(1 for pos in tags if 'RB' in pos)

        adj_list.append(adj)  # Append adjective count for the essay
        adv_list.append(adv)  # Append adverb count for the essay
        mist_list.append(len(tool.check(essay)))  # Count the number of grammatical mistakes

    # Add extracted grammatical features as new columns to the DataFrame
    return text.with_columns([
        pl.Series(name='no_adjectives', values=adj_list),
        pl.Series(name='no_adverbs', values=adv_list),
        pl.Series(name='no_mistakes', values=mist_list)
    ])



## Paragraph Features

The Paragraph_Preprocess function processes essay paragraphs by removing HTML, punctuation, and counting spelling errors.
It calculates various paragraph-level statistics, including paragraph length, number of sentences, and word count per paragraph.
The Paragraph_Eng function aggregates paragraph-based features, computing counts of paragraphs exceeding or below specific lengths, along with statistical measures (max, min, mean, sum, kurtosis, and quantiles).

In [None]:

def Paragraph_Preprocess(tmp):
    """
    Build one row per paragraph with basic per-paragraph measurements.

    The 'paragraph' list column is exploded, each paragraph is cleaned with
    dataPreprocessing, and the following columns are added:
    'paragraph_no_pinctuation' (cleaned text without punctuation),
    'paragraph_error_num' (count_spelling_errors on that text),
    'paragraph_len' (character count), 'paragraph_sentence_cnt' (pieces after
    splitting on '.') and 'paragraph_word_cnt' (pieces after splitting on ' ').

    Args:
    - tmp (pl.DataFrame): Frame with a list column named 'paragraph'.

    Returns:
    - pl.DataFrame: The exploded frame with the columns listed above.
    """
    paragraph = pl.col('paragraph')
    return (
        # one row per paragraph
        tmp.explode('paragraph')
        # clean the paragraph text in place
        .with_columns(paragraph.map_elements(dataPreprocessing))
        # punctuation-free copy, then spelling errors counted on that copy
        .with_columns(paragraph.map_elements(remove_punctuation).alias('paragraph_no_pinctuation'))
        .with_columns(pl.col('paragraph_no_pinctuation').map_elements(count_spelling_errors).alias("paragraph_error_num"))
        # character length of the cleaned paragraph
        .with_columns(paragraph.map_elements(lambda p: len(p)).alias("paragraph_len"))
        # sentence and word counts from simple '.' / ' ' splits
        .with_columns(
            paragraph.map_elements(lambda p: len(p.split('.'))).alias("paragraph_sentence_cnt"),
            paragraph.map_elements(lambda p: len(p.split(' '))).alias("paragraph_word_cnt"),
        )
    )
# feature_eng
paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
paragraph_fea2 = ['paragraph_error_num'] + paragraph_fea

def Paragraph_Eng(train_tmp):
    """
    Aggregate per-paragraph rows into one feature row per essay.

    For each essay_id this computes how many paragraphs reach each minimum
    length, how many stay within each maximum length, and the max, mean, min,
    sum, first, last, kurtosis, 25% and 75% quantile of every column in
    ``paragraph_fea2``.

    Args:
    - train_tmp (pl.DataFrame): Output of Paragraph_Preprocess.

    Returns:
    - pandas.DataFrame: One row per essay, sorted by 'essay_id'.
    """
    # Character-length thresholds for the paragraph counts
    min_len_thresholds = [0, 50,75,100,125,150,175,200,250,300,350,400,500,600,700]
    max_len_thresholds = [25,49]
    aggs = [
        # Count the paragraphs whose length is >= i, then those whose length is <= i
        *[pl.col('paragraph').filter(pl.col('paragraph_len') >= i).count().alias(f"paragraph_{i}_cnt") for i in min_len_thresholds],
        *[pl.col('paragraph').filter(pl.col('paragraph_len') <= i).count().alias(f"paragraph_{i}_cnt") for i in max_len_thresholds],
        # other
        *[pl.col(fea).max().alias(f"{fea}_max") for fea in paragraph_fea2],
        *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in paragraph_fea2],
        *[pl.col(fea).min().alias(f"{fea}_min") for fea in paragraph_fea2],
        *[pl.col(fea).sum().alias(f"{fea}_sum") for fea in paragraph_fea2],
        *[pl.col(fea).first().alias(f"{fea}_first") for fea in paragraph_fea2],
        *[pl.col(fea).last().alias(f"{fea}_last") for fea in paragraph_fea2],
        *[pl.col(fea).kurtosis().alias(f"{fea}_kurtosis") for fea in paragraph_fea2],
        *[pl.col(fea).quantile(0.25).alias(f"{fea}_q1") for fea in paragraph_fea2],
        *[pl.col(fea).quantile(0.75).alias(f"{fea}_q3") for fea in paragraph_fea2],
        ]

    df = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
    df = df.to_pandas()
    return df

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load cache files from a trusted source.
if ENABLE_DONT_WASTE_YOUR_RUN_TIME:
    with open("/kaggle/input/aes2-cache/paragraph_preprocess_tmp.pickle", "rb") as f:
        tmp = pickle.load(f)
    with open("/kaggle/input/aes2-cache/paragraph_preprocess_train_feats.pickle", "rb") as f:
        train_feats = pickle.load(f)
else:
    tmp = Paragraph_Preprocess(train)
    train_feats = Paragraph_Eng(tmp)

feats_new = grammar(train) #incorporating new grammar features

# Paragraph_Eng returns rows sorted by essay_id, whereas feats_new keeps the
# row order of `train`. Look the values up by essay_id instead of assigning
# them positionally, so each essay receives its own score and grammar counts
# regardless of how `train` is ordered.
grammar_cols = ['score', 'no_adjectives', 'no_adverbs', 'no_mistakes']
grammar_by_id = (
    feats_new.select(['essay_id'] + grammar_cols)
    .to_pandas()
    .set_index('essay_id')
)
for col in grammar_cols:
    train_feats[col] = grammar_by_id.loc[train_feats['essay_id'], col].to_numpy()


# Obtain feature names
feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  56


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,paragraph_sentence_cnt_q1,paragraph_word_cnt_q1,paragraph_error_num_q3,paragraph_len_q3,paragraph_sentence_cnt_q3,paragraph_word_cnt_q3,score,no_adjectives,no_adverbs,no_mistakes
0,000d118,1,1,1,1,1,1,1,1,1,...,14.0,491.0,27.0,2640.0,14.0,491.0,3,33,27,55
1,000fe60,5,5,5,5,5,5,5,4,3,...,4.0,46.0,1.0,398.0,5.0,77.0,3,14,27,24
2,001ab80,4,4,4,4,4,4,4,4,4,...,5.0,101.0,2.0,927.0,8.0,165.0,4,40,56,11


## Sentence Features

The Sentence_Preprocess function processes full text by splitting it into sentences and calculating sentence length and word count per sentence.
The Sentence_Eng function derives aggregated sentence-based features, such as the count of sentences exceeding certain length thresholds, maximum/minimum sentence length, and statistical measures like mean, sum, kurtosis, and quartiles.

In [None]:
# sentence feature
def Sentence_Preprocess(tmp):
    """
    Build one row per sentence with its length and word count.

    'full_text' is cleaned with dataPreprocessing and split on '.'; each
    piece becomes a 'sentence' row. Rows shorter than 15 characters are
    dropped, and 'sentence_len' (characters) and 'sentence_word_cnt' (pieces
    after splitting on ' ') are added.

    Args:
    - tmp (pl.DataFrame): Frame with a 'full_text' column.

    Returns:
    - pl.DataFrame: One row per kept sentence.
    """
    sentence = pl.col('sentence')
    split_sentences = (
        pl.col('full_text').map_elements(dataPreprocessing).str.split(by=".").alias("sentence")
    )
    return (
        tmp.with_columns(split_sentences)
        .explode('sentence')
        # character length of each sentence
        .with_columns(sentence.map_elements(lambda s: len(s)).alias("sentence_len"))
        # keep only sentences with at least 15 characters
        .filter(pl.col('sentence_len')>=15)
        # number of space-separated pieces in each sentence
        .with_columns(sentence.map_elements(lambda s: len(s.split(' '))).alias("sentence_word_cnt"))
    )

# feature_eng
sentence_fea = ['sentence_len','sentence_word_cnt']

def Sentence_Eng(train_tmp):
    """
    Aggregate per-sentence rows into one feature row per essay.

    Produces counts of sentences at or above / at or below several length
    thresholds, followed by the max, mean, min, sum, first, last, kurtosis,
    25% and 75% quantile of every column in ``sentence_fea``.

    Args:
    - train_tmp (pl.DataFrame): Output of Sentence_Preprocess.

    Returns:
    - pandas.DataFrame: One row per essay, sorted by 'essay_id'.
    """
    sentence_len = pl.col('sentence_len')
    aggs = []

    # Threshold counts: sentences with length >= i, then length <= i
    for i in [0,15,50,100,150,200,250,300]:
        aggs.append(pl.col('sentence').filter(sentence_len >= i).count().alias(f"sentence_{i}_cnt"))
    for i in [15,50]:
        aggs.append(pl.col('sentence').filter(sentence_len <= i).count().alias(f"sentence_<{i}_cnt"))

    # Summary statistics, emitted statistic by statistic so the column order
    # is max for every feature, then mean for every feature, and so on
    statistics = [
        ("max", lambda c: c.max()),
        ("mean", lambda c: c.mean()),
        ("min", lambda c: c.min()),
        ("sum", lambda c: c.sum()),
        ("first", lambda c: c.first()),
        ("last", lambda c: c.last()),
        ("kurtosis", lambda c: c.kurtosis()),
        ("q1", lambda c: c.quantile(0.25)),
        ("q3", lambda c: c.quantile(0.75)),
    ]
    for suffix, compute in statistics:
        for fea in sentence_fea:
            aggs.append(compute(pl.col(fea)).alias(f"{fea}_{suffix}"))

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return grouped.sort("essay_id").to_pandas()

# Per-sentence rows for the training essays (reuses the name `tmp`)
tmp = Sentence_Preprocess(train)
# Merge the newly generated feature data with the previously generated feature data
# (left join on essay_id keeps every row already in train_feats)
train_feats = train_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')

# Every column except the identifier and the target counts as a feature
feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  84


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,sentence_len_first,sentence_word_cnt_first,sentence_len_last,sentence_word_cnt_last,sentence_len_kurtosis,sentence_word_cnt_kurtosis,sentence_len_q1,sentence_word_cnt_q1,sentence_len_q3,sentence_word_cnt_q3
0,000d118,1,1,1,1,1,1,1,1,1,...,36,7,47,10,1.514267,2.1117,110.0,21.0,225.0,37.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,62,13,124,25,1.126323,0.642912,53.0,13.0,124.0,25.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,144,27,58,10,-0.423362,0.129704,90.0,17.0,151.0,29.0


## Word Features

The Word_Preprocess function tokenizes essays into individual words, calculates word lengths, and removes invalid entries.
The Word_Eng function generates aggregated word-level features, such as the count of words at or above various length thresholds, the maximum word length, the mean and standard deviation of word length, and quartiles.

In [None]:
# word feature
def Word_Preprocess(tmp):
    """
    Build one row per word with its character length.

    'full_text' is cleaned with dataPreprocessing and split on single spaces;
    each piece becomes a 'word' row with its 'word_len'. Zero-length pieces
    are dropped.

    Args:
    - tmp (pl.DataFrame): Frame with a 'full_text' column.

    Returns:
    - pl.DataFrame: One row per non-empty word.
    """
    split_words = (
        pl.col('full_text').map_elements(dataPreprocessing).str.split(by=" ").alias("word")
    )
    return (
        tmp.with_columns(split_words)
        .explode('word')
        # character length of each word
        .with_columns(pl.col('word').map_elements(lambda w: len(w)).alias("word_len"))
        # discard empty strings produced by the split
        .filter(pl.col('word_len')!=0)
    )

# feature_eng
def Word_Eng(train_tmp):
    """
    Aggregate per-word rows into one feature row per essay.

    Produces, for each essay_id, the number of words whose length is at
    least 1, 2, ..., 15 characters, followed by the max, mean, standard
    deviation and the 25% / 50% / 75% quantiles of word length.

    Args:
    - train_tmp (pl.DataFrame): Output of Word_Preprocess.

    Returns:
    - pandas.DataFrame: One row per essay, sorted by 'essay_id'.
    """
    word_len = pl.col('word_len')

    # word_{k}_cnt = number of words with at least k characters, k = 1..15
    length_counts = [
        pl.col('word').filter(word_len >= min_len).count().alias(f"word_{min_len}_cnt")
        for min_len in range(1, 16)
    ]
    length_summary = [
        word_len.max().alias("word_len_max"),
        word_len.mean().alias("word_len_mean"),
        word_len.std().alias("word_len_std"),
        word_len.quantile(0.25).alias("word_len_q1"),
        word_len.quantile(0.50).alias("word_len_q2"),
        word_len.quantile(0.75).alias("word_len_q3"),
    ]

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(length_counts + length_summary)
    return grouped.sort("essay_id").to_pandas()

# Per-word rows for the training essays (reuses the name `tmp`)
tmp = Word_Preprocess(train)
# Merge the newly generated feature data with the previously generated feature data
# (left join on essay_id keeps every row already in train_feats)
train_feats = train_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

# Every column except the identifier and the target counts as a feature
feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  105


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,word_12_cnt,word_13_cnt,word_14_cnt,word_15_cnt,word_len_max,word_len_mean,word_len_std,word_len_q1,word_len_q2,word_len_q3
0,000d118,1,1,1,1,1,1,1,1,1,...,6,6,5,2,25,4.378819,2.538495,3.0,4.0,5.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,0,0,0,0,11,4.012048,2.060968,2.0,4.0,5.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,14,10,5,2,15,4.574545,2.604621,3.0,4.0,5.0


## Vectorizer

The provided code demonstrates the process of generating features from text data using two different vectorization techniques: TfidfVectorizer and CountVectorizer.

1. The TfidfVectorizer is first initialized with specific parameters, such as n-grams ranging from 3 to 6 and sublinear term frequency scaling. It is applied to the 'full_text' column of the train dataset, transforming the text into a sparse matrix and then into a dense matrix that is converted into a pandas DataFrame.

2. The generated DataFrame is merged with an existing dataset train_feats based on the 'essay_id' column to create a new feature set.

3. The features are renamed, and the resulting DataFrame is updated to include the new features.
4. A similar procedure is followed for CountVectorizer, which is configured with different parameters, such as n-grams ranging from 2 to 3 and adjusted document frequency thresholds.
5. Both vectorization methods generate features for the gradient boosting models.

In summary, this code applies text vectorization techniques to extract features from essay text data, which are then merged into an existing feature set for model training.

In [None]:
# TfidfVectorizer parameter
# NOTE(review): tokenizer and preprocessor are identity functions and the
# vectorizer is fed raw strings, so the "tokens" are presumably single
# characters and the n-grams character n-grams — confirm this is intended.
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(3,6), #(3,6)
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)
# Fit all datasets into TfidfVector,this may cause leakage and overly optimistic CV scores
train_tfid = vectorizer.fit_transform([i for i in train['full_text']])

# Convert to array
dense_matrix = train_tfid.toarray()

# Convert to dataframe
df = pd.DataFrame(dense_matrix)

# rename features
tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns
# The matrix rows follow the row order of `train` (the texts were taken from
# train['full_text']), so the IDs must come from `train` as well. train_feats
# is sorted by essay_id and would only line up if `train` were sorted too.
df['essay_id'] = train['essay_id'].to_list()

# Merge the newly generated feature data with the previously generated feature data
train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))
train_feats.head(3)

Features Number:  19732


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,tfid_19617,tfid_19618,tfid_19619,tfid_19620,tfid_19621,tfid_19622,tfid_19623,tfid_19624,tfid_19625,tfid_19626
0,000d118,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000fe60,5,5,5,5,5,5,5,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,001ab80,4,4,4,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Initialize a CountVectorizer to extract n-gram counts from the text data
# NOTE(review): tokenizer and preprocessor are identity functions and the
# vectorizer is fed raw strings, so the "tokens" are presumably single
# characters and the n-grams character n-grams — confirm this is intended.
vectorizer_cnt = CountVectorizer(
            tokenizer=lambda x: x,  # Identity: the input is passed through untouched
            preprocessor=lambda x: x,  # No preprocessing
            token_pattern=None,  # Not used because a custom tokenizer is supplied
            strip_accents='unicode',  # Requested accent normalization
            analyzer='word',  # Build n-grams from the tokenizer output
            ngram_range=(2,3),  # Use 2-grams and 3-grams
            min_df=0.10,  # Ignore terms that appear in less than 10% of documents
            max_df=0.85,  # Ignore terms that appear in more than 85% of documents
)

# Transform the 'full_text' column of the dataset into a sparse matrix of token counts
train_tfid = vectorizer_cnt.fit_transform([i for i in train['full_text']])

# Convert the sparse matrix to a dense matrix for easier manipulation
dense_matrix = train_tfid.toarray()

# Convert the dense matrix into a DataFrame
df = pd.DataFrame(dense_matrix)

# Give the count columns sequential names (tfid_cnt_0, tfid_cnt_1, etc.)
tfid_columns = [ f'tfid_cnt_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns

# The matrix rows follow the row order of `train` (the texts were taken from
# train['full_text']), so the IDs must come from `train` as well. train_feats
# is sorted by essay_id and would only line up if `train` were sorted too.
df['essay_id'] = train['essay_id'].to_list()

# Merge the new feature DataFrame (with n-gram counts) with the original feature DataFrame
train_feats = train_feats.merge(df, on='essay_id', how='left')

# Filter out 'essay_id' and 'score' columns to focus on the feature names
feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))

# Print the total number of features added
print('Features Number: ', len(feature_names))

# Display the first 3 rows of the updated features DataFrame
train_feats.head(3)


Features Number:  21902


Unnamed: 0,essay_id,paragraph_>0_cnt,paragraph_>50_cnt,paragraph_>75_cnt,paragraph_>100_cnt,paragraph_>125_cnt,paragraph_>150_cnt,paragraph_>175_cnt,paragraph_>200_cnt,paragraph_>250_cnt,...,tfid_cnt_2160,tfid_cnt_2161,tfid_cnt_2162,tfid_cnt_2163,tfid_cnt_2164,tfid_cnt_2165,tfid_cnt_2166,tfid_cnt_2167,tfid_cnt_2168,tfid_cnt_2169
0,000d118,1,1,1,1,1,1,1,1,1,...,3,0,0,0,0,0,0,0,0,0
1,000fe60,5,5,5,5,5,5,5,4,3,...,2,0,0,1,1,0,0,0,0,0
2,001ab80,4,4,4,4,4,4,4,4,4,...,1,0,2,0,0,0,0,0,0,0


## Deberta predictions to LightGBM and XGBoost as features

The code loads out-of-fold (OOF) predictions from a pre-trained DeBERTa model and adds them as features to the train_feats dataset. It iterates over the six prediction columns and appends them as new features, renaming them accordingly. Finally, it prints the number of features and the updated shape of the dataset.

In [None]:
# Load the DeBERTa out-of-fold (OOF) predictions saved alongside the models.
# These predictions will be used as additional features for training.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load files from a trusted source.
deberta_oof = joblib.load('/kaggle/input/aes2-400-20240419134941/oof.pkl')

# Print the shape of the loaded DeBERTa predictions and the existing training feature set
print(deberta_oof.shape, train_feats.shape)

# Add the 6 prediction outputs from DeBERTa as separate features in the training dataset.
# NOTE(review): the columns are assigned by row position, so this assumes the
# OOF rows are in the same order as the rows of train_feats — confirm.
for i in range(6):
    train_feats[f'deberta_oof_{i}'] = deberta_oof[:, i]

# Update the list of feature names, excluding 'essay_id' and 'score'
feature_names = list(filter(lambda x: x not in ['essay_id', 'score'], train_feats.columns))

# Print the total number of features after adding DeBERTa predictions
print('Features Number: ', len(feature_names))

# Show the shape of the updated training dataset to confirm the added features
train_feats.shape


(17307, 6) (17307, 21904)
Features Number:  21908


(17307, 21910)

In [None]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_true, y_pred):
    """Quadratic weighted kappa (QWK) evaluation metric for LightGBM and XGBoost.

    The models are trained on targets shifted by ``-a`` (module-level
    constant), so the shift is undone here before scoring. Predictions are
    clipped to the 1-6 score range and rounded to the nearest integer.

    Two calling conventions are supported:

    * LightGBM: ``(y_true, y_pred)`` are both arrays. Returns
      ``('QWK', value, True)``; the last element tells LightGBM that a
      higher value is better.
    * XGBoost: the first argument holds the predictions and the second is a
      matrix object exposing ``get_label()``. Returns ``('QWK', value)``.

    :param y_true: shifted labels (LightGBM) or shifted predictions (XGBoost)
    :param y_pred: shifted predictions (LightGBM) or an XGBoost data matrix
    :return: metric tuple in the format expected by the calling library
    """
    # Duck-type on `get_label` instead of checking for one concrete xgboost
    # class: this accepts every DMatrix flavour and keeps the LightGBM path
    # usable even if the xgboost module has not been imported yet.
    called_by_xgb = hasattr(y_pred, 'get_label')

    if called_by_xgb:
        # XGBoost hands over (predictions, matrix): swap into (labels, preds)
        labels, preds = y_pred.get_label(), y_true
    else:
        labels, preds = y_true, y_pred

    # Undo the training shift; round labels in both paths so that
    # cohen_kappa_score always receives integer-valued class labels.
    labels = (labels + a).round()
    preds = (preds + a).clip(1, 6).round()
    qwk = cohen_kappa_score(labels, preds, weights="quadratic")

    if called_by_xgb:
        return 'QWK', qwk
    return 'QWK', qwk, True

def qwk_obj(y_true, y_pred):
    """Custom training objective that approximates quadratic weighted kappa.

    Labels and predictions arrive shifted by ``-a``; both are shifted back,
    and the predictions are clipped to the 1-6 range. The loss is the ratio
    ``f / g`` of a squared-error term ``f`` to a normalising term ``g`` built
    from the module-level constants ``a`` and ``b``.

    :param y_true: shifted target values
    :param y_pred: shifted raw predictions
    :return: ``(grad, hess)`` - per-sample gradient and a constant Hessian of ones
    """
    target = y_true + a
    clipped = (y_pred + a).clip(1, 6)
    n_samples = len(target)

    residual = clipped - target      # derivative of f w.r.t. each prediction
    offset = clipped - a             # derivative of g w.r.t. each prediction

    f = 1/2*np.sum(residual**2)
    g = 1/2*np.sum(offset**2+b)

    # Quotient rule for d(f/g), scaled by the number of samples
    grad = (residual/g - f*offset/g**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess


# Constants used by the QWK objective and metric:
# `a` is the shift applied to the targets, `b` is the additive term inside g.
a = 2.998
b = 1.092

In [None]:
# Feature matrix: every column listed in feature_names, cast to float32
X = train_feats[feature_names].astype(np.float32).to_numpy()

# Integer scores, used only to stratify the cross-validation folds
y_split = train_feats['score'].astype(int).to_numpy()
# Regression target: float32 scores shifted down by the constant `a`
y = train_feats['score'].astype(np.float32).to_numpy() - a

# Feature Selection

In [None]:
def feature_select_wrapper(n_select=5000):
    """Rank features by LightGBM importance across stratified folds.

    For each of 10 stratified folds (split on ``y_split``) the features are
    standardised, an ``LGBMRegressor`` with the custom QWK objective is
    fitted, and its ``feature_importances_`` are added to a running total.
    The features with the largest accumulated importance are returned.

    Reads the notebook globals ``X``, ``y``, ``y_split``, ``feature_names``,
    ``callbacks``, ``a`` and ``ENABLE_DONT_WASTE_YOUR_RUN_TIME``.
    Side effects: appends to the global lists ``models``, ``predictions``,
    ``f1_scores`` and ``kappa_scores`` once per fold, and prints fold scores.

    :param n_select: maximum number of top-ranked features to keep
        (default 5000, the value previously hard-coded)
    :return: list of feature names, most important first
    """
    print('feature_select_wrapper...')
    features = feature_names

    skf = StratifiedKFold(n_splits=10 , shuffle=True, random_state=0) #n_splits=5
    # Accumulated importance per feature, summed over folds
    fse = pd.Series(0, index=features)
    scaler = StandardScaler()

    for train_index, test_index in skf.split(X, y_split):

        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold, y_test_fold_int = y[train_index], y[test_index], y_split[test_index]

        # Normalize the features (scaler is re-fitted on each training fold)
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_test_fold = scaler.transform(X_test_fold)

        model = lgb.LGBMRegressor(
                    objective = qwk_obj,
                    metrics = 'None',
                    learning_rate = 0.05, #0.05
                    max_depth = 5,
                    num_leaves = 10, #10
                    colsample_bytree=0.3,
                    reg_alpha = 0.7,
                    reg_lambda = 0.1,
                    n_estimators=700,
                    random_state=412,
                    extra_trees=True,
                    class_weight='balanced',
                    verbosity = - 1)

        predictor = model.fit(X_train_fold,
                              y_train_fold,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train_fold, y_train_fold), (X_test_fold, y_test_fold)],
                              eval_metric=quadratic_weighted_kappa,
                              callbacks=callbacks)
        models.append(predictor)

        # Shift predictions back to the 1-6 scale before scoring
        predictions_fold = predictor.predict(X_test_fold)
        predictions_fold = predictions_fold + a
        predictions_fold = predictions_fold.clip(1, 6).round()
        predictions.append(predictions_fold)
        f1_fold = f1_score(y_test_fold_int, predictions_fold, average='weighted')
        f1_scores.append(f1_fold)

        kappa_fold = cohen_kappa_score(y_test_fold_int, predictions_fold, weights='quadratic')
        kappa_scores.append(kappa_fold)

        print(f'F1 score across fold: {f1_fold}')
        print(f'Cohen kappa score across fold: {kappa_fold}')

        fse += pd.Series(predictor.feature_importances_, features)
        # Fast mode: a single fold is enough
        if ENABLE_DONT_WASTE_YOUR_RUN_TIME:
            break

    # Keep the n_select features with the highest accumulated importance
    feature_select = fse.sort_values(ascending=False).index.tolist()[:n_select]
    print('done')
    return feature_select

In [None]:
# Accumulators and LightGBM callbacks that feature_select_wrapper() uses
f1_scores, kappa_scores = [], []
models, predictions = [], []
callbacks = [
    log_evaluation(period=25),
    early_stopping(stopping_rounds=75, first_metric_only=True),
]

if not ENABLE_DONT_WASTE_YOUR_RUN_TIME:
    # Full run: compute the feature ranking from scratch
    feature_select = feature_select_wrapper()
else:
    # Fast mode: reuse a previously saved feature list
    with open("/kaggle/input/aes2-cache/feature_select.pickle", "rb") as f:
        feature_select = pickle.load(f)

In [None]:
# Rebuild X from the selected feature columns only.
try:
    X = train_feats[feature_select].astype(np.float32).values
except KeyError:
    # Fallback when the column lookup above fails (presumably because the
    # cached feature list names columns that were not computed in this run):
    # load the pre-computed arrays instead.
    # NOTE(review): pickle.load executes arbitrary code from the file - only
    # use with datasets you trust.
    with open("/kaggle/input/aes2-preprocessing/X.pickle", "rb") as f:
        X = pickle.load(f)
    with open("/kaggle/input/aes2-preprocessing/y.pickle", "rb") as f:
        y = pickle.load(f)
    with open("/kaggle/input/aes2-preprocessing/y_split.pickle", "rb") as f:
        y_split = pickle.load(f)
# Number of entries in the selected-feature list
print('Features Select Number: ', len(feature_select))

Features Select Number:  13000


# Model training

In [None]:
# NOTE: despite its name, LOAD = True selects the training path in this cell;
# the `if not LOAD:` branch is the one that loads saved boosters from disk.
LOAD = True # re-train
# Define the number of splits for cross-validation
# n_splits = 15
n_splits = 20
# Holds one model per fold (fitted ensemble or loaded booster)
models = []


import xgboost as xgb

class Predictor:
    """Weighted blend of several fitted models.

    The first model receives ``first_weight``; the remaining weight
    ``1 - first_weight`` is shared equally among all other models, so the
    weights always sum to 1. With exactly two models this is the original
    0.749 / 0.251 blend.

    :param models: fitted objects exposing ``predict(X)``, in blend order
    :param first_weight: weight given to ``models[0]`` (default 0.749)
    """

    def __init__(self, models: list, first_weight: float = 0.749):
        self.models = models
        self.first_weight = first_weight

    def predict(self, X):
        """Return the weighted average of the member models' predictions.

        :param X: feature matrix passed unchanged to every model
        :return: blended predictions
        :raises ValueError: if the predictor holds no models
        """
        if not self.models:
            raise ValueError("Predictor needs at least one model")

        first, *others = self.models
        if not others:
            # A one-member blend is just that member's prediction
            return first.predict(X)

        # Split the remaining weight evenly so the total stays at 1
        other_weight = (1 - self.first_weight) / len(others)
        predicted = self.first_weight * first.predict(X)
        for model in others:
            predicted += other_weight * model.predict(X)
        return predicted


if not LOAD:
    # Inference-only path: read one saved LightGBM booster per fold.
    # NOTE(review): the dataset path says "15fold" while n_splits is 20 -
    # confirm that fold_16..fold_20 exist before enabling this branch.
    for i in range(n_splits):
        models.append(lgb.Booster(model_file=f'/kaggle/input/aes-15fold/fold_{i+1}.txt'))
else:
    # Initialize StratifiedKFold with the specified number of splits
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    # Lists to store scores
    f1_scores = []
    kappa_scores = []
    models = []
    predictions = []
    callbacks = [log_evaluation(period=25), early_stopping(stopping_rounds=75,first_metric_only=True)]
    # Loop through each fold of the cross-validation (fold numbers start at 1)
    for fold_no, (train_index, test_index) in enumerate(skf.split(X, y_split), start=1):
        # Split the data into training and testing sets for this fold
        print('fold',fold_no)
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold, y_test_fold_int = y[train_index], y[test_index], y_split[test_index]

        light = lgb.LGBMRegressor(
                    objective = qwk_obj,
                    metrics = 'None',
                    learning_rate = 0.05, #0.05
                    max_depth = 8,#10
                    num_leaves = 10,#15
                    colsample_bytree=0.3,
                    reg_alpha = 0.7,#1
                    reg_lambda = 0.1,
                    n_estimators=700,
                    random_state=42,
                    extra_trees=True,
                    class_weight='balanced',
                    device='gpu' if CUDA_AVAILABLE else 'cpu',
                    verbosity = - 1
        )

        # Fit the model on the training data for this fold
        light.fit(X_train_fold,
                              y_train_fold,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train_fold, y_train_fold), (X_test_fold, y_test_fold)],
                              eval_metric=quadratic_weighted_kappa,
                              callbacks=callbacks
                             )

        # Only parameters XGBoost understands are passed here. The
        # LightGBM-only ones (metrics, num_leaves, extra_trees, class_weight)
        # were ignored by XGBoost and only produced a
        # "Parameters ... are not used" warning.
        xgb_regressor = xgb.XGBRegressor(
            objective = qwk_obj,
            learning_rate = 0.1, #0.1
            max_depth = 8,#10
            colsample_bytree=0.5,
            reg_alpha = 0.1, #1
            reg_lambda = 0.8, #0.1
            n_estimators=1024,
            random_state=42,
            tree_method="hist",
            device="gpu" if CUDA_AVAILABLE else "cpu"
        )

        # Fresh callback objects per fold: EarlyStopping keeps internal state
        xgb_callbacks = [
            xgb.callback.EvaluationMonitor(period=25),
            xgb.callback.EarlyStopping(75, metric_name="QWK", maximize=True, save_best=True)
        ]
        xgb_regressor.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_train_fold, y_train_fold), (X_test_fold, y_test_fold)],
            eval_metric=quadratic_weighted_kappa,
            callbacks=xgb_callbacks,
            # The EvaluationMonitor above already logs every 25 rounds; the
            # default verbose=True would additionally print every round.
            verbose=False
        )

        # Blend the two fitted models for this fold
        predictor = Predictor([light, xgb_regressor])
        models.append(predictor)
        # Make predictions on the test data for this fold
        predictions_fold = predictor.predict(X_test_fold)
        predictions_fold = predictions_fold + a
        predictions_fold = predictions_fold.clip(1, 6).round()
        predictions.append(predictions_fold)
        # Calculate and store the F1 score for this fold
        f1_fold = f1_score(y_test_fold_int, predictions_fold, average='weighted')
        f1_scores.append(f1_fold)

        # Calculate and store the Cohen's kappa score for this fold
        kappa_fold = cohen_kappa_score(y_test_fold_int, predictions_fold, weights='quadratic')
        kappa_scores.append(kappa_fold)

        print(f'F1 score across fold: {f1_fold}')
        print(f'Cohen kappa score across fold: {kappa_fold}')
        gc.collect()
        # Fast mode: stop after the first fold
        if ENABLE_DONT_WASTE_YOUR_RUN_TIME:
            break

fold 1




[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.779087	valid's QWK: 0.764434
[50]	train's QWK: 0.821432	valid's QWK: 0.814442
[75]	train's QWK: 0.831539	valid's QWK: 0.826173
[100]	train's QWK: 0.837542	valid's QWK: 0.827916
[125]	train's QWK: 0.841732	valid's QWK: 0.827514
[150]	train's QWK: 0.844854	valid's QWK: 0.829682
[175]	train's QWK: 0.847213	valid's QWK: 0.828685
[200]	train's QWK: 0.849417	valid's QWK: 0.829446
[225]	train's QWK: 0.851336	valid's QWK: 0.833724
[250]	train's QWK: 0.853347	valid's QWK: 0.837106
[275]	train's QWK: 0.85537	valid's QWK: 0.835591
[300]	train's QWK: 0.857125	valid's QWK: 0.83686
[325]	train's QWK: 0.858885	valid's QWK: 0.835312
[350]	train's QWK: 0.861452	valid's QWK: 0.834555
[375]	train's QWK: 0.863279	valid's QWK: 0.836257
Early stopping, best iteration is:
[312]	train's QWK: 0.85794	valid's QWK: 0.838587
Evaluated only: QWK


Parameters: { "class_weight", "extra_trees", "metrics", "num_leaves" } are not used.



[0]	validation_0-rmse:1.07496	validation_0-QWK:0.46441	validation_1-rmse:1.08116	validation_1-QWK:0.42674
[0]	validation_0-rmse:1.07496	validation_0-QWK:0.46441	validation_1-rmse:1.08116	validation_1-QWK:0.42674
[1]	validation_0-rmse:0.97176	validation_0-QWK:0.47091	validation_1-rmse:0.98227	validation_1-QWK:0.44659
[2]	validation_0-rmse:0.87759	validation_0-QWK:0.49067	validation_1-rmse:0.89347	validation_1-QWK:0.45515
[3]	validation_0-rmse:0.79483	validation_0-QWK:0.56661	validation_1-rmse:0.82042	validation_1-QWK:0.55313
[4]	validation_0-rmse:0.72537	validation_0-QWK:0.64134	validation_1-rmse:0.75959	validation_1-QWK:0.61686
[5]	validation_0-rmse:0.66764	validation_0-QWK:0.69069	validation_1-rmse:0.71077	validation_1-QWK:0.65792
[6]	validation_0-rmse:0.62244	validation_0-QWK:0.72506	validation_1-rmse:0.67737	validation_1-QWK:0.68282
[7]	validation_0-rmse:0.58356	validation_0-QWK:0.76640	validation_1-rmse:0.64918	validation_1-QWK:0.72540
[8]	validation_0-rmse:0.55125	validation_0-QWK

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1 score across fold: 0.6723215008036963
Cohen kappa score across fold: 0.8341301935954841


In [None]:
if not LOAD:
    # Scores recorded from an earlier training run of the loaded models
    print(f'Mean F1 score across {n_splits} folds: 0.6694070084827064')
    print(f'Mean Cohen kappa score across {n_splits} folds: 0.835342584985933')
else:
    # Report the number of folds that actually ran: the training loop may
    # stop early, in which case fewer than n_splits scores were collected.
    n_folds_run = len(f1_scores)
    # Calculate the mean scores across all completed folds
    mean_f1_score = np.mean(f1_scores)
    mean_kappa_score = np.mean(kappa_scores)
    # Print the mean scores
    print(f'Mean F1 score across {n_folds_run} folds: {mean_f1_score}')
    print(f'Mean Cohen kappa score across {n_folds_run} folds: {mean_kappa_score}')

Mean F1 score across 20 folds: 0.6723215008036963
Mean Cohen kappa score across 20 folds: 0.8341301935954841


# Inference

In [None]:
if ENABLE_DONT_WASTE_YOUR_RUN_TIME:
    # Fast mode: no test features are computed, so submit the sample file as is
    import shutil

    shutil.copyfile("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv", "submission.csv")
else:
    # Rebuild the same feature families that were built for training

    # Paragraph
    tmp = Paragraph_Preprocess(test)
    test_feats = Paragraph_Eng(tmp)

    #adding new grammar features
    feats_new = grammar(test)
    test_feats['no_adjectives'] = feats_new['no_adjectives']
    test_feats['no_adverbs'] = feats_new['no_adverbs']
    test_feats['no_mistakes'] = feats_new['no_mistakes']


    # Sentence
    tmp = Sentence_Preprocess(test)
    test_feats = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
    # Word
    tmp = Word_Preprocess(test)
    test_feats = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

    # TfidfVectorizer (already fitted; only transform here)
    test_tfid = vectorizer.transform([i for i in test['full_text']])
    dense_matrix = test_tfid.toarray()
    df = pd.DataFrame(dense_matrix)
    tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
    df.columns = tfid_columns
    df['essay_id'] = test_feats['essay_id']
    test_feats = test_feats.merge(df, on='essay_id', how='left')

    # CountVectorizer (already fitted; only transform here)
    test_tfid = vectorizer_cnt.transform([i for i in test['full_text']])
    dense_matrix = test_tfid.toarray()
    df = pd.DataFrame(dense_matrix)
    tfid_columns = [ f'tfid_cnt_{i}' for i in range(len(df.columns))]
    df.columns = tfid_columns
    df['essay_id'] = test_feats['essay_id']
    test_feats = test_feats.merge(df, on='essay_id', how='left')

    # DeBERTa predictions for the test essays, one column per output
    for i in range(6):
        test_feats[f'deberta_oof_{i}'] = predicted_score[:, i]

    # Features number
    feature_names = list(filter(lambda x: x not in ['essay_id','score'], test_feats.columns))
    print('Features number: ',len(feature_names))
    # A bare expression inside a block is not rendered, so display explicitly
    display(test_feats.head(3))

    # Submission

    # Build the test matrix once, exactly as the training matrix was built
    # (selected columns -> float32 NumPy array), so every model sees the same
    # input type and precision it was fitted on.
    X_test = test_feats[feature_select].astype(np.float32).values

    # Raw per-fold score predictions, shifted back by `a` (the name
    # `probabilities` is historical - these are regression outputs)
    probabilities = []
    for model in models:
        proba = model.predict(X_test) + a
        probabilities.append(proba)

    # Average across all fold models, then clip to 1-6 and round to a score
    predictions = np.mean(probabilities, axis=0)
    predictions = np.round(predictions.clip(1, 6))

    # Print the predictions
    print(predictions)

    # NOTE(review): scores are assigned by position, which assumes the sample
    # submission lists essays in the same order as `test` - confirm.
    submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
    submission['score'] = predictions
    submission['score'] = submission['score'].astype(int)
    submission.to_csv("submission.csv", index=None)
    display(submission.head())