In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [2]:
import os
import pandas as pd
train_file = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_file = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
df_train = pd.read_csv(train_file)
df_train.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [3]:
import nltk

## max_no_words_per_chunk = 416 
## spliting the full text in chunks of sentences such that no sentence have more than 416 words 
## then each chnuk can be passed to LLM for feature extracion

def split_into_chunks(text, max_words_per_chunk=416):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Initialize variables
    chunks = []
    current_word_count = 0
    
    # Iterate through each sentence
    big_sentence = ''
    for idx, sentence in enumerate(sentences):
        # Calculate the word count of the current sentence
        word_count = len(sentence.split())
        
        # Check if adding the current sentence to the chunk exceeds the max word limit
        if current_word_count + word_count <= max_words_per_chunk:
            big_sentence = big_sentence + sentence
            current_word_count += word_count
        else:
            # Add the big sentence to the list of chunks
            chunks.append([big_sentence])
            # Start a new big sentence with the current sentence
            big_sentence = sentence
            current_word_count = word_count
            
        if idx == len(sentences)-1:
            chunks.append([big_sentence])
   
    
    return chunks


df_train['chunk_text'] = df_train['full_text'].apply(split_into_chunks)
## drop the full text column to save memory as it is of no use 
df_train.drop(columns=['full_text'], inplace=True)
print(df_train.head())

  essay_id  score                                         chunk_text
0  000d118      3  [[Many people have car where they live.The thi...
1  000fe60      3  [[I am a scientist at NASA that is discussing ...
2  001ab80      4  [[People always wish they had the same technol...
3  001bdc0      4  [[We all heard about Venus, the planet without...
4  002ba53      3  [[Dear, State Senator\n\nThis is a letter to a...


In [4]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModel

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to(device)

cpu


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [6]:
def extract_features(text_lst, score):
    all_max_features = None  # Initialize an empty array to store max features
    
    for idx, sub_text in enumerate(text_lst):
        inputs = tokenizer(sub_text, truncation=True, return_tensors="pt")
        inputs = {k:v.to(device) for k,v in inputs.items()}
        ## last hidden layer
        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state.cpu().numpy()
            #last_hidden_state = last_hidden_state.cpu().numpy()
            #print(last_hidden_state.shape)  
        ## CLS Token feature
        if idx==0:
            cls_feature = last_hidden_state[:, 0]
            last_hidden_state = last_hidden_state[:, 1:, :]
            #print(cls_feature.shape)
        ## max feature
        max_feature = np.amax(np.squeeze(last_hidden_state, axis=0), axis=0, keepdims=True)
        if all_max_features is None:
            all_max_features = max_feature
        else:
            all_max_features = np.vstack((all_max_features, max_feature))

    max_max_feature = np.amax(all_max_features, axis=0, keepdims=True)
    ## concatenate CLS token and Max feature
    final_feature = np.hstack((cls_feature, max_max_feature))
    #final_feature = np.squeeze(final_feature)
    # print(max_max_feature.shape)
    #print(final_feature.shape)
    return {'feature': final_feature, 'score': score}

In [7]:
df_train_sub = df_train.sample(500)
dataset = df_train_sub.apply(lambda row: extract_features(row['chunk_text'], row['score']), axis=1)

# Concatenate all features and scores into numpy arrays
features = np.vstack([data['feature'] for data in dataset])
scores = np.array([data['score'] for data in dataset])
# Save to output_folder
output_folder = '/kaggle/working'
feature_name = 'feature_1536.npy'
lbl_name = 'score.npy'
feature_path = os.path.join(output_folder, feature_name)
lbl_path = os.path.join(output_folder, lbl_name)
np.save(feature_path, features)
np.save(lbl_path, scores)
