In [1]:
import pandas as pd
import re
import string
import numpy as np
from nltk.corpus import stopwords
import spacy
import tensorflow as tf
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Make sure the NLTK 'stopwords' corpus is available locally before
# `stopwords.words('english')` is called later in the notebook. The call only
# checks for updates when the package is already installed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Directory holding the competition CSVs. Kept in one place so both reads stay
# in sync if the dataset is mounted somewhere else.
DATA_DIR = '/kaggle/input/commonlit-evaluate-student-summaries'

# Load the summaries table and the prompts table; they share a `prompt_id` key.
summaries_df = pd.read_csv(f'{DATA_DIR}/summaries_train.csv')
prompts_df = pd.read_csv(f'{DATA_DIR}/prompts_train.csv')

In [4]:
# Attach the prompt columns to every summary. The left join keeps all summary
# rows; `validate` makes pandas raise MergeError if `prompt_id` is ever
# duplicated in `prompts_df`, which would otherwise silently duplicate rows.
df = pd.merge(
    summaries_df,
    prompts_df,
    on='prompt_id',
    how='left',
    validate='many_to_one',
)

In [5]:
# Patterns are compiled once and written as raw strings so the backslash
# escapes reach the regex engine untouched (non-raw '\[' / '\S' / '\w' are
# invalid string escapes and emit a warning on recent Python versions).
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NEWLINE_RE = re.compile(r'\n')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')


def data_cleaning(text):
    """Normalize a piece of raw text for downstream tokenization.

    Steps, in order: lowercase, drop square-bracketed spans, drop URLs, drop
    HTML-like tags, drop ASCII punctuation, turn newlines into spaces, and
    drop every word that contains a digit.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so
            non-string values (e.g. ``None`` or NaN) are cleaned as their
            string representation.

    Returns:
        The cleaned, lowercase string. Removed spans leave their surrounding
        spaces in place, so the result may contain runs of spaces.
    """
    text = str(text).lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    # A newline separates words just like a space does; deleting it outright
    # would glue the last word of one line to the first word of the next.
    text = _NEWLINE_RE.sub(' ', text)
    text = _DIGIT_WORD_RE.sub('', text)
    return text

# Clean the raw student summaries. The column is selected by name rather than
# by position so the cell keeps working if columns are added or reordered.
df['text-cleaned'] = df['text'].apply(data_cleaning)

In [6]:
stop_words = stopwords.words('english')
# Hashed snapshot used for membership tests; checking `in` against the list
# above would scan it for every word. Rebuild it if `stop_words` is edited.
_stop_word_set = frozenset(stop_words)


def stopwords_removal(text):
    """Remove English stop words from a space-delimited string.

    The text is split on single spaces (not on arbitrary whitespace), so runs
    of spaces produce empty tokens; those are kept and re-joined, which
    preserves the input's spacing.

    Args:
        text: String to filter.

    Returns:
        The tokens that are not stop words, joined by single spaces.
    """
    return ' '.join(word for word in text.split(' ') if word not in _stop_word_set)

# Strip stop words from the cleaned summaries. The column is selected by name
# rather than by position so the cell does not depend on column order.
df['text-cleaned_punc'] = df['text-cleaned'].apply(stopwords_removal)

In [7]:
# spaCy pipeline that supplies the per-token lemmas used below.
nlp = spacy.load('en_core_web_sm')


def lemmatization(text):
    """Return `text` with each spaCy token replaced by its lemma.

    Args:
        text: String to run through the `nlp` pipeline.

    Returns:
        The lemmas of all tokens, in order, joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Lemmatize the stop-word-free summaries. The column is selected by name
# rather than by position so the cell does not depend on column order.
df['text_clean_punc_lemmatized'] = df['text-cleaned_punc'].apply(lemmatization)

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')


def bert_encode(data, maximum_length):
    """Tokenize every text in `data` into fixed-length model inputs.

    Uses the module-level `tokenizer` as bound at call time.

    Args:
        data: Iterable of strings (e.g. a list or a pandas Series). Do not
            pass a DataFrame: iterating one yields its column labels.
        maximum_length: Target sequence length. Longer inputs are truncated
            and shorter ones are padded to exactly this length.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per input text.
    """
    input_ids = []
    attention_masks = []

    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=maximum_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

# Pass the lemmatized summaries as a Series of strings. A DataFrame slice such
# as `df.iloc[:10]` must not be used here: iterating a DataFrame yields its
# column labels, so the labels would be tokenized instead of the texts.
texts = df['text_clean_punc_lemmatized']
train_input_ids, train_attention_masks = bert_encode(texts, 60)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



In [9]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint used by `encoded_text` below for both the tokenizer and the model.
model_name = 'bert-base-uncased'
# NOTE: this rebinds the global `tokenizer` that `bert_encode` reads, replacing
# the 'bert-large-uncased' BertTokenizer created earlier in the notebook. Any
# later call to `bert_encode` will use this tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def encoded_text(text):
    """Embed `text` with the global `tokenizer` and `model`.

    Args:
        text: Input accepted by `tokenizer` (a string in this notebook).
            Inputs longer than 512 tokens are truncated.

    Returns:
        NumPy array holding position 0 along the sequence axis of the model's
        first output, one row per input (for BERT this is presumably the
        [CLS] token's last hidden state -- confirm against the model docs).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is recorded
    # for the forward pass of each row.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs[0][:, 0, :].detach().numpy()

# Embed every lemmatized summary. The column is selected by name rather than
# by position so the cell does not depend on column order.
df['encoded'] = df['text_clean_punc_lemmatized'].apply(encoded_text)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Stack the per-row embedding arrays vertically into one feature matrix.
X = np.vstack(df['encoded'].to_list())

In [11]:
# Show the frame with every derived column; pandas elides the middle rows of
# a long frame, so only the first and last few rows are rendered.
df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,text-cleaned,text-cleaned_punc,text_clean_punc_lemmatized,encoded
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave was an experimentto see how peo...,third wave experimentto see people reacted new...,third wave experimentto see people react new o...,"[[-0.26248914, 0.0884993, 0.21228164, 0.125994..."
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",they would rub it up with soda to make the sme...,would rub soda make smell go away wouldnt bad ...,would rub soda make smell go away would not ba...,"[[-0.21544293, -0.004725341, -0.011959298, 0.0..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,in egypt there were many occupations and socia...,egypt many occupations social classes involved...,egypt many occupation social class involve day...,"[[-0.121495865, 0.21592598, 0.23811074, 0.1806..."
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,the highest class was pharaohs these people we...,highest class pharaohs people godsthen highes...,high class pharaohs people godsthen high cla...,"[[-0.44376627, 0.26307082, -0.51151323, -0.060..."
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave developed rapidly because the ...,third wave developed rapidly students genuinl...,third wave develop rapidly student genuinly ...,"[[-0.16627862, -0.17343572, 0.5428854, -0.0438..."
...,...,...,...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",they used all sorts of chemical concoctions to...,used sorts chemical concoctions make meat seem...,use sort chemical concoction make meat seem fi...,"[[-0.45913303, -0.07596844, -0.04327337, -0.10..."
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,the lowest classes are slaves and farmers slav...,lowest classes slaves farmers slaves people ta...,low class slave farmer slave people take war f...,"[[-0.5296791, 0.33453462, -0.2653051, 0.131163..."
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,they sorta made people start workin...,sorta made people start working str...,sorta make people start work struc...,"[[-0.108444646, 0.41298097, 0.124201536, 0.003..."
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,an ideal tragety has three elements that make ...,ideal tragety three elements make ideal start ...,ideal tragety three element make ideal start g...,"[[-0.67162293, -0.32171193, 0.399227, 0.113353..."
