In [1]:
import os
import pandas as pd
import argparse
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import BertTokenizer
from transformers import BertPreTrainedModel
from transformers import BertConfig, BertModel, AdamW
from transformers import AutoTokenizer

from transformers import get_linear_schedule_with_warmup
import torch.nn as nn
from torch.nn import functional as F
import seaborn as sns
import random 
import datetime

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))



/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv


In [2]:
# Competition data directory; reused later when the test CSV is read.
dirname = '/kaggle/input/feedback-prize-english-language-learning'
train_csv_path = dirname + '/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# Remove the identifier column. errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises KeyError once 'text_id' is already gone.
train = train.drop(columns=['text_id'], errors='ignore')

In [4]:
# Seed every random number generator the notebook relies on so that the
# train/validation partition (and anything else drawing from these RNGs)
# is the same on each Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Hold out 10% of the rows for validation; random_state pins the partition.
train_df, valid_df = train_test_split(train, test_size=0.10, random_state=SEED)

cuda


In [5]:
# Show the first five rows of the training frame as plain text.
print(train.head(n=5))

                                           full_text  cohesion  syntax  \
0  I think that students would benefit from learn...       3.5     3.5   
1  When a problem is a change you have to let it ...       2.5     2.5   
2  Dear, Principal\n\nIf u change the school poli...       3.0     3.5   
3  The best time in life is when you become yours...       4.5     4.5   
4  Small act of kindness can impact in other peop...       2.5     3.0   

   vocabulary  phraseology  grammar  conventions  
0         3.0          3.0      4.0          3.0  
1         3.0          2.0      2.0          2.5  
2         3.0          3.0      3.0          2.5  
3         4.5          4.5      4.0          5.0  
4         3.0          3.0      2.5          2.5  


In [10]:
# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_path_or_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name,
)

In [11]:
class CustomDataset(Dataset):
    """Dataset of essays paired with their six score targets.

    Each item is a tuple ``(input_ids, attention_mask, targets)`` where the
    first two come from calling ``tokenizer`` on the essay text and
    ``targets`` is a tensor of the six scores in ``TARGET_COLUMNS`` order.

    Args:
        dataframe: Frame with a ``full_text`` column and one column per
            entry of ``TARGET_COLUMNS``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every text is padded/truncated to.
    """

    # Score columns, in the order they are placed in the target tensor.
    TARGET_COLUMNS = ('cohesion', 'syntax', 'vocabulary',
                      'phraseology', 'grammar', 'conventions')

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Pull the needed columns out once instead of iterating row objects.
        texts = dataframe['full_text'].tolist()
        scores = dataframe[list(self.TARGET_COLUMNS)].to_numpy().tolist()
        self.data = list(zip(texts, scores))

    def __len__(self):
        """Return the number of essays held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the essay at ``idx`` and return it with its targets."""
        text, targets = self.data[idx]

        # Honour the max_length given to the constructor (previously a
        # hard-coded 512 was used here and the argument was ignored).
        encoded = self.tokenizer(text, padding='max_length', truncation=True,
                                 max_length=self.max_length, return_tensors='pt')
        # The tokenizer output carries a leading dimension of size one for
        # the single text; drop it so the DataLoader can add the batch axis.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        targets = torch.tensor([float(target) for target in targets])

        return input_ids, attention_mask, targets

In [12]:
# Wrap the train/validation frames so each item is tokenized on access.
train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(valid_df, tokenizer)

batch_size = 8
# Only the training data is shuffled. Validation is a plain pass over the
# held-out rows, so a fixed order keeps its batches repeatable across epochs.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [13]:
class BertForMultipleRegression(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear layer with six outputs."""

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        # Dropout rate is taken from the checkpoint configuration.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for the given token ids and mask."""
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output goes through dropout, then the head.
        return self.regressor(self.dropout(encoder_outputs[1]))

In [14]:
def format_time(elapsed):
    '''
    Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first; the text is
    whatever str() of the corresponding datetime.timedelta produces.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [15]:
# Load the pretrained weights into the regression model and place it on the
# selected device in one step.
model = BertForMultipleRegression.from_pretrained(model_path_or_name).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleRegression: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForMultipleRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleRegression were not initialized from the model checkpoint at bert-base-uncas

In [17]:
epochs = 10
optimizer = AdamW(model.parameters(), lr=1e-6, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [18]:
# Per-epoch mean losses, appended to by the training loop.
train_losses, val_losses = [], []
# Number of batches between progress printouts during training.
show_every = 20

In [19]:
for epoch_i in range(0, epochs):

    # Per-batch losses for this epoch; their means are appended to the
    # notebook-level train_losses / val_losses histories.
    store_train_loss = []
    store_val_loss = []
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs_ids, attention_masks, labels = batch
        model.zero_grad()

        outputs = model(inputs_ids,
                        attention_mask=attention_masks)
        loss = F.mse_loss(outputs, labels)
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        store_train_loss.append(loss.item())

        # Periodic progress report: mean loss over the last `show_every` batches.
        if step % show_every == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {} / {}.  Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('Training loss: %.3f'%(np.mean(store_train_loss[-show_every:]) ))
    train_losses.append(np.mean(store_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # No gradients are needed for validation, including the loss itself.
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            val_inputs_ids, val_attention_masks, val_labels = batch
            val_outputs = model(val_inputs_ids,
                                attention_mask=val_attention_masks)
            val_loss = F.mse_loss(val_outputs, val_labels)
            store_val_loss.append(val_loss.item())
    mean_val_loss = np.mean(store_val_loss)
    val_losses.append(mean_val_loss)

    # Report the 1-based epoch number so it matches the header printed above.
    print("Epoch {}: Train Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch_i + 1, train_losses[-1], val_losses[-1]))
print("")
print("Training complete!")


Training...
  Batch 20 / 440.  Elapsed: 0:00:18.
Training loss: 9.375
  Batch 40 / 440.  Elapsed: 0:00:34.
Training loss: 8.322
  Batch 60 / 440.  Elapsed: 0:00:49.
Training loss: 8.235
  Batch 80 / 440.  Elapsed: 0:01:06.
Training loss: 7.889
  Batch 100 / 440.  Elapsed: 0:01:23.
Training loss: 7.910
  Batch 120 / 440.  Elapsed: 0:01:40.
Training loss: 7.376
  Batch 140 / 440.  Elapsed: 0:01:56.
Training loss: 7.052
  Batch 160 / 440.  Elapsed: 0:02:12.
Training loss: 6.251
  Batch 180 / 440.  Elapsed: 0:02:29.
Training loss: 5.893
  Batch 200 / 440.  Elapsed: 0:02:45.
Training loss: 5.239
  Batch 220 / 440.  Elapsed: 0:03:02.
Training loss: 4.890
  Batch 240 / 440.  Elapsed: 0:03:18.
Training loss: 4.052
  Batch 260 / 440.  Elapsed: 0:03:35.
Training loss: 3.843
  Batch 280 / 440.  Elapsed: 0:03:51.
Training loss: 3.623
  Batch 300 / 440.  Elapsed: 0:04:07.
Training loss: 3.015
  Batch 320 / 440.  Elapsed: 0:04:24.
Training loss: 2.632
  Batch 340 / 440.  Elapsed: 0:04:40.
Training 

In [None]:
test = pd.read_csv(dirname + '/test.csv')
# Add zero-valued placeholder columns for the six scores so the test frame
# has the same columns the dataset class reads from the training frame.
for placeholder_column in ('cohesion', 'syntax', 'vocabulary',
                           'phraseology', 'grammar', 'conventions'):
    test[placeholder_column] = 0
print(test.head())

        text_id                                          full_text  cohesion  \
0  0000C359D63E  when a person has no experience on a job their...         0   
1  000BAD50D026  Do you think students would benefit from being...         0   
2  00367BB2546B  Thomas Jefferson once states that "it is wonde...         0   

   syntax  vocabulary  phraseology  grammar  conventions  
0       0           0            0        0            0  
1       0           0            0        0            0  
2       0           0            0        0            0  


In [22]:
model.eval()
test_dataset = CustomDataset(test, tokenizer)
batch_size = 8
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# One row of six predictions per test essay, filled in batch by batch.
outputs = torch.zeros(len(test_dataset), 6)
with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs_ids, attention_masks, labels = batch
        # The loader is not shuffled, so batch `step` covers these rows.
        row_start = step * batch_size
        outputs[row_start:row_start + batch_size] = model(inputs_ids, attention_mask=attention_masks)

In [23]:
# Copy each prediction column into the matching score column of the frame.
score_columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
predictions = outputs.detach().cpu().numpy()
for position, column in enumerate(score_columns):
    test[column] = predictions[:, position]

# The essay text is dropped before the frame is written out.
test.drop(columns=['full_text'], inplace=True)
print(test.head())

test.to_csv('submission.csv', index=False)

        text_id  cohesion    syntax  vocabulary  phraseology   grammar  \
0  0000C359D63E  3.103090  2.989382    3.289109     3.112214  2.984290   
1  000BAD50D026  3.052192  2.956181    3.237710     3.094946  3.002252   
2  00367BB2546B  3.711895  3.552477    3.543804     3.622966  3.530829   

   conventions  
0     3.032859  
1     3.056203  
2     3.603334  


In [1]:
#RoBERTa

In [24]:
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


class RobertaForMultipleRegression(nn.Module):
    """RoBERTa encoder with dropout and a six-output linear regression head."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder named by the module-level `model_name`.
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for the given token ids and mask."""
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output; presumably the pooled
        # first-token ([CLS]-style) representation - confirm against the
        # transformers documentation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        return self.regressor(pooled_output)


# Bind `model` to the regression wrapper rather than the bare encoder: the
# training code below passes the model's return value straight to
# F.mse_loss against the six-column labels, so it must be the regressor output.
model = RobertaForMultipleRegression().to(device)

epochs = 5

# The loaders created earlier wrap datasets that still hold the BERT
# tokenizer. Rebuild them with the tokenizer loaded for this model so the
# token ids match the vocabulary of the checkpoint being trained.
train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(valid_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# The scheduler is stepped once per training batch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

def train_model():
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler`` and
    ``device``; the optimizer and scheduler are stepped once per batch.
    """
    model.train()
    running_loss = 0
    for batch in train_dataloader:
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)

        predictions = model(input_ids=input_ids, attention_mask=attention_masks)
        loss = F.mse_loss(predictions, labels)

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
    batch_count = len(train_dataloader)
    return running_loss / batch_count

def evaluate_model():
    """Return the mean batch MSE of ``model`` over ``val_dataloader``.

    The model is switched to eval mode and no gradients are tracked.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)
            predictions = model(input_ids=input_ids, attention_mask=attention_masks)
            running_loss += F.mse_loss(predictions, labels).item()
    batch_count = len(val_dataloader)
    return running_loss / batch_count

# One training pass followed by one validation pass per epoch.
for epoch in range(epochs):
    train_loss, val_loss = train_model(), evaluate_model()
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

print("Training complete!")
