# part1 predict gold index

In [1]:
import json
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from lion_pytorch import Lion

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx][0]['input_ids'].squeeze()
        attention_mask = self.data[idx][0]['attention_mask'].squeeze()
        labels = self.data[idx][1]
        return input_ids, attention_mask, labels
    
class CustomTestDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx][0]['input_ids'].squeeze()
        attention_mask = self.data[idx][0]['attention_mask'].squeeze()
        return input_ids, attention_mask

def update_data(file_name, all_gold_pred):
    # Load the existing data
    data = load_data(file_name)

    # Modify the data
    for i, item in enumerate(data):
        item['s.gold.index.predict'] = all_gold_pred[i]

    updated_file_name = './dataset/updated_' + file_name
    with open(updated_file_name, 'w') as file:
        json.dump(data, file, indent=4)

def load_data(file_name):
    file_path = os.path.join('dataset', file_name)
    print(f'Loading data from {file_path}')
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def preprocess_text(text):
    return text.strip().lower().replace('[', '').replace(']', '')

def tokenize_and_format(data, tokenizer, max_length=128):
    processed_data = []

    for i, item in tqdm(enumerate(data)):
        utterance = preprocess_text(item['u'])
        situational_statements = [preprocess_text(statement) for statement in item['s']]
        gold_index = item.get('s.gold.index', [])
        response = preprocess_text(item['r'])
        response_label = item.get('r.label', None)
        for index, statement in enumerate(situational_statements):
            formatted_text = f'{utterance} {response} {statement}'
            if index in gold_index:
                label_tensor = torch.tensor(1, dtype=torch.float)
            else:
                label_tensor = torch.tensor(0, dtype=torch.float)

            tokenized_text = tokenizer.encode_plus(
                formatted_text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt'
            )

            processed_data.append((tokenized_text, label_tensor))

    return processed_data


def train(model, train_data, val_data, test_data, optimizer, criterion, device, epochs):
    model.to(device)

    min_val_loss = 10000
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for input_ids, attention_mask, labels in tqdm(train_data):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.unsqueeze(1).to(device)

            model.zero_grad()

            # 執行模型
            outputs = model(input_ids, attention_mask=attention_mask).logits
            
            # print(f'outputs: {outputs.shape}, labels: {labels.shape}')

            loss = criterion(outputs, labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            # print(f'outputs: {outputs}, labels: {labels}')
        # 計算驗證集的損失
        val_loss = 0
        accuracy = 0
        all_val_preds = []
        all_val_labels = []
        all_gold_pred_val = []
        model.eval()
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_data:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.unsqueeze(1).to(device)

                # print(f'input_ids: {input_ids}, attention_mask: {attention_mask}, labels: {labels}')
                # exit(0)
                outputs = model(input_ids, attention_mask=attention_mask).logits
                val_loss += criterion(outputs, labels)

                # 計算驗證集的準確率, if accuracy > 0.5, then 1, else 0
                # print(f'outputs: {outputs}, labels: {labels}')
                outputs = (torch.sigmoid(outputs) > 0.5).cpu().numpy().astype(int)

                gold_pred = [i for i, output in enumerate(outputs) if output == 1]
                all_gold_pred_val.append(gold_pred)

                all_val_preds.append(outputs)
                all_val_labels.append(labels.cpu().numpy())

            all_val_preds = np.concatenate(all_val_preds, axis=0)
            all_val_labels = np.concatenate(all_val_labels, axis=0)
            accuracy = accuracy_score(all_val_labels, all_val_preds)

            if val_loss/len(val_data) < min_val_loss:
                min_val_loss = val_loss/len(val_data)
                # print(f'Saving model at epoch {epoch+1}, min_val_loss: {min_val_loss:.4f}')
                torch.save(model.state_dict(), 'best_index_model.pt')

        tqdm.write(f'Epoch [{epoch+1}/{epochs}], '
                   f'Loss: {total_loss / len(train_data):.4f}, '
                   f'Validation Loss: {val_loss/len(val_data):.4f}, '
                   f'Validation Accuracy: {accuracy:.4f}')
    
    with torch.no_grad():
        all_gold_pred_test = []
        for input_ids, attention_mask in test_data:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask).logits

            outputs = (torch.sigmoid(outputs) > 0.5).cpu().numpy().astype(int)

            gold_pred = [i for i, output in enumerate(outputs) if output == 1]

            all_gold_pred_test.append(gold_pred)
    
    return all_gold_pred_val, all_gold_pred_test

def calculate_pos_weight(train_data):
    positive_count = sum([label for _, _, label in train_data])
    total_count = len(train_data)
    negative_count = total_count - positive_count
    print(f'positive_count: {positive_count}, negative_count: {negative_count}')
    return negative_count.float() / positive_count.float()

model_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Adjust the file paths as necessary
train_data = load_data('train.json')
val_data = load_data('val.json')
test_data = load_data('test.json')

processed_train = tokenize_and_format(train_data, tokenizer)
processed_val = tokenize_and_format(val_data, tokenizer)
processed_test = tokenize_and_format(test_data, tokenizer)
epochs = 1

# optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-2)
optimizer = Lion(model.parameters(), lr=1e-6, weight_decay=5e-2)

# 轉換為 Dataset
train_dataset = CustomDataset(processed_train)
val_dataset = CustomDataset(processed_val)
test_dataset = CustomTestDataset(processed_test)

criterion = torch.nn.BCEWithLogitsLoss()

# 設定 batch size
batch_size = 12  # 或者您希望的其他數值
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 創建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 呼叫訓練函數
all_gold_pred_val, all_gold_pred_test = train(model, train_loader, val_loader, test_loader, optimizer, criterion, device, epochs)

print(f'all_gold_pred_val: {all_gold_pred_val[0]}')
print(f'all_gold_pred_test: {all_gold_pred_test[0]}')

# write to json file, add new key: s.gold.index.predict
update_data('val.json', all_gold_pred_val)
update_data('test.json', all_gold_pred_test)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading data from dataset/train.json
Loading data from dataset/val.json
Loading data from dataset/test.json


3696it [00:05, 687.43it/s]
792it [00:01, 733.62it/s]
792it [00:01, 627.00it/s]
100%|██████████| 3696/3696 [02:19<00:00, 26.59it/s]


Epoch [1/1], Loss: 0.3894, Validation Loss: 0.3270, Validation Accuracy: 0.8741
all_gold_pred_val: [2, 3, 5, 6, 7, 8, 9]
all_gold_pred_test: [0, 1, 2, 4, 5, 7, 10, 11]
Loading data from dataset/val.json
Loading data from dataset/test.json


# part2 preprocess 

In [5]:
import json
import os
from transformers import BertTokenizer
import torch
from transformers import AutoTokenizer
from gold import get_gold_indices
from tqdm import tqdm

def load_data(file_name):
    file_path = os.path.join('dataset', file_name)
    print(f'Loading data from {file_path}')
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def preprocess_text(text):
    return text.strip().lower().replace('[', '').replace(']', '')

def tokenize_and_format(data, tokenizer, max_length=256, dataset_type='train'):
    processed_data = []

    for i, item in tqdm(enumerate(data)):
        utterance = preprocess_text(item['u'])
        situational_statements = [preprocess_text(statement) for statement in item['s']]
        # situational_types = item['s.type']
        response = preprocess_text(item['r'])
        response_label = item.get('r.label', None)

        if dataset_type != 'train':
            gold_index = item.get('s.gold.index.predict', [])
            combined_statements = ' '.join([f"{preprocess_text(statement)}"
                                            for index, statement in enumerate(situational_statements) if index in gold_index])
        else:
            gold_index = item.get('s.gold.index', [])
            combined_statements = ' '.join([f"{preprocess_text(statement)}"
                                            for index, statement in enumerate(situational_statements) if index in gold_index])
        
        formatted_text = f"{utterance} {combined_statements} {response}"
        # formatted_text = f"{utterance}  {response}"

        # BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            formatted_text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )


        if dataset_type == 'test':
            label_tensor = None
            processed_data.append(tokenized_text)
        else:
            label_tensor = torch.tensor(response_label, dtype=torch.float) if response_label is not None else None
            processed_data.append((tokenized_text, label_tensor))

    return processed_data

# part3 train process

In [6]:
import torch
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
from preprocess import load_data, tokenize_and_format
from lion_pytorch import Lion
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification

from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx][0]['input_ids'].squeeze()
        attention_mask = self.data[idx][0]['attention_mask'].squeeze()
        labels = self.data[idx][1]
        return input_ids, attention_mask, labels

def train(model, train_data, val_data, optimizer, criterion, device, epochs):
    model.to(device)

    min_val_loss = 10000
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for input_ids, attention_mask, labels in tqdm(train_data):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.unsqueeze(1).to(device)

            model.zero_grad()

            # 執行模型
            outputs = model(input_ids, attention_mask=attention_mask).logits
            
            # print(f'outputs: {outputs.shape}, labels: {labels.shape}')

            loss = criterion(outputs, labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            # print(f'outputs: {outputs}, labels: {labels}')
        # 計算驗證集的損失
        val_loss = 0
        accuracy = 0
        all_val_preds = []
        all_val_labels = []
        model.eval()
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_data:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.unsqueeze(1).to(device)

                # print(f'input_ids: {input_ids}, attention_mask: {attention_mask}, labels: {labels}')
                # exit(0)
                outputs = model(input_ids, attention_mask=attention_mask).logits
                val_loss += criterion(outputs, labels)

                # 計算驗證集的準確率, if accuracy > 0.5, then 1, else 0
                # print(f'outputs: {outputs}, labels: {labels}')
                outputs = (outputs > 0.5).cpu().numpy().astype(float)

                all_val_preds.append(outputs)
                all_val_labels.append(labels.cpu().numpy())

            all_val_preds = np.concatenate(all_val_preds, axis=0)
            all_val_labels = np.concatenate(all_val_labels, axis=0)
            # print(f'all_val_preds: {all_val_preds[:5]}, all_val_labels: {all_val_labels[:5]}')
            # print(f'all_val_preds: {np.sum(all_val_preds, axis=0)}, all_val_labels: {np.sum(all_val_labels, axis=0)}')
            accuracy = accuracy_score(all_val_labels, all_val_preds)

            if val_loss/len(val_data) < min_val_loss:
                min_val_loss = val_loss/len(val_data)
                # print(f'Saving model at epoch {epoch+1}, min_val_loss: {min_val_loss:.4f}')
                torch.save(model.state_dict(), 'best_model.pt')

        tqdm.write(f'Epoch [{epoch+1}/{epochs}], '
                   f'Loss: {total_loss / len(train_data):.4f}, '
                   f'Validation Loss: {val_loss/len(val_data):.4f}, '
                   f'Validation Accuracy: {accuracy:.4f}')
        


def calculate_pos_weight(train_data):
    positive_count = sum([label for _, _, label in train_data])
    total_count = len(train_data)
    negative_count = total_count - positive_count
    return negative_count.float() / positive_count.float()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'microsoft/deberta-v3-large'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_data = load_data('train.json')
val_data = load_data('updated_val.json')

processed_train = tokenize_and_format(train_data, tokenizer, dataset_type='train')
processed_val = tokenize_and_format(val_data, tokenizer, dataset_type='val')

epochs = 8

# optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-2)
optimizer = Lion(model.parameters(), lr=1e-6, weight_decay=5e-2)


# 轉換為 Dataset
train_dataset = CustomDataset(processed_train)
val_dataset = CustomDataset(processed_val)

pos_weight = calculate_pos_weight(train_dataset)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))

# 設定 batch size
batch_size = 8  # 或者您希望的其他數值

# 創建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 呼叫訓練函數
train(model, train_loader, val_loader, optimizer, criterion, device, epochs)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading data from dataset/train.json
Loading data from dataset/updated_val.json


3696it [00:00, 5285.17it/s]
792it [00:00, 5409.28it/s]
100%|██████████| 462/462 [01:26<00:00,  5.36it/s]


Epoch [1/8], Loss: 0.7474, Validation Loss: 0.8692, Validation Accuracy: 0.7323


100%|██████████| 462/462 [01:26<00:00,  5.37it/s]


Epoch [2/8], Loss: 0.3767, Validation Loss: 0.8438, Validation Accuracy: 0.7955


100%|██████████| 462/462 [01:26<00:00,  5.37it/s]


Epoch [3/8], Loss: 0.2055, Validation Loss: 1.0434, Validation Accuracy: 0.7942


100%|██████████| 462/462 [01:26<00:00,  5.37it/s]


Epoch [4/8], Loss: 0.1435, Validation Loss: 1.3464, Validation Accuracy: 0.8005


100%|██████████| 462/462 [01:26<00:00,  5.36it/s]


Epoch [5/8], Loss: 0.0995, Validation Loss: 1.3225, Validation Accuracy: 0.8119


100%|██████████| 462/462 [01:26<00:00,  5.37it/s]


Epoch [6/8], Loss: 0.0676, Validation Loss: 1.5578, Validation Accuracy: 0.7917


100%|██████████| 462/462 [01:26<00:00,  5.37it/s]


Epoch [7/8], Loss: 0.0557, Validation Loss: 1.6171, Validation Accuracy: 0.7955


100%|██████████| 462/462 [01:26<00:00,  5.36it/s]


Epoch [8/8], Loss: 0.0602, Validation Loss: 1.9707, Validation Accuracy: 0.7955


# part4 inference process

In [7]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from preprocess import load_data, tokenize_and_format
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx]['input_ids'].squeeze()
        attention_mask = self.data[idx]['attention_mask'].squeeze()
        return input_ids, attention_mask

def predict(model, test_data, device):
    model.to(device)
    model.eval()

    predictions = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_data):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask).logits
            preds = (torch.sigmoid(outputs) > 0.5).cpu().numpy().astype(int)
            predictions.append(preds)

    predictions = np.concatenate(predictions, axis=0)
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 加载训练好的模型
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.load_state_dict(torch.load('best_model.pt'))
model.eval()


# 加载并处理测试数据
test_data = load_data('updated_test.json')
processed_test = tokenize_and_format(test_data, tokenizer, dataset_type='test')

# 转换为 Dataset
test_dataset = CustomDataset(processed_test)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# 进行预测
test_preds = predict(model, test_loader, device)

# 创建 submission 文件
submission = pd.DataFrame(test_preds, columns=['response_quality'])
submission.to_csv('submission.csv', index=True, index_label='index')


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading data from dataset/updated_test.json


792it [00:00, 4973.91it/s]
100%|██████████| 99/99 [00:05<00:00, 18.52it/s]


# part5 ensemble part

In [8]:
import pandas as pd

# Load the five CSV files
df1 = pd.read_csv('data/responses.csv')
df2 = pd.read_csv('data/responses.csv')
df3 = pd.read_csv('data/submission_85123.csv')
df4 = pd.read_csv('data/submission_85123.csv')  # replace with your actual filename
df5 = pd.read_csv('data/submission_result.csv')  # replace with your actual filename

# Set 'index' as the index for correct row-wise operations
df1.set_index('index', inplace=True)
df2.set_index('index', inplace=True)
df3.set_index('index', inplace=True)
df4.set_index('index', inplace=True)
df5.set_index('index', inplace=True)

# Sum the response_quality columns
sum_df = df1['response_quality'] + df2['response_quality'] + df3['response_quality'] + df4['response_quality'] + df5['response_quality']

# Create a new DataFrame for the result
result_df = pd.DataFrame(sum_df, columns=['response_quality'])

# If the sum is greater than or equal to 3, set to 1; otherwise, set to 0
result_df['response_quality'] = result_df['response_quality'].apply(lambda x: 1 if x >= 3 else 0)

# Reset the index to turn it back into a column
result_df.reset_index(inplace=True)

# Save the result to a new CSV file
result_df.to_csv('data/submission_result2.csv', index=False)