In [None]:
如果你有超过BERT模型最大序列长度（512个tokens）的代码片段，你将需要采取一些策略来处理这个限制。以下是一些可行的方法：

1. 分块（Chunking）
将代码分成多个块，每个块都小于或等于最大序列长度。每个块可以单独处理，然后可以采取不同的策略来整合块的结果。

In [1]:
code_snippet = """
import os
import sys

def read_file(filename):
   
    try:
        with open(filename, 'r') as file:
            return file.read()
    except IOError:
        print(f"Error opening {filename}")
        sys.exit(1)

def process_content(content):
    
    processed_content = content.upper()
    return processed_content

def write_file(filename, content):
   
    try:
        with open(filename, 'w') as file:
            file.write(content)
    except IOError:
        print(f"Error writing to {filename}")
        sys.exit(1)

def main():
    if len(sys.argv) != 3:
        print("Usage: python script.py input_file output_file")
        sys.exit(1)
    
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    
    # Read the input file
    content = read_file(input_file)
    
    # Process the content
    processed_content = process_content(content)
    
    # Write the processed content to the output file
    write_file(output_file, processed_content)
    
    print(f"Processed content written to {output_file}")

if __name__ == "__main__":
    main()


"""

max_length = 512  # BERT's maximum sequence length

# The actual code snippet is now set in the `code_snippet` variable and exceeds typical length limits, 
# requiring the sliding window approach for processing with models like CodeBERT.


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")
model.eval()  # 设置模型为评估模式
chunk_size = 512  # 每个chunk的长度

In [3]:
#这段代码首先将代码分割成多个chunk，每个chunk由500个token组成。
# 然后，对每个chunk独立地获取embedding，并计算这些embeddings的平均值以获得代表整个代码段的单个embedding向量。
# 这种方法允许您处理超过模型最大序列长度限制的长代码段，并且通过平均embeddings来捕获整体的代码信息。
# 将代码分割为多个chunks
tokens = tokenizer.tokenize(code_snippet)
input_ids_chunks = [tokenizer.convert_tokens_to_ids(tokens[i:i+chunk_size]) for i in range(0, len(tokens), chunk_size)]

# 初始化一个列表来存储每个chunk的embedding
chunk_embeddings = []

for chunk in input_ids_chunks:
    # 将chunk转换为tensor，并添加批次维度
    input_ids_tensor = torch.tensor([chunk])
    
    with torch.no_grad():
        # 通过模型获取输出
        outputs = model(input_ids=input_ids_tensor)
        # 获取最后一层的隐藏状态作为embedding
        chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1))

# 计算所有chunk的embeddings的平均值
average_embedding = torch.mean(torch.stack(chunk_embeddings), dim=0)


In [18]:
average_embedding

tensor([[-4.8975e-01,  6.4742e-02,  2.7215e-01,  1.1820e-01, -2.4820e-01,
         -7.1653e-01, -1.9560e-02,  3.4860e-01,  5.0434e-01,  4.0049e-01,
         -3.4864e-01,  8.6760e-01, -2.3258e-01, -1.3339e-01,  8.2739e-01,
         -2.1255e-01,  1.7405e-01,  3.5642e-01, -3.8591e-02,  5.6129e-02,
         -2.0588e-01, -2.7615e-01,  6.3956e-01, -5.2021e-01,  4.6999e-01,
          4.3322e-01, -7.4934e-02,  8.6133e-01, -7.2590e-01,  8.3926e-01,
         -8.2937e-02,  1.3590e-01,  1.3960e+00,  1.9649e-01,  3.8412e-01,
         -3.7863e-01, -4.5276e-01,  1.8557e-01,  5.8113e-02, -3.6326e-01,
          2.3319e-02,  6.1426e-01, -9.7316e-01, -1.3430e-01,  4.3775e-01,
          3.0119e-01,  6.3022e-01, -8.0048e-02,  1.3813e-01,  6.5626e-01,
          5.5321e-01,  1.6538e-01, -5.5476e-01, -3.6035e-01,  4.7931e-01,
          5.6815e-01, -1.1156e+00, -8.2188e-01, -2.0880e-01, -4.3521e-01,
         -6.7213e-02, -4.1730e-01, -3.5018e-01, -9.1060e-02,  1.3303e+00,
          2.8497e-01,  7.0275e-01,  7.

In [5]:
#average
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch import nn

# 加载数据
data = pd.DataFrame({
    "text": [code_snippet, code_snippet*2, code_snippet*3],
    "label": [0.8, 0.9, 0.2]
})

# 定义数据集
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# 初始化tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# 准备数据集
train_data, val_data = train_test_split(data, test_size=0.1)
train_dataset = SentimentDataset(train_data['text'].to_numpy(), train_data['label'].to_numpy(), tokenizer)
val_dataset = SentimentDataset(val_data['text'].to_numpy(), val_data['label'].to_numpy(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# 定义BERT回归模型
class BertRegressor(nn.Module):
    def __init__(self):
        super(BertRegressor, self).__init__()
        self.bert = BertModel.from_pretrained("microsoft/codebert-base")
        self.regressor = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1),
            nn.Sigmoid()
        )
    
    def forward(self, input_ids, attention_mask):
        # 检测是否有多个chunks
        if input_ids.size(1) > 512:  # 假设batch_size在第一维
            all_embeddings = []
            step_size = 512
            for i in range(0, input_ids.size(1), step_size):
                chunk_input_ids = input_ids[:, i:i+step_size]
                chunk_attention_mask = attention_mask[:, i:i+step_size]
                chunk_outputs = self.bert(input_ids=chunk_input_ids, attention_mask=chunk_attention_mask)
                chunk_embeddings = chunk_outputs.pooler_output
                all_embeddings.append(chunk_embeddings)
            
            # 计算所有chunks embeddings的平均值
            embeddings = torch.mean(torch.stack(all_embeddings, dim=0), dim=0)
        else:
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = outputs.pooler_output
        
        return self.regressor(embeddings)

model = BertRegressor()

# 定义优化器和损失函数
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

# 训练模型
model.train()
for epoch in range(1):  # 这里的epoch数仅为示例，根据实际情况调整
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item()}")


# 假设outputs是模型的logits输出
# 转换为预测的类别
_, predicted_labels = torch.max(outputs, dim=1)

# 计算准确率
accuracy = (predicted_labels == labels).float().mean()

# 简化的评估和预测步骤（根据需要实现完整的评估逻辑）
model.eval()
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        print(f"Predicted label: {outputs.squeeze().item()}, Actual label: {batch['labels'].item()}")


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Epoch 0, Loss: 0.13820284605026245
Predicted label: 0.5388351082801819, Actual label: 0.800000011920929


In [6]:
#not average
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch import nn

# 加载数据
data = pd.DataFrame({
    "text": [code_snippet, code_snippet*2, code_snippet*3],
    "label": [0.8, 0.9, 0.2]
})

# 定义数据集
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# 初始化tokenizer
model = AutoModel.from_pretrained("microsoft/codebert-base")

# 准备数据集
train_data, val_data = train_test_split(data, test_size=0.1)
train_dataset = SentimentDataset(train_data['text'].to_numpy(), train_data['label'].to_numpy(), tokenizer)
val_dataset = SentimentDataset(val_data['text'].to_numpy(), val_data['label'].to_numpy(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# 定义BERT回归模型
class BertRegressor(nn.Module):
    def __init__(self):
        super(BertRegressor, self).__init__()
        self.bert = BertModel.from_pretrained("microsoft/codebert-base")
        self.regressor = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1),
            nn.Sigmoid()
        )
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(pooled_output)

model = BertRegressor()

# 定义优化器和损失函数
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

# 训练模型
model.train()
for epoch in range(1):  # 这里的epoch数仅为示例，根据实际情况调整
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item()}")


# 假设outputs是模型的logits输出
# 转换为预测的类别
_, predicted_labels = torch.max(outputs, dim=1)

# 计算准确率
accuracy = (predicted_labels == labels).float().mean()

# 简化的评估和预测步骤（根据需要实现完整的评估逻辑）
model.eval()
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        print(f"Predicted label: {outputs.squeeze().item()}, Actual label: {batch['labels'].item()}")


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Epoch 0, Loss: 0.12283027917146683
Predicted label: 0.46814465522766113, Actual label: 0.800000011920929
