In [3]:
from transformers import BertTokenizer
import pandas as pd

# Load the pretrained 'bert-base-uncased' BERT tokenizer once at module level;
# adjust_token_length() below uses it to encode and decode text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def adjust_token_length(text, lower_limit=200, upper_limit=512):
    """
    Fit ``text`` into a token budget measured with the module-level tokenizer.

    The text is encoded with special tokens included, so both limits count
    those tokens as well.

    Args:
        text: The string to measure and, if necessary, shorten.
        lower_limit: Minimum number of tokens a text must have to be kept.
        upper_limit: Maximum number of tokens allowed after thinning; must be
            at least 1 whenever thinning is actually required.

    Returns:
        None when the text has fewer than ``lower_limit`` tokens, so the
        caller can drop that row. Otherwise the token ids decoded back into a
        string with special tokens removed, after strided sampling if the
        sequence was longer than ``upper_limit``.

    Raises:
        ValueError: If the text needs thinning but ``upper_limit`` is smaller
            than 1 (no stride can satisfy such a limit).
    """
    # encode() may log a "sequence length is longer than the specified maximum"
    # warning for long inputs. That is expected here: the ids are only counted
    # and thinned in this function, never passed to the model.
    tokens = tokenizer.encode(text, add_special_tokens=True)
    length = len(tokens)

    if length < lower_limit:
        return None  # Below the lower limit: mark the row for deletion.

    if length > upper_limit:
        if upper_limit < 1:
            raise ValueError("upper_limit must be at least 1")
        # One strided slice always terminates. stride = ceil(length / upper_limit)
        # guarantees ceil(length / stride) <= upper_limit. For lengths up to
        # 2 * upper_limit the stride is 2, i.e. every other token is kept.
        # (Re-concatenating the even and odd halves would keep the length
        # unchanged and never get under the limit.)
        stride = -(-length // upper_limit)
        tokens = tokens[::stride]

    # Convert the (possibly thinned) token ids back to text.
    # NOTE(review): re-tokenizing the decoded string is not guaranteed to yield
    # exactly the same token count - confirm if a hard limit matters downstream.
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Load the raw dataset.
df = pd.read_csv('./dataset/test.csv', encoding='latin-1')

# Coerce every description to str, then fit it to the token budget;
# adjust_token_length returns None for rows that are too short.
df['description'] = df['description'].astype(str).apply(adjust_token_length)

# Discard the rows whose description was marked for removal (None).
df = df.dropna(subset=['description'])

# Write the processed dataset to a new CSV file.
df.to_csv('./dataset/test_total_processed_new.csv', index=False, encoding='latin-1')

Token indices sequence length is longer than the specified maximum sequence length for this model (724 > 512). Running this sequence through the model will result in indexing errors
