用于生成new.gff文件

In [4]:
def process_gff(input_file_path, output_file_path):
    """Extract the CDS records of a GFF file into a five-column table.

    For every line whose third column is ``CDS`` one tab-separated row is
    written containing: column 1 (seqid), column 4 (start), column 5 (end),
    column 7 (the strand column in the GFF layout) and an identifier taken
    from the ``ID=`` attribute of column 9.

    The identifier is the piece between the first and second ``-`` of the
    ``ID=`` attribute (``ID=cds-ABC.1`` -> ``ABC.1``).  When the attribute
    contains no ``-`` the whole value after ``ID=`` is used.

    Skipped without output: lines starting with ``#``, blank lines, lines
    with fewer than nine tab-separated columns, and CDS lines that carry no
    ``ID=`` attribute.

    Args:
        input_file_path: Path of the GFF file to read.
        output_file_path: Path of the table to write (overwritten).
    """
    with open(input_file_path, 'r') as file, open(output_file_path, 'w') as output_file:
        for line in file:
            if line.startswith('#'):  # skip comment / directive lines
                continue
            columns = line.strip().split('\t')
            # Blank or truncated lines do not have the nine GFF columns;
            # indexing them would raise IndexError.
            if len(columns) < 9 or columns[2] != 'CDS':
                continue
            seqid = columns[0]
            start = columns[3]
            end = columns[4]
            strand = columns[6]

            # Reset per record so an ID from an earlier line is never reused.
            id_value = None
            for attribute in columns[8].split(';'):
                if attribute.startswith('ID='):
                    pieces = attribute.split('-')
                    # "ID=cds-ABC.1" -> "ABC.1"; without a hyphen keep the
                    # text after "ID=" instead of failing.
                    id_value = pieces[1] if len(pieces) > 1 else attribute[3:]
                    break
            if id_value is None:
                continue

            output_file.write('\t'.join([seqid, start, end, strand, id_value]) + '\n')

# Paths used by this cell: the GFF file to read and the table to write ("output.gff").
input_file_path = "GCA_008124465.1_ASM812446v1_genomic.gff"
output_file_path = "output.gff"

process_gff(input_file_path, output_file_path)

In [1]:
import re

def process_data(input_file_path, output_file_path):
    """Keep the rows located on ``HIC_ASM_`` sequences and sort them.

    Each tab-separated input row whose first column starts with
    ``HIC_ASM_`` is kept; every other row (for example ``CTG...`` rows) is
    dropped.  The ``HIC_ASM_`` prefix is removed from the first column and
    the rows are written sorted by that remaining value.

    Sorting is numeric for values ``int()`` can parse.  Values it cannot
    parse (e.g. ``X``) no longer raise ``ValueError``: they are placed after
    all numeric ones, in string order.  The sort is stable, so rows sharing
    a sequence keep their input order.

    Args:
        input_file_path: Path of the tab-separated table to read.
        output_file_path: Path of the sorted table to write (overwritten).
    """
    data = []
    with open(input_file_path, 'r') as file:
        for line in file:
            columns = line.strip().split('\t')
            # Only HIC_ASM_ rows are kept, which already excludes CTG rows
            # and blank lines.
            if columns[0].startswith('HIC_ASM_'):
                # Strip the "HIC_ASM" prefix together with trailing underscores.
                seqid = re.sub(r'^HIC_ASM_*', '', columns[0])
                data.append('\t'.join([seqid] + columns[1:]))

    def _seqid_key(row):
        """Sort key: numeric ids first (by value), other ids afterwards."""
        seqid = row.split('\t')[0]
        try:
            return (0, int(seqid), '')
        except ValueError:
            return (1, 0, seqid)

    data.sort(key=_seqid_key)

    with open(output_file_path, 'w') as output_file:
        for item in data:
            output_file.write(item + '\n')

# Paths of the input file and the output file.
# NOTE(review): the process_gff cell writes "output.gff" but this cell reads ".1.gff" — confirm the input name.
input_file_path = ".1.gff"
output_file_path = "over.gff"

process_data(input_file_path, output_file_path)

In [2]:
# Give every row of over.gff a new sequential gene name and write the result
# to Cnu.new.gff.
#
# Output columns (tab-separated):
#   input col 1, new name, input col 2, input col 3, input col 4,
#   running number within the sequence, input col 5
# The new name is 'Cnu' + <col 1> + 'g' + the zero-padded running number,
# which restarts at 1 whenever column 1 changes.
insat_file = r"over.gff"
outsat_file = r'Cnu.new.gff'
data = []
num = 0
with open(insat_file) as object_file:
    for line in object_file:
        fields = line.split()
        if fields:  # ignore blank lines instead of failing on data[k][0]
            data.append(fields)

# The output is opened once in 'w' mode: appending (and reopening the file
# for every row) duplicated all rows each time the cell was re-run.
with open(outsat_file, 'w') as obj:
    previous_seq = None  # explicit "no previous row" marker for the first row
    for row in data:
        bb = row[0]
        if bb != previous_seq:
            num = 1
        else:
            num += 1
        previous_seq = bb
        aa = 'Cnu' + bb + 'g' + '%04d' % num
        obj.write('\t'.join([bb, aa, row[1], row[2], row[3], str(num), row[4]]) + '\n')


用于生成lens文件

In [10]:
## Build the lens file from the renamed and sorted gff file.
# For every run of consecutive rows sharing the same first column, one line
# is written from the LAST row of that run: column 1, column 4 and column 6.
# (In Cnu.new.gff as written by the renaming cell, column 4 is the end
# coordinate and column 6 the running gene number of that row.)
ipf = r'Cnu.new.gff'
opt = r'Cnu.lens'
data = []
with open(ipf) as tj:
    for line in tj:
        fields = line.split()
        if fields:  # ignore blank lines
            data.append(fields)

# Opened once in 'w' mode so that re-running the cell does not append a
# second copy of every line; an empty input simply yields an empty file.
with open(opt, 'w') as tjo:
    for i, row in enumerate(data):
        ends_run = i + 1 == len(data) or data[i + 1][0] != row[0]
        if ends_run:
            tjo.write(row[0] + '\t' + row[3] + '\t' + row[5] + '\n')

生成cds文件

In [11]:
def replace_seq_name(cds_file_path, gff_file_path, output_file_path):
    """Rename the records of a FASTA file using a mapping read from a gff table.

    The mapping is built from the tab-separated gff table: column 7 is the
    current sequence name, column 2 the replacement.  Lines starting with
    ``#`` and lines with fewer than seven columns are ignored.  If a column-7
    value occurs more than once, the last row wins.

    Each FASTA header (a line starting with ``>``) is looked up first by its
    full text after ``>`` and, if that fails, by its first whitespace-
    separated word, so headers of the form ``>ID description`` are matched
    too.  Matched headers are rewritten as ``>`` + replacement; unmatched
    headers and all sequence lines are copied unchanged.

    Args:
        cds_file_path: Path of the FASTA file to read.
        gff_file_path: Path of the gff table providing the name mapping.
        output_file_path: Path of the FASTA file to write (overwritten).
    """
    # current name (column 7) -> new name (column 2)
    gff_dict = {}
    with open(gff_file_path, 'r') as gff_file:
        for line in gff_file:
            if line.startswith('#'):  # skip comment lines
                continue
            columns = line.strip().split('\t')
            if len(columns) < 7:  # blank / truncated line: nothing to map
                continue
            gff_dict[columns[6]] = columns[1]

    with open(cds_file_path, 'r') as cds_file, open(output_file_path, 'w') as output_file:
        for line in cds_file:
            if not line.startswith('>'):
                output_file.write(line)  # sequence data is copied as is
                continue

            seq_name = line.strip()[1:]  # header text without the '>'
            new_name = gff_dict.get(seq_name)
            if new_name is None:
                # Fall back to the identifier part of ">ID description".
                words = seq_name.split()
                if words:
                    new_name = gff_dict.get(words[0])

            if new_name is None:
                output_file.write(line)  # unknown name: keep the header
            else:
                output_file.write(f">{new_name}\n")

# Paths of the input files and of the output file.
cds_file_path = "Amorphophallus_konjac.clean.cds"  # replace with the actual path of the CDS file
gff_file_path = "Ako.new.gff"  # replace with the actual path of the GFF file
output_file_path = "Ako.cds"  # name of the output file

replace_seq_name(cds_file_path, gff_file_path, output_file_path)

生成pep文件

In [13]:
# NOTE(review): a function with the same name and signature is also defined in
# the CDS cell; this later definition shadows it. Consider defining it once.
def replace_seq_name(cds_file_path, gff_file_path, output_file_path):
    """Rename the records of a FASTA file using a mapping read from a gff table.

    The mapping is built from the tab-separated gff table: column 7 is the
    current sequence name, column 2 the replacement.  Lines starting with
    ``#`` and lines with fewer than seven columns are ignored.  If a column-7
    value occurs more than once, the last row wins.

    Each FASTA header (a line starting with ``>``) is looked up first by its
    full text after ``>`` and, if that fails, by its first whitespace-
    separated word, so headers of the form ``>ID description`` are matched
    too.  Matched headers are rewritten as ``>`` + replacement; unmatched
    headers and all sequence lines are copied unchanged.

    Args:
        cds_file_path: Path of the FASTA file to read (despite the name, any
            FASTA file works, e.g. a protein file).
        gff_file_path: Path of the gff table providing the name mapping.
        output_file_path: Path of the FASTA file to write (overwritten).
    """
    # current name (column 7) -> new name (column 2)
    gff_dict = {}
    with open(gff_file_path, 'r') as gff_file:
        for line in gff_file:
            if line.startswith('#'):  # skip comment lines
                continue
            columns = line.strip().split('\t')
            if len(columns) < 7:  # blank / truncated line: nothing to map
                continue
            gff_dict[columns[6]] = columns[1]

    with open(cds_file_path, 'r') as cds_file, open(output_file_path, 'w') as output_file:
        for line in cds_file:
            if not line.startswith('>'):
                output_file.write(line)  # sequence data is copied as is
                continue

            seq_name = line.strip()[1:]  # header text without the '>'
            new_name = gff_dict.get(seq_name)
            if new_name is None:
                # Fall back to the identifier part of ">ID description".
                words = seq_name.split()
                if words:
                    new_name = gff_dict.get(words[0])

            if new_name is None:
                output_file.write(line)  # unknown name: keep the header
            else:
                output_file.write(f">{new_name}\n")

# Paths of the input files and of the output file.
cds_file_path = "Amorphophallus_konjac.clean.pep"  # replace with the actual path of the protein (pep) FASTA file
gff_file_path = "Ako.new.gff"  # replace with the actual path of the GFF file
output_file_path = "Ako.pep"  # name of the output file

replace_seq_name(cds_file_path, gff_file_path, output_file_path)