In [1]:
import re


def extract_dois_from_ris(ris_file_path, output_file_path):
    """
    Extract the DOI values from an RIS-formatted file and save them to a text file.

    A DOI is read from every line that starts with the ``DO`` tag
    (``DO  - <value>``). Values are written one per line, in the order they
    appear in the input. ``DO`` lines with an empty value are skipped.

    Args:
        ris_file_path: Path of the RIS file to read (decoded as UTF-8).
        output_file_path: Path of the text file to write (UTF-8); an existing
            file is overwritten.

    Returns:
        The number of DOIs written. Returns 0 when the input file does not
        exist or any other error occurs; the error is printed, not raised.
    """
    try:
        with open(ris_file_path, "r", encoding="utf-8") as file:
            ris_content = file.read()

        # Match complete ``DO`` lines instead of looking ahead for the next tag:
        #   * the next tag may contain a digit (``M3``, ``T1``) or may not exist
        #     at all when the DOI is on the last line of the file;
        #   * ``^`` keeps "DO -" inside another field's text from matching;
        #   * ``[ \t]*`` (not ``\s*``) stops an empty ``DO`` line from running
        #     on into the following line.
        doi_pattern = r"^[ \t]*DO[ \t]*-[ \t]*(.*)$"
        doi_matches = re.findall(doi_pattern, ris_content, flags=re.MULTILINE)

        # Trim surrounding whitespace and discard empty values.
        doi_matches = [doi.strip() for doi in doi_matches]
        doi_matches = [doi for doi in doi_matches if doi]

        with open(output_file_path, "w", encoding="utf-8") as output_file:
            for doi in doi_matches:
                output_file.write(doi + "\n")

        print(f"成功提取{len(doi_matches)}个DOI号，已保存到{output_file_path}")
        return len(doi_matches)

    except FileNotFoundError:
        print(f"错误：找不到文件 {ris_file_path}")
        return 0
    except Exception as e:
        print(f"处理文件时发生错误: {str(e)}")
        return 0


In [2]:
# Example run: read the RIS export and write its DOIs to a plain-text list.
if __name__ == "__main__":
    # Input RIS file and output text file, in that order.
    ris_file, output_file = r"./new.txt", "./step1_new_txt_extracted_dois.txt"

    extract_dois_from_ris(ris_file_path=ris_file, output_file_path=output_file)


成功提取1865个DOI号，已保存到./step1_new_txt_extracted_dois.txt


In [3]:
import sqlite3

def read_db_data(db_file):
    """
    Print every non-NULL ``electronic_resource_number`` value found in a SQLite file.

    The first table reported by ``sqlite_master`` is read in full, the
    ``electronic_resource_number`` column is located by name, and each row
    whose value is not NULL is printed together with its 1-based row number.

    Args:
        db_file: Path of the SQLite database file (for example ``./new.enl``).

    Returns:
        None. Results and errors are reported with ``print``; database errors
        are caught rather than raised.
    """
    # Compute the display name outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12. Both "\" and "/"
    # are treated as separators so only the file name is shown.
    display_name = str(db_file).replace("\\", "/").rsplit("/", 1)[-1]

    # 1. Open the database connection.
    try:
        conn = sqlite3.connect(db_file)
        print(f"成功连接到数据库: {display_name}")
    except Exception as e:
        print(f"连接数据库失败: {e}")
        return

    # 2. Create a cursor.
    cursor = conn.cursor()

    try:
        # 3. List the tables in the database.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        print(f"数据库中的表: {tables}")

        if not tables:
            print("数据库中没有找到任何表。")
            return  # the ``finally`` clause still closes the connection

        table_name = tables[0][0]
        print(f"\n正在读取表 '{table_name}' 的数据...")

        # Quote the identifier (doubling embedded quotes) so table names with
        # spaces, quotes or reserved words still produce valid SQL.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"SELECT * FROM {quoted_table}")

        # 4. Fetch all rows.
        rows = cursor.fetchall()

        # 5. Resolve the column names; without them the target column cannot
        #    be located, so stop instead of hitting an unbound ``col_index``.
        if not cursor.description:
            print("无法获取列信息。")
            return
        column_names = [description[0] for description in cursor.description]
        print(f"列名: {column_names}")

        target_col = 'electronic_resource_number'
        if target_col not in column_names:
            print(f"\n错误：未找到列 '{target_col}'")
            return
        col_index = column_names.index(target_col)
        print(f"\n'{target_col}' 位于第 {col_index} 列。")

        # 6. Print every non-NULL value of the target column.
        count = 0
        for i, row in enumerate(rows, 1):
            if row[col_index] is not None:
                print("=========================="+str(i)+"==========================")
                print(row[col_index])
                count += 1

        print(f"\n共读取 {len(rows)} 行数据，其中 {count} 条包含有效的电子资源编号。")

    except Exception as e:
        print(f"读取数据时出错: {e}")

    finally:
        # 7. Always release the cursor and the connection.
        cursor.close()
        conn.close()
        print("\n数据库连接已关闭。")

# Run the reader against the database file next to the notebook.
db_path = r"./new.enl"
read_db_data(db_file=db_path)


成功连接到数据库: ./new.enl
数据库中的表: [('enl_refs',), ('sqlite_sequence',)]

正在读取表 'enl_refs' 的数据...
列名: ['id', 'trash_state', 'text_styles', 'reference_type', 'author', 'year', 'title', 'pages', 'secondary_title', 'volume', 'number', 'number_of_volumes', 'secondary_author', 'place_published', 'publisher', 'subsidiary_author', 'edition', 'keywords', 'type_of_work', 'date', 'abstract', 'label', 'url', 'tertiary_title', 'tertiary_author', 'notes', 'isbn', 'custom_1', 'custom_2', 'custom_3', 'custom_4', 'alternate_title', 'accession_number', 'call_number', 'short_title', 'custom_5', 'custom_6', 'section', 'original_publication', 'reprint_edition', 'reviewed_item', 'author_address', 'caption', 'custom_7', 'electronic_resource_number', 'translated_author', 'translated_title', 'name_of_database', 'database_provider', 'research_notes', 'language', 'access_date', 'last_modified_date', 'record_properties', 'added_to_library', 'record_last_updated', 'reserved3', 'fulltext_downloads', 'read_status', 'ratin

In [4]:
import sqlite3

def read_db_data(db_file, output_txt="output_doi.txt"):
    """
    Export the ``electronic_resource_number`` values of a SQLite file to a text file.

    The first table reported by ``sqlite_master`` is read in full and the
    ``electronic_resource_number`` column is located by name. Every value that
    is not NULL and not blank after stripping is written to ``output_txt``
    (one per line) and echoed with its 1-based row number.

    Args:
        db_file: Path of the SQLite database file (for example ``./new.enl``).
        output_txt: Path of the UTF-8 text file to write. It is created or
            overwritten only once the target column has been found.

    Returns:
        None. Progress and errors are reported with ``print``; database and
        file errors are caught rather than raised.
    """
    # Show only the file name. Both "\" and "/" count as separators, and
    # ``str()`` lets path objects through without raising after the
    # connection has already been opened.
    display_name = str(db_file).replace("\\", "/").rsplit("/", 1)[-1]

    # 1. Open the database connection.
    try:
        conn = sqlite3.connect(db_file)
        print(f"成功连接到数据库: {display_name}")
    except Exception as e:
        print(f"连接数据库失败: {e}")
        return

    # 2. Create a cursor.
    cursor = conn.cursor()

    try:
        # 3. List the tables in the database.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        print(f"数据库中的表: {tables}")

        if not tables:
            print("数据库中没有找到任何表。")
            return  # the ``finally`` clause still closes the connection

        table_name = tables[0][0]
        print(f"\n正在读取表 '{table_name}' 的数据...")

        # Quote the identifier (doubling embedded quotes) so table names with
        # spaces, quotes or reserved words still produce valid SQL.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"SELECT * FROM {quoted_table}")

        # 4. Fetch all rows.
        rows = cursor.fetchall()

        # 5. Resolve the column names and locate the target column.
        if not cursor.description:
            print("无法获取列信息。")
            return
        column_names = [description[0] for description in cursor.description]
        print(f"列名: {column_names}")

        target_col = 'electronic_resource_number'
        if target_col not in column_names:
            print(f"\n错误：未找到列 '{target_col}'")
            return
        col_index = column_names.index(target_col)
        print(f"\n'{target_col}' 位于第 {col_index} 列。")

        # 6. Write every usable value to the text file.
        count = 0
        with open(output_txt, 'w', encoding='utf-8') as f:
            for i, row in enumerate(rows, 1):
                if row[col_index] is None:
                    continue
                value = str(row[col_index]).strip()
                if value:  # skip values that are blank after stripping
                    f.write(value + '\n')
                    print(f"[{i}] {value}")
                    count += 1

        print(f"\n共读取 {len(rows)} 行数据，其中 {count} 条包含有效的电子资源编号。")
        print(f"结果已保存到: {output_txt}")

    except Exception as e:
        print(f"读取数据时出错: {e}")

    finally:
        # 7. Always release the cursor and the connection.
        cursor.close()
        conn.close()
        print("\n数据库连接已关闭。")



# Export the DOIs stored in the database file to a text file.
db_path = r"./new.enl"
output_path = r"./step1_new_enl_extracted_dois.txt"

read_db_data(db_file=db_path, output_txt=output_path)


成功连接到数据库: ./new.enl
数据库中的表: [('enl_refs',), ('sqlite_sequence',)]

正在读取表 'enl_refs' 的数据...
列名: ['id', 'trash_state', 'text_styles', 'reference_type', 'author', 'year', 'title', 'pages', 'secondary_title', 'volume', 'number', 'number_of_volumes', 'secondary_author', 'place_published', 'publisher', 'subsidiary_author', 'edition', 'keywords', 'type_of_work', 'date', 'abstract', 'label', 'url', 'tertiary_title', 'tertiary_author', 'notes', 'isbn', 'custom_1', 'custom_2', 'custom_3', 'custom_4', 'alternate_title', 'accession_number', 'call_number', 'short_title', 'custom_5', 'custom_6', 'section', 'original_publication', 'reprint_edition', 'reviewed_item', 'author_address', 'caption', 'custom_7', 'electronic_resource_number', 'translated_author', 'translated_title', 'name_of_database', 'database_provider', 'research_notes', 'language', 'access_date', 'last_modified_date', 'record_properties', 'added_to_library', 'record_last_updated', 'reserved3', 'fulltext_downloads', 'read_status', 'ratin

In [5]:
# The two DOI lists to combine and the file that receives the result.
file1_name = './step1_new_txt_extracted_dois.txt'
file2_name = './step1_new_enl_extracted_dois.txt'
merged_file_name = 'step1_all_list_dois.txt'


def merge_text_files(source_paths, merged_path):
    """
    Concatenate UTF-8 text files into ``merged_path``, keeping lines separate.

    All sources are read before the destination is opened, so a missing source
    raises without creating or truncating ``merged_path``. A newline is
    appended after any non-empty source that does not already end with one,
    so the last line of one file cannot fuse with the first line of the next.

    Args:
        source_paths: Iterable of paths to read, in output order.
        merged_path: Path of the file to create or overwrite.

    Raises:
        FileNotFoundError: If any source file does not exist.
    """
    contents = []
    for source_path in source_paths:
        with open(source_path, 'r', encoding='utf-8') as infile:
            contents.append(infile.read())

    with open(merged_path, 'w', encoding='utf-8') as outfile:
        for text in contents:
            outfile.write(text)
            if text and not text.endswith('\n'):
                outfile.write('\n')


try:
    merge_text_files([file1_name, file2_name], merged_file_name)
    print(f"合并成功！已生成 {merged_file_name}")

except FileNotFoundError:
    print("错误：找不到源文件，请检查文件名是否正确。")
except Exception as e:
    print(f"发生错误：{e}")


合并成功！已生成 step1_all_list_dois.txt


In [6]:
# Load the merged DOI list.
with open(r'./step1_all_list_dois.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Trim surrounding whitespace from every line and drop blank lines.
lines = [line.strip() for line in lines if line.strip()]

# De-duplicate while keeping first-seen order. A plain set() would also remove
# duplicates, but its iteration order for strings changes between kernel
# starts, so the output file would differ from run to run.
unique_dois = list(dict.fromkeys(lines))

# Write the unique list, one DOI per line.
# NOTE(review): the output name is "..._doi.txt" (no trailing "s"), so the
# merged input file is not overwritten — confirm this is intended.
with open(r'./step1_all_list_doi.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(unique_dois))

print(f"去重完成！原始行数: {len(lines)}, 去重后行数: {len(unique_dois)}")


去重完成！原始行数: 3735, 去重后行数: 1852


In [15]:
import os

# Count the regular files directly inside the folder (sub-directories are not
# counted) and compare that with the number of unique DOIs.
folder_path = "./literatures_pdfs"  # replace with your own folder path
file_count = sum(
    os.path.isfile(os.path.join(folder_path, entry_name))
    for entry_name in os.listdir(folder_path)
)
print(f"文件夹 '{folder_path}' 中的已找到文件数目: {file_count}")
print(f"未找到数目: {len(unique_dois) - file_count}")
print("完成！")

文件夹 './literatures_pdfs' 中的已找到文件数目: 1357
未找到数目: 495
完成！
