In [1]:
import os
import re
import pandas as pd
import pybtex
from tqdm import tqdm
from pybtex.database import parse_file

pybtex.errors.set_strict_mode(False)

class BibData:
    data_dicts: []
    data_df: pd.DataFrame
    # num_of_literature: int
    path_to_data: str

    def __init__(self, path_to_data: str, is_dir=True, show_log=False):
        # path_to_data: String. should point to either a .bib file or a directory directly contains .bib files
        # show_log: if set to true, will print the number of literature imported from each individual file
        self.data_df = None
        self.path_to_data = path_to_data
        if not os.path.isdir(path_to_data) and is_dir:
            print("Warning: A path to a single file is given, it's recommanded to set parameter \"is_dir\" to false.")
            is_dir = False

        if is_dir:
            self.data_dicts = self._dir_to_list(path_to_data, show_log)
        else:
            self.data_dicts = self._single_bib_to_list(path_to_data, show_log)
        self.num_of_literature = len(self.data_dicts)
        print("Imported " + str(self.num_of_literature) + " literature(s) from \"" + path_to_data + "\"")

    def _single_bib_to_list(self, file_path, show_log=False):
        # get the reference details in one .bib file in the format of a list.
        # reture format:
        # a tuple contains two elements:
        # 1st element: a list of lists contains names of the fields for each literature
        # 2nd element: a list of lists contains data of the fields for each literature
        entries_dicts = []

        bibdata = parse_file(file_path)
        for entry_key in bibdata.entries:
            entry_fields = ["entry_key"]
            entry_values = [entry_key]

            for field in bibdata.entries[entry_key].fields:
                entry_fields.append(str(field).lower())
                if str(field) == "doi" and "https://doi.org/" in str(bibdata.entries[entry_key].fields[field]):
                    entry_values.append(str(bibdata.entries[entry_key].fields[field])[16:])
                else:
                    entry_values.append(str(bibdata.entries[entry_key].fields[field]))
            entry_dict = dict(zip(entry_fields, entry_values))

            entries_dicts.append(entry_dict)
        if show_log:
            print("Imported " + str(len(entries_fields)) + " literature from " + file_path)
        return entries_dicts

    def _dir_to_list(self, path, show_log=False):
        # takes in a .bib file or a folder directly contains multiple .bib files as input
        # return a list of single literature lists
        dir_entries_dicts = []

        files = os.listdir(dir_path)
        file_num = len(files)
        error_num = 0
        for file in files:
            try:
                file_entries_dicts = self._single_bib_to_list(path + file)
                dir_entries_dicts = dir_entries_dicts + file_entries_dicts
                if show_log:
                    print("Imported " + str(len(file_entries_dicts)) + " literature from " + file)
            except Exception as err:
                print("Error occured when reading file \"" + file + "\". Message: ", err)
                error_num += 1
        print("\n\n-------------------------------------------------------")
        print(
            "Successfully read " + str(file_num - error_num) + " file(s), " + str(error_num) + " file(s) raised error")
        return dir_entries_dicts

    def remove_duplication(self, fill_na=False, duplication_key=[], keep="first", internal_call=False):
        if self.data_df is None:
            print("DataFrame is never generated, will generate a DataFrame first")
            self.to_DataFrame(True, duplication_key, keep)
        else:
            if fill_na:
                print("non values:")
                print("doi: ", bibdata.data_df["doi"].isnull().sum(), 
                      "\nurl:", bibdata.data_df["url"].isnull().sum(), 
                      "\ntitle:", bibdata.data_df["title"].isnull().sum(), 
                      "\nabstract:", bibdata.data_df["abstract"].isnull().sum())
                bibdata.data_df["doi"] = bibdata.data_df["doi"].fillna(bibdata.data_df["url"])
                bibdata.data_df["doi"] = bibdata.data_df["doi"].fillna(bibdata.data_df["title"])
                bibdata.data_df["doi"] = bibdata.data_df["doi"].fillna(bibdata.data_df["abstract"])
                bibdata.data_df["abstract"] = bibdata.data_df["abstract"].fillna(bibdata.data_df["title"])
                bibdata.data_df["url"] = bibdata.data_df["url"].fillna(bibdata.data_df["doi"])
                print("\nafter filling, non values:")
                print("doi: ", bibdata.data_df["doi"].isnull().sum(), 
                      "\nurl:", bibdata.data_df["url"].isnull().sum(), 
                      "\ntitle:", bibdata.data_df["title"].isnull().sum(), 
                      "\nabstract:", bibdata.data_df["abstract"].isnull().sum())
            if internal_call==False:
                print("\n\n-----------------DUPLICATION REMOVAL-------------------")
            pre_rm_dup = self.data_df.shape[0]
            if duplication_key == []:
                self.data_df = self.data_df.drop_duplicates(keep=keep)
            elif len(duplication_key)==1:
                self.data_df = self.data_df.drop_duplicates(keep=keep, subset=duplication_key)
            else:
                for key in duplication_key:
                    self.remove_duplication(duplication_key=[key], internal_call=True)
                 
            if internal_call==False:
                print("\n\nSummary:")
            print("key for duplication detection: \"", duplication_key, "\"")
            print("before duplication removal: " + str(pre_rm_dup))
            print("after duplication removal: " + str(self.data_df.shape[0]))
            print("removed records: " + str(pre_rm_dup - self.data_df.shape[0])+"\n")
            # self.data_df = self.data_df.reset_index()
        return

    def to_DataFrame(self, remove_duplicate=False, fill_na=False, duplication_key="doi", keep="first"):
        if self.data_df is None:
            columns_labels = []
            for entry_dictionary in tqdm(self.data_dicts, desc="Collecting Labels"):
                keys = list(entry_dictionary.keys())
                for key in keys:
                    if key not in columns_labels:
                        columns_labels.append(key)
            print(str(len(columns_labels)) + " labels are collected.")

            bib_df = pd.DataFrame(columns=columns_labels)
            for entry_dictionary in tqdm(self.data_dicts, desc="Filling in Data"):
                entry_df = pd.DataFrame(entry_dictionary, index=[0])
                bib_df = pd.concat([bib_df, entry_df], join="outer")
            bib_df = bib_df.reset_index()

            self.data_df = bib_df

            if remove_duplicate:
                self.remove_duplication(fill_na, duplication_key, keep)

            return self.data_df
        else:
            return self.data_df

    def to_excel(self, folder_path=""):
        if self.data_df is None:
            print("DataFrame is never generated, will generate a DataFrame first")
            self.to_DataFrame()
            self.data_df.to_excel(folder_path + "BibDataOutput.xlsx")
        else:
            self.data_df.to_excel(folder_path + "BibDataOutput.xlsx")
        print(str(self.data_df.shape[0]) + " records are output to\""+folder_path + "BibDataOutput.xlsx\"")
        return

    def record_num(self):
        return self.data_df.shape[0]

    def data_source(self):
        return self.path_to_data

In [2]:
dir_path = "data/refined_search_results/"
file_path = "data/references.bib"
bibdata = BibData(dir_path)
bibdata.to_DataFrame()
bibdata.remove_duplication(fill_na=True, duplication_key=["doi", "url", "title", "abstract", "entry_key"])



Error occured when reading file "springerlink.txt". Message:  plugin pybtex.database.input.suffixes for suffix .txt not found
Error occured when reading file ".DS_Store". Message:  plugin pybtex.database.input.suffixes. not found




Error occured when reading file "SearchResults2.csv". Message:  plugin pybtex.database.input.suffixes for suffix .csv not found




Error occured when reading file "SearchResults1-2.csv". Message:  plugin pybtex.database.input.suffixes for suffix .csv not found
Error occured when reading file "SearchResults1-1.csv". Message:  plugin pybtex.database.input.suffixes for suffix .csv not found
Error occured when reading file "springerlink2.txt". Message:  plugin pybtex.database.input.suffixes for suffix .txt not found
Error occured when reading file "springerlink1-2.txt". Message:  plugin pybtex.database.input.suffixes for suffix .txt not found
Error occured when reading file "springerlink1-1.txt". Message:  plugin pybtex.database.input.suffixes for suffix .txt not found


-------------------------------------------------------
Successfully read 37 file(s), 8 file(s) raised error
Imported 5438 literature(s) from "data/refined_search_results/"


Collecting Labels: 100%|████████████████| 5438/5438 [00:00<00:00, 767863.76it/s]


38 labels are collected.


Filling in Data: 100%|█████████████████████| 5438/5438 [00:21<00:00, 254.34it/s]

non values:
doi:  511 
url: 2440 
title: 0 
abstract: 7

after filling, non values:
doi:  0 
url: 0 
title: 0 
abstract: 0


-----------------DUPLICATION REMOVAL-------------------
key for duplication detection: " ['doi'] "
before duplication removal: 5438
after duplication removal: 4751
removed records: 687

key for duplication detection: " ['url'] "
before duplication removal: 4751
after duplication removal: 4751
removed records: 0

key for duplication detection: " ['title'] "
before duplication removal: 4751
after duplication removal: 4659
removed records: 92

key for duplication detection: " ['abstract'] "
before duplication removal: 4659
after duplication removal: 4619
removed records: 40

key for duplication detection: " ['entry_key'] "
before duplication removal: 4619
after duplication removal: 4614
removed records: 5



Summary:
key for duplication detection: " ['doi', 'url', 'title', 'abstract', 'entry_key'] "
before duplication removal: 5438
after duplication removal: 4614
re




In [4]:
bibdata.to_excel()

4614 records are output to"BibDataOutput.xlsx"


In [5]:
bibdata.record_num()
bibdata.data_source()

'data/refined_search_results/'