# Library

In [92]:
import html
import importlib
import os
import re
import subprocess
import sys
import webbrowser
from pathlib import Path

import bibtexparser
from bibtexparser.bwriter import BibTexWriter

# Functions

In [93]:
def install_missing_packages(package_names):
    """
    Install Missing Packages

    Checks whether each package in a list can be imported and installs any
    that cannot, using pip.

    Parameters:
    - package_names (list): A list of package names to be installed. A single
      string is accepted as well and treated as a one-element list.

    Returns:
    - None

    Note: This function requires the `subprocess`, `importlib` and `sys`
    modules to be imported. The same string is used both as the module name
    to import and as the name handed to pip, so it only works for packages
    whose import name equals their distribution name.

    Example Usage:
    install_missing_packages(['h2o', 'numpy', 'pandas'])
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(package_names, str):
        package_names = [package_names]

    for package_name in package_names:
        try:
            importlib.import_module(package_name)
            print(f"{package_name} package is already installed")
        except ImportError:
            print(f"{package_name} package not found, installing with pip...")
            # Run pip through the interpreter that executes this code so the
            # package lands in the kernel's environment, not in whichever
            # `pip` happens to be first on PATH.
            exit_code = subprocess.call(
                [sys.executable, '-m', 'pip', 'install', package_name])
            if exit_code != 0:
                print(f"Failed to install {package_name} (pip exit code {exit_code})")


In [94]:
# Make sure the third-party dependency used by this notebook can be imported.
# NOTE(review): the first cell already runs `import bibtexparser`, so on a
# fresh kernel a missing package fails there before this check is reached.
package_list = ["bibtexparser"]
install_missing_packages(package_list)

bibtexparser package is already installed


## Processing .bib file

### extract_info

In [95]:
# Common English words that are skipped when choosing the title word of a
# citation key. Built once as a frozenset so membership tests are O(1) and the
# collection is not re-created on every call.
_CITATION_KEY_STOPWORDS = frozenset({
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his",
    "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having",
    "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
    "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about", "against", "between", "into", "through", "during",
    "before", "after", "above", "below", "to", "from", "up", "down", "in",
    "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now",
})


def generate_citation_key(author, year, title):
    """
    Generate Citation Key

    Builds a key of the form `<lastname>_<titleword>_<year>`, all lowercase.

    Parameters:
    - author (str): Name of the first author, expected as "Last, First".
      Everything before the first comma is used as the last name.
    - year (str): Publication year, inserted into the key unchanged.
    - title (str): Title of the work. The first word that is not a common
      stop word becomes the title part of the key.

    Returns:
    - str: The citation key. The last-name or title part is empty when the
      corresponding input yields nothing usable (e.g. "__0000").

    Example Usage:
    generate_citation_key("Smith, John", "2020", "The Art of War")
    # -> "smith_art_2020"
    """
    # First title word (lowercased) that is not a stop word, '' if none.
    title_words = (word.lower() for word in re.findall(r'\b(\w+)\b', title))
    first_word = next(
        (word for word in title_words if word not in _CITATION_KEY_STOPWORDS),
        '')

    # Last name = text before the first comma. Splitting on ',' (not ', ')
    # also copes with "Last,First".
    last_name = author.split(',')[0].strip().lower()
    # BibTeX keys must not contain whitespace, commas or braces, so keep only
    # word characters and hyphens ("van der Berg" -> "vanderberg").
    last_name = re.sub(r'[^\w-]', '', last_name)

    return f"{last_name}_{first_word}_{year}"

In [96]:
def extract_info(file, info_option=1, counts_to_extract=200, start_count=0):
    """
    Extract Information

    Reads `paper/<file>.bib` (relative to the current working directory),
    ranks the entries by citation count, writes a selection of them to
    `paper/output<info_option>.html` and opens that file in a web browser.

    Parameters:
    - file (str): Base name of the BibTeX file, without the `.bib` extension.
    - info_option (int): What to show for each entry:
        1 - citation count and title
        2 - citation key, title, first author and year
        3 - same as 2, followed by the abstract
    - counts_to_extract (int): Maximum number of entries to output.
    - start_count (int): Number of top-ranked entries to skip before the
      output starts.

    Returns:
    - list: The formatted entries as plain-text strings, in output order.

    Raises:
    - ValueError: If `info_option` is not 1, 2 or 3.

    Example Usage:
    extract_info("scopus", info_option=2, counts_to_extract=50)
    """
    # Fail early: an unsupported option would otherwise leave the per-entry
    # string undefined inside the formatting loop.
    if info_option not in (1, 2, 3):
        raise ValueError(f"info_option must be 1, 2 or 3, got {info_option!r}")

    def _as_int(value, default=0):
        # Years such as "in press" must not abort the sort; treat them as 0.
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    file_path = os.path.join(os.getcwd(), "paper", file + ".bib")
    with open(file_path, encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    data = []
    generated_keys = set()  # to keep track of generated keys
    for entry in bib_database.entries:
        title = entry.get('title', '')
        authors = entry.get('author', '').split(' and ')[0]  # Only consider the first author
        year = entry.get('year', '0000')  # Default year set to '0000' if not available
        cite = generate_citation_key(authors, year, title)  # Generate citation key

        # Check if the generated key already exists
        if cite in generated_keys:
            print(f"Warning: Duplicate citation key {cite} generated!")
        else:
            generated_keys.add(cite)

        abstract = entry.get('abstract', '')  # Extract abstract

        # Extract citation count from 'note' field
        note = entry.get('note', '')
        match = re.search(r'Cited by:\s*(\d+)', note, re.IGNORECASE)
        citation_count = match.group(1) if match else '0'  # Default citation count set to '0' if not available

        data.append({
            'cite': cite,
            'title': title,
            'authors': authors,
            'year': year,
            'abstract': abstract,
            'citation_count': citation_count
        })

    # Sort data by citation count, year in reverse order, then by authors and cite
    data.sort(key=lambda x: (-_as_int(x['citation_count']), -_as_int(x['year']),
                             x['authors'], x['cite']))

    # Take `counts_to_extract` entries beginning at `start_count`. Using
    # `counts_to_extract` as the end index instead would shrink (or empty)
    # the selection whenever start_count > 0.
    first = max(start_count, 0)
    selected = data[first:first + max(counts_to_extract, 0)]

    # Format data into strings
    data_str = []
    for entry in selected:
        if info_option == 1:
            # Option 1: Citation count and title
            entry_str = f"{entry['citation_count']}: {entry['title']}"
        elif info_option == 2:
            # Option 2: Cite, author, year, title
            entry_str = f"{entry['cite']}: {entry['title']} ({entry['authors']}, {entry['year']})"
        else:
            # Option 3: Cite, author, year, title, abstract
            entry_str = f"{entry['cite']}: {entry['title']} ({entry['authors']}, {entry['year']})\n{entry['abstract']}"
        data_str.append(entry_str)

    # Escape the text so characters such as '<' or '&' in titles/abstracts do
    # not corrupt the page, and turn newlines into visible line breaks.
    html_lines = [html.escape(line).replace("\n", "<br>") for line in data_str]
    # Declare the encoding so non-ASCII characters render correctly.
    html_content = '<meta charset="utf-8">\n' + "<br>".join(html_lines)
    html_file_path = os.path.join(os.getcwd(), "paper", "output"+str(info_option)+".html")
    with open(html_file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html_content)

    # Open the HTML file in a web browser. as_uri() yields a well-formed
    # file:// URL on every platform, including paths containing spaces.
    webbrowser.open_new_tab(Path(html_file_path).resolve().as_uri())

    return data_str

### file_for_zotero

In [104]:
def file_for_zotero(file):
    """
    File for Zotero

    Reads `paper/<file>.bib` (relative to the current working directory),
    drops the `url` and `note` fields from every entry, replaces each entry
    ID with a generated citation key and writes the result to
    `paper/filtered_<file>.bib`.

    Parameters:
    - file (str): Base name of the BibTeX file, without the `.bib` extension.

    Returns:
    - None

    Note: When a generated key repeats one already assigned, a warning is
    printed and that entry keeps its original ID.

    Example Usage:
    file_for_zotero("scopus")
    """
    file_path = os.path.join(os.getcwd(), "paper", file + ".bib")
    with open(file_path, encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    # Keys assigned so far. This set must exist before the loop and be
    # updated inside it, otherwise the duplicate check below cannot work.
    generated_keys = set()

    for entry in bib_database.entries:
        # Remove url and note fields
        entry.pop('url', None)
        entry.pop('note', None)

        title = entry.get('title', '')
        authors = entry.get('author', '').split(' and ')[0]  # Only consider the first author
        year = entry.get('year', '0000')  # Default year set to '0000' if not available
        cite = generate_citation_key(authors, year, title)  # Generate citation key

        # Check if the generated key already exists
        if cite in generated_keys:
            print(f"Warning: Duplicate citation key {cite} generated!")
        else:
            generated_keys.add(cite)
            entry['ID'] = cite

    # Create the full path for the new file
    new_file_path = os.path.join(os.getcwd(), "paper", 'filtered_'+file + ".bib")

    # Write the new BibTeX database to a file with utf-8 encoding
    writer = BibTexWriter()
    with open(new_file_path, 'w', encoding='utf-8') as new_bibtex_file:
        new_bibtex_file.write(writer.write(bib_database))

# Extract Information

In [99]:
# Base name (without the .bib extension) of the BibTeX export to process;
# extract_info and file_for_zotero read it from ./paper/<file>.bib.
file = "scopus (20)"

In [100]:
# Option 1: citation count and title for up to the first 150 ranked entries;
# written to paper/output1.html and opened in the browser.
titles = extract_info(file, info_option=1, counts_to_extract=150)

In [101]:
# Option 2: citation key, title, first author and year for up to the first
# 50 ranked entries; written to paper/output2.html and opened in the browser.
cite_titles = extract_info(file, info_option=2,  counts_to_extract=50)

In [102]:
# Option 3: as option 2 plus the abstract, for up to the first 10 ranked
# entries; written to paper/output3.html and opened in the browser.
cite_titles_abstract = extract_info(file, info_option=3,  counts_to_extract=10, start_count=0)

In [105]:
# Produce paper/filtered_<file>.bib via file_for_zotero (url/note fields
# dropped, entry IDs replaced by generated citation keys).
file_for_zotero(file)

