In [63]:
import os
from collections import Counter
from pathlib import Path
import pandas as pd

In [64]:
# Location of the raw data and the sub-directories under it that get scanned.
# raw_data_path ends with "/" so each name can be appended to it directly.
raw_data_path = '../raw_data/'
raw_data_dir_names = ['ethan', 'sai']
raw_data_dirs = [f"{raw_data_path}{name}" for name in raw_data_dir_names]

In [65]:
# Recursively walk every raw-data directory and bucket the files by extension.
#   counter: extension -> number of files seen
#   paths:   extension -> list of Path objects with that extension
# Extensions are lower-cased; files without one are grouped under "no_extension".
counter = Counter()
paths = {}
# The loop variables are deliberately not called `dir` / `file`: at notebook
# (module) scope a variable named `dir` would replace the builtin dir() for the
# rest of the kernel session.
for raw_data_dir in raw_data_dirs:
    for file_path in Path(raw_data_dir).rglob("*"):
        if not file_path.is_file():
            continue
        doc_format = file_path.suffix.lower() if file_path.suffix else "no_extension"
        counter[doc_format] += 1
        paths.setdefault(doc_format, []).append(file_path)

In [66]:
# One row per file extension with the number of files found for it.
documents_summary_df = pd.DataFrame(
    {
        "Document Format": list(counter.keys()),
        "Document Count": list(counter.values()),
    }
)
documents_summary_df

Unnamed: 0,Document Format,Document Count
0,.ris,2
1,.html,43
2,.pdf,253
3,no_extension,2
4,.docx,1


In [67]:
# Extensions counted towards the document total; '.ris' is listed separately
# as the metadata extension and is excluded from the count.
valid_extensions = ['.html','.pdf','.docx']
metadata_extension = ['.ris']
is_valid_format = documents_summary_df["Document Format"].isin(valid_extensions)
total_documents = documents_summary_df.loc[is_valid_format, "Document Count"].sum()

print(f"Total Document Count: {total_documents}")

Total Document Count: 297


In [68]:
import re
import pandas as pd

# Output column -> two-letter RIS tag whose value fills that column.
RIS_TAG_BY_COLUMN = {
    "Title": "TI",
    "URL": "UR",
    "L1": "L1",
    "L2": "L2",
}


def _ris_field(publication, tag):
    """Return the value of the first `tag` line in one RIS record, or "" if absent.

    RIS lines have the form ``TI  - Some value``. Only the leading
    ``<tag>  - `` prefix is removed, so a value that itself contains " - "
    (common in titles) is returned whole instead of being cut at its last
    " - ". Surrounding whitespace is stripped.

    Parameters
    ----------
    publication : str
        Text of a single RIS record (possibly multi-line).
    tag : str
        Two-letter RIS tag, e.g. "TI".

    Returns
    -------
    str
        The tag's value, or "" when the record has no such line. If the tag
        occurs several times only the first occurrence is used.
    """
    # [ \t] rather than \s so the match can never run across a line break.
    pattern = rf"^{re.escape(tag)}[ \t]*-[ \t]?(.*)$"
    match = re.search(pattern, publication, re.MULTILINE)
    return match.group(1).strip() if match else ""


# Column-oriented accumulator: one list per output column, one entry per record.
metadata_dict = {column: [] for column in RIS_TAG_BY_COLUMN}
for ris_path in paths[".ris"]:
    with open(ris_path, "r", encoding="utf-8") as ris_file:
        metadata_str = ris_file.read()
    # Records are separated by a blank line.
    for publication in metadata_str.split("\n\n"):
        # Blank chunks (e.g. from a trailing blank line at end of file) are not
        # records; skipping them avoids all-empty rows in the table.
        if not publication.strip():
            continue
        for column, tag in RIS_TAG_BY_COLUMN.items():
            metadata_dict[column].append(_ris_field(publication, tag))
metadata_df = pd.DataFrame(metadata_dict)
metadata_df.head()

Unnamed: 0,Title,URL,L1,L2
0,Spatial Autocorrelation in Mass Spectrometry I...,https://doi.org/10.1021/acs.analchem.6b00672,https://pubs.acs.org/doi/pdf/10.1021/acs.analc...,
1,Probabilistic Segmentation of Mass Spectrometr...,https://www.sciencedirect.com/science/article/...,http://www.mcponline.org/article/S153594762033...,https://www-sciencedirect-com.ezproxy.neu.edu/...
2,A noise-robust deep clustering of biomolecular...,https://doi.org/10.1093/bioinformatics/btad067,https://academic.oup.com/bioinformatics/articl...,https://academic.oup.com/bioinformatics/articl...
3,Prioritization of m/z-Values in Mass Spectrome...,https://doi.org/10.1021/acs.analchem.9b05764,https://pubs.acs.org/doi/pdf/10.1021/acs.analc...,
4,Diagnostic biomarkers in knee osteoarthritis: ...,https://doi.org/10.1177/10225536241267027,https://journals.sagepub.com/doi/pdf/10.1177/1...,


In [97]:
# Write the extracted metadata table to the working directory.
# The DataFrame index is written as the first (unnamed) column.
publication_urls_csv = Path("publication_urls.csv")
metadata_df.to_csv(publication_urls_csv)

In [85]:
def get_suffix(path):
    """Return the file name of `path` without its final extension.

    Despite the function's name, the value returned is ``path.stem``
    (e.g. ``"report"`` for ``report.pdf``), not the suffix. `path` must
    expose a ``.stem`` attribute, as ``pathlib.Path`` objects do.
    """
    stem = path.stem
    return stem

In [94]:
# Unique file stems (file name without extension) of every PDF, HTML and DOCX
# document found during the scan.
titles_from_path = {
    get_suffix(doc_path)
    for extension in ('.pdf', '.html', '.docx')
    for doc_path in paths[extension]
}

In [95]:
# Unique values of the "Title" column extracted from the .ris metadata.
titles_from_metadata = set(metadata_df['Title'].tolist())

In [96]:
# Titles appearing both as a file stem and in the metadata; only exact,
# character-for-character string matches are counted.
titles_from_path & titles_from_metadata

{'4.0 Image Gradients and Gradient Filtering'}