In [2]:
import os
from collections import Counter
from pathlib import Path
import pandas as pd

In [3]:
# Root of the raw-data area and the sub-folders under it that should be scanned.
raw_data_path = '/courses/DS5500.202610/data/team1/raw_data/'
raw_data_dir_names = ['ethan', 'sai']
# Plain string concatenation: this relies on the trailing slash in raw_data_path.
raw_data_dirs = [f"{raw_data_path}{name}" for name in raw_data_dir_names]

In [4]:
# Walk every raw-data directory recursively and tally regular files by their
# lower-cased extension ("no_extension" when there is none), remembering the
# path of each file under the same key.
counter = Counter()
paths = {}
# The loop variable is deliberately not called `dir`: at notebook top level that
# name would shadow the builtin dir() for every later cell in the kernel.
for raw_dir in raw_data_dirs:
    for file_path in Path(raw_dir).rglob("*"):
        if not file_path.is_file():
            continue
        doc_format = file_path.suffix.lower() or "no_extension"
        counter[doc_format] += 1
        paths.setdefault(doc_format, []).append(str(file_path))

In [5]:
# One row per document format with the number of files counted for it.
documents_summary_df = pd.DataFrame(
    list(counter.items()),
    columns=["Document Format", "Document Count"],
)
documents_summary_df

Unnamed: 0,Document Format,Document Count
0,.ris,3
1,.html,43
2,.pdf,253
3,no_extension,2
4,.docx,1


In [7]:
# Install pdfkit using the pip of the interpreter this kernel runs on (sys.executable).
# NOTE(review): no cell visible here imports pdfkit — confirm it is still needed.
import sys
!{sys.executable} -m pip install pdfkit


Defaulting to user installation because normal site-packages is not writeable


In [8]:
# PyMuPDF is the package behind the `fitz` import used by the text-extraction cell.
!{sys.executable} -m pip install PyMuPDF

Defaulting to user installation because normal site-packages is not writeable


In [9]:
# NOTE(review): no cell visible here imports mammoth (DOCX files are read through
# `from docx import Document`) — confirm this install is still needed.
!{sys.executable} -m pip install mammoth

Defaulting to user installation because normal site-packages is not writeable


In [10]:
# NOTE(review): no cell visible here imports camelot — confirm this install is
# still needed (presumably intended for table extraction; verify).
!{sys.executable} -m pip install "camelot-py[cv]"

Defaulting to user installation because normal site-packages is not writeable


In [15]:
from pathlib import Path
from collections import Counter

raw_data_path = '/courses/DS5500.202610/data/team1/raw_data/'
raw_data_dir_names = ['ethan', 'sai']
raw_data_dirs = [raw_data_path + name for name in raw_data_dir_names]

# Output directories in your home space
processed_text_dir = Path('/home/anbarasan.p/processed_text')
processed_text_dir.mkdir(parents=True, exist_ok=True)

processed_tables_dir = Path('/home/anbarasan.p/processed_tables')
processed_tables_dir.mkdir(parents=True, exist_ok=True)

# Tally regular files by lower-cased extension and collect their paths per extension.
counter = Counter()
paths = {}

# The loop variable is deliberately not called `dir`: at notebook top level that
# name would shadow the builtin dir() for every later cell in the kernel.
for raw_dir in raw_data_dirs:
    for file_path in Path(raw_dir).rglob("*"):
        if not file_path.is_file():
            continue
        doc_format = file_path.suffix.lower() or "no_extension"
        counter[doc_format] += 1
        paths.setdefault(doc_format, []).append(str(file_path))

print(counter)


Counter({'.pdf': 253, '.html': 43, '.ris': 3, 'no_extension': 2, '.docx': 1})


In [20]:
from pathlib import Path
from joblib import Parallel, delayed
from bs4 import BeautifulSoup
from docx import Document
import fitz  # PyMuPDF
import warnings
import contextlib
import traceback
import os

# config
# Both output folders live under one base directory.
processed_base = Path('/home/anbarasan.p')
processed_text_dir = processed_base.joinpath('processed_text')
processed_images_dir = processed_base.joinpath('processed_images')

# Create the folders up front; folders that already exist are left alone.
for output_dir in (processed_text_dir, processed_images_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Assuming `paths` is a dict like {'.pdf': [...], '.html': [...], '.docx': [...], 'no_extension': [...]}
# Any other key in `paths` is left out of this extraction run.
pdf_files = paths.get('.pdf', [])
non_pdf_files = [
    path
    for key in ('.html', '.docx', 'no_extension')
    for path in paths.get(key, [])
]
all_files = pdf_files + non_pdf_files
print(f"Found {len(all_files)} total files to process ({len(pdf_files)} PDFs, {len(non_pdf_files)} non-PDFs).\n")


# extraction function
def _read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order."""
    # The devnull handle is opened in a `with` so it is closed again; handing a
    # bare open() to redirect_stderr leaves the handle to the garbage collector.
    with open(os.devnull, "w") as devnull, warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # NOTE(review): redirect_stderr only swaps Python's sys.stderr. The saved
        # notebook output still shows a "MuPDF error: ..." line, so MuPDF's own
        # messages are not silenced here. PyMuPDF's TOOLS.mupdf_display_errors
        # looks like the switch for that — TODO confirm.
        with contextlib.redirect_stderr(devnull):
            with fitz.open(file_path) as doc:
                return "".join(page.get_text("text") for page in doc)


def _read_html(file_path):
    """Return the text content of an HTML file, text nodes joined by newlines."""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        soup = BeautifulSoup(f, "html.parser")
        return soup.get_text(separator="\n", strip=True)


def _read_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line."""
    return "\n".join(para.text for para in Document(file_path).paragraphs)


def _read_plain_text(file_path):
    """Return the content of an extension-less file, decoded as UTF-8 (bad bytes dropped)."""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


# Lower-cased suffix -> (reader, label used in the "Skipped ..." message on failure).
# The empty suffix covers files without an extension.
_READERS = {
    ".pdf": (_read_pdf, "PDF read failed"),
    ".html": (_read_html, "HTML parse failed"),
    ".docx": (_read_docx, "DOCX read failed"),
    "": (_read_plain_text, "Text read failed"),
}


def extract_text(file_path):
    """Extract the text of one document and write it under ``processed_text_dir``.

    Supported inputs are ``.pdf``, ``.html``, ``.docx`` and files without an
    extension. On success the stripped text is written to ``<stem>.txt`` in
    ``processed_text_dir`` and a placeholder file ``<stem>_images.txt`` is
    written to ``processed_images_dir``.

    Args:
        file_path: Path (str or Path) of the document to read.

    Returns:
        dict with keys ``file`` (the stem), ``status`` (``"success"`` or
        ``"error"``) and ``type`` (lower-cased suffix, or ``"no_extension"``);
        on failure an ``error`` key holds the reason. Failures are reported
        through the return value and a printed "Skipped ..." line; no
        exception from reading or writing is propagated.
    """
    file_path = Path(file_path)
    name = file_path.stem
    suffix = file_path.suffix.lower()
    # NOTE(review): outputs are keyed by stem only, so two inputs that share a
    # stem (same name in two source folders, or a.pdf next to a.html) write to
    # the same path and the later write replaces the earlier one. This may be
    # why the saved run reports more successes than text files found later —
    # TODO confirm and decide on a naming scheme.
    text_out = processed_text_dir / f"{name}.txt"
    image_out = processed_images_dir / f"{name}_images.txt"

    result = {"file": name, "status": "success", "type": suffix or "no_extension"}

    def fail(message, error):
        # Single place that turns a failure into the printed line + error result.
        print(f" Skipped {name}: {message}")
        result["status"] = "error"
        result["error"] = error
        return result

    # Skip unreadable or missing files
    if not file_path.exists() or not os.access(file_path, os.R_OK):
        return fail("File not readable or missing.", "Permission denied or file not readable")

    if suffix not in _READERS:
        return fail(f"Unsupported file type ({suffix}).", f"Unsupported type: {suffix}")

    reader, failure_label = _READERS[suffix]
    try:
        text = reader(file_path)
    except Exception as e:
        return fail(f"{failure_label} ({e}).", str(e))

    # Write extracted text and placeholder
    try:
        text_out.write_text(text.strip(), encoding="utf-8")
        image_out.write_text("[IMAGE PLACEHOLDER]\n", encoding="utf-8")
    except Exception as e:
        return fail(f"Unexpected error ({e}).", str(e))

    return result


# parallel execution
# Each file is handed to extract_text in a loky worker process; results come
# back as one dict per input file.
n_jobs = 2
results = Parallel(n_jobs=n_jobs, backend="loky")(
    delayed(extract_text)(file_path)
    for file_path in all_files
)

# summary
# Anything that is not a success counts as failed/skipped.
success = sum(r["status"] == "success" for r in results)
errors = len(all_files) - success

print("\n Extraction complete — results summary:")
print(f" {success} succeeded, {errors} failed/skipped")
print(f"Outputs saved under:\n- Text: {processed_text_dir}\n- Images: {processed_images_dir}")


Found 299 total files to process (253 PDFs, 46 non-PDFs).

MuPDF error: format error: object out of range (640 0 R); xref size 503


 Extraction complete — results summary:
 256 succeeded, 43 failed/skipped
Outputs saved under:
- Text: /home/anbarasan.p/processed_text
- Images: /home/anbarasan.p/processed_images


In [21]:
pip install rispy --user

Note: you may need to restart the kernel to use updated packages.


In [23]:
import rispy
from pathlib import Path
import pandas as pd

# NOTE(review): only the 'sai' folder is searched here, while the earlier file
# inventory also covers 'ethan' — confirm no .ris files live there.
ris_dir = Path("/courses/DS5500.202610/data/team1/raw_data/sai")  # adjust
ris_files = [*ris_dir.rglob("*.ris")]

# Flatten every entry of every RIS file into one record per reference, tagged
# with the name of the file it came from.
records = []
for ris_file in ris_files:
    with open(ris_file, "r", encoding="utf-8", errors="ignore") as f:
        records.extend(
            {
                "file": ris_file.name,
                "title": entry.get("title"),
                "authors": entry.get("authors"),
                "year": entry.get("year"),
                "doi": entry.get("doi"),
            }
            for entry in rispy.load(f)
        )

ris_df = pd.DataFrame(records)
ris_df.head()


Unnamed: 0,file,title,authors,year,doi
0,phd.ris,Regional Registration of Whole Slide Image Sta...,"[Paknezhad, Mahsa, Loh, Sheng Yang Michael, Ch...",2020,
1,phd.ris,Deep Learning on Multimodal Chemical and Whole...,"[Haque, Md Inzamam Ul, Mukherjee, Debangshu, S...",2022,
2,phd.ris,3D non-rigid registration by gradient descent ...,"[Cachier, P., Pennec, X.]",2000,10.1109/MMBIA.2000.852376
3,phd.ris,Chemo-informatic strategy for imaging mass spe...,"[Veselkov, Kirill A., Mirnezami, Reza, Strittm...",2014,10.1073/pnas.1310524111
4,phd.ris,A Co-registration Pipeline for Multimodal MALD...,"[Nikitina, Arina, Huang, Danning, Li, Li, Pete...",2020,10.1021/jasms.9b00094


In [26]:
# Number of distinct titles in the RIS records (nunique() skips missing values by default).
ris_df['title'].nunique()

145

In [27]:
pip install fuzzywuzzy[speedup] --user


Note: you may need to restart the kernel to use updated packages.


In [34]:
from pathlib import Path
import pandas as pd
from rapidfuzz import process, fuzz
import re

# paths
# Input: folder that is globbed for extracted *.txt files further down.
processed_text_dir = Path("/home/anbarasan.p/processed_text")  # folder with extracted text files
# Output: CSV that the filename-to-RIS-metadata matches are written to.
mapping_output = Path("/home/anbarasan.p/matched_metadata.csv")

# cleaning function
def clean_text(s):
    """Return a normalised form of ``s`` for fuzzy comparison.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``. Otherwise
    the value is converted to ``str`` and lower-cased, every character other
    than ``a-z``, ``0-9`` and whitespace is deleted (not replaced by a space),
    runs of whitespace collapse to one space, and the ends are trimmed.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    kept = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', kept).strip()

# cleanup RIS data
# Normalise each title, then keep only the first record per normalised title.
ris_df["clean_title"] = ris_df["title"].map(clean_text)
before = len(ris_df)
ris_df = ris_df.drop_duplicates(subset="clean_title").reset_index(drop=True)
after = len(ris_df)
print(f" Removed {before - after} duplicate titles from RIS data ({after} unique titles remain).")

# prep data
# Titles are matched by list position, which lines up with ris_df's reset index.
ris_titles = list(ris_df["clean_title"])
text_files = [*processed_text_dir.glob("*.txt")]
print(f"Found {len(text_files)} extracted text files.\n")

records = []

# fuzzy matching
# Minimum token_sort_ratio score (0-100) for a filename/title pair to be kept.
MIN_SIMILARITY = 70

for text_file in text_files:
    fname = text_file.stem
    clean_fname = clean_text(fname)

    # Match filename to closest RIS title.
    # extractOne returns None instead of a (choice, score, index) tuple when it
    # has nothing to return (for example an empty title list), so check before
    # unpacking rather than crashing with a TypeError.
    best = process.extractOne(clean_fname, ris_titles, scorer=fuzz.token_sort_ratio)
    if best is None:
        continue
    match, score, idx = best

    # Keep only good matches (≥70 similarity)
    if score < MIN_SIMILARITY:
        continue

    matched_row = ris_df.iloc[idx]
    authors = matched_row.get("authors")
    records.append({
        "file": text_file.name,
        "matched_title": matched_row["title"],
        "similarity": score,
        "doi": matched_row.get("doi"),
        "year": matched_row.get("year"),
        # Author lists are flattened to one "; "-separated string; other values pass through.
        "authors": "; ".join(authors) if isinstance(authors, list) else authors
    })

# export results
# Naming the columns keeps the frame's shape and the CSV header stable even when
# nothing matched and `records` is empty.
mappings_df = pd.DataFrame(
    records,
    columns=["file", "matched_title", "similarity", "doi", "year", "authors"],
)
mappings_df.to_csv(mapping_output, index=False)

# summary
print("\n Mapping complete!")
print(f"{len(mappings_df)} matched files (≥70 similarity)")
print(f"Results saved to: {mapping_output}")

mappings_df.head()


 Removed 0 duplicate titles from RIS data (145 unique titles remain).
Found 244 extracted text files.


 Mapping complete!
122 matched files (≥70 similarity)
Results saved to: /home/anbarasan.p/matched_metadata.csv


Unnamed: 0,file,matched_title,similarity,doi,year,authors
0,4.0 Image Gradients and Gradient Filtering.txt,4.0 Image Gradients and Gradient Filtering,100.0,,,
1,Abdelmoula et al. - 2014 - Automatic Registrat...,Automatic Registration of Mass Spectrometry Im...,74.683544,10.1021/ac500148a,2014.0,"Abdelmoula, Walid M.; Carreira, Ricardo J.; Sh..."
2,Alexandrov - 2012 - MALDI imaging mass spectro...,MALDI imaging mass spectrometry: statistical d...,70.886076,10.1186/1471-2105-13-S16-S11,2012.0,"Alexandrov, Theodore"
3,Alexandrov - 2012 - MALDI imaging mass spectro...,MALDI imaging mass spectrometry: statistical d...,92.156863,10.1186/1471-2105-13-S16-S11,2012.0,"Alexandrov, Theodore"
4,Alexandrov et al. - 2023 - Enablers and challe...,"Enablers and challenges of spatial omics, a me...",86.419753,10.15252/msb.202110571,2023.0,"Alexandrov, Theodore; Saez‐Rodriguez, Julio; S..."
