In [5]:
!pip install pdf2image

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [36]:
# Rename NVIDIA repository source files and temporarily disable them  
!mv /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/cuda.list.bak  
!mv /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/nvidia-ml.list.bak  

# Update apt cache (official repositories only)  
!apt update -o Acquire::AllowInsecureRepositories=true

mv: cannot stat '/etc/apt/sources.list.d/cuda.list': No such file or directory
mv: cannot stat '/etc/apt/sources.list.d/nvidia-ml.list': No such file or directory
Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done
Building dependency tree       
Reading state information... Done
119 packages can be upgraded. Run 'apt list --upgradable' to see them.


In [37]:
# 1. View Jupyter’s current working directory  
print(f"Jupyter’s current working directory: {os.getcwd()}")  

# 2. Verify whether the target PDF file actually exists (critical!)  
target_pdf = "/mnt/ONCOBOX/geneplus/workspace/chenly/cbioprotal_download_km/Columbia, Nat Med. 2019/KM_Plot__Progression_Free_(months).pdf"  
print(f"Does the target PDF exist?: {os.path.exists(target_pdf)}")  

# 3. List all files in that folder (to confirm whether the PDF filename is correct)  
folder_path = "/mnt/ONCOBOX/geneplus/workspace/chenly/cbioprotal_download_km/Columbia, Nat Med. 2019/"  
print(f"\nFiles in {folder_path}:")  
!ls -lh "{folder_path}"

Jupyter’s current working directory: /mnt/ONCOBOX/geneplus/workspace/chenly/KM_DATA
Does the target PDF exist?: True

Files in /mnt/ONCOBOX/geneplus/workspace/chenly/cbioprotal_download_km/Columbia, Nat Med. 2019/:
total 200K
-rwx------ 1 1001 1001 165K Jan 20 06:01 'KM_Plot__Progression_Free_(months).pdf'
-rw-r--r-- 1 root root  27K Jan 20 07:03 'KM_Plot__Progression_Free_(months).png'
-rwx------ 1 1001 1001 2.3K Jan 20 06:01 'KM_Plot__Progression_Free_(months).txt'


In [34]:
import os
import shutil
import subprocess

In [41]:
# Directory scanned by the batch conversion; it is a relative path, so it is
# resolved against the kernel's current working directory.
root_dir = "cbioprotal_download_km"

# Verify the pdftoppm path.
# shutil.which searches PATH directly (no shell is spawned and no pipe is left
# open). It returns None when the tool is missing, so fall back to "" to keep
# the variable a plain string.
pdftoppm_path = shutil.which("pdftoppm") or ""
print(f" pdftoppm path: {pdftoppm_path}")

 pdftoppm path: /usr/bin/pdftoppm


In [42]:
def _render_first_page(pdf_path, pdftoppm, dpi=300):
    """Render page 1 of ``pdf_path`` to a PNG next to it and return the PNG path.

    The output is written to ``<pdf path without extension>.png``.

    Args:
        pdf_path: Path of the PDF to convert.
        pdftoppm: Path of the ``pdftoppm`` executable.
        dpi: Resolution passed to ``pdftoppm -r``.

    Returns:
        Path of the generated PNG file.

    Raises:
        RuntimeError: If pdftoppm exits with a non-zero code or no PNG exists afterwards.
        OSError: If the ``pdftoppm`` executable cannot be started.
    """
    # splitext strips the real extension whatever its case (".pdf", ".PDF", ...)
    # and never touches a ".pdf" that merely occurs inside the file name.
    output_root = os.path.splitext(pdf_path)[0]
    png_path = output_root + ".png"

    # Argument list instead of a shell string: quotes, "$" or backticks in a
    # file name can neither break nor inject into the command.
    # -singlefile makes pdftoppm write exactly "<output_root>.png"; without it
    # the page suffix is zero-padded by page count ("-1", "-01", "-001", ...).
    result = subprocess.run(
        [
            pdftoppm, "-png", "-r", str(dpi), "-f", "1", "-l", "1",
            "-singlefile", pdf_path, output_root,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if result.returncode != 0:
        detail = result.stderr.strip()
        raise RuntimeError(
            f"Command execution failed, exit code: {result.returncode}"
            + (f" ({detail})" if detail else "")
        )
    if not os.path.exists(png_path):
        raise RuntimeError("PNG file was not generated")
    return png_path


# Initialize statistics
success_count = 0
fail_count = 0
failed_files = []

# Iterate through all folders (sorted so the report order is deterministic)
for folder_name in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, folder_name)
    if not os.path.isdir(folder_path):
        print(f"Skipping non-directory: {folder_path}")
        continue

    print(f"\nProcessing folder: {folder_path}")

    # Iterate through PDF files (extension matched case-insensitively)
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, file_name)

        # A listed entry can still be unreachable (e.g. a dangling symlink)
        if not os.path.exists(pdf_path):
            error_msg = f"File does not exist: {pdf_path}"
            print(error_msg)
            fail_count += 1
            failed_files.append(error_msg)
            continue

        try:
            expected_png = _render_first_page(pdf_path, pdftoppm_path)
        except Exception as e:
            # Record the failure and carry on with the remaining files
            error_msg = f" Conversion failed for {pdf_path}: {str(e)}"
            print(error_msg)
            fail_count += 1
            failed_files.append(error_msg)
        else:
            print(f" Conversion succeeded: {pdf_path} → {expected_png}")
            success_count += 1

# Summary of Results
print("\n" + "="*60)
print(" Batch Conversion Summary:")
print(f"   Successfully converted: {success_count} files")
print(f"   Failed conversions: {fail_count} files")
if failed_files:
    print("\n List of Failed Files:")
    for idx, msg in enumerate(failed_files, 1):
        print(f"   {idx}. {msg}")
else:
    print("\n All PDF files were successfully converted to PNG!")


Processing folder: cbioprotal_download_km/Columbia, Nat Med. 2019

Processing folder: cbioprotal_download_km/GATCI, Cell Reports 2024

Processing folder: cbioprotal_download_km/Mayo Clinic, Clin Cancer Res 2020

Processing folder: cbioprotal_download_km/MSK 2025

Processing folder: cbioprotal_download_km/MSK, Cancer Discov 2024

Processing folder: cbioprotal_download_km/MSK, J Clin Onco 2013

Processing folder: cbioprotal_download_km/MSK, Nat Genet 2013

Processing folder: cbioprotal_download_km/MSK, Nat Genet 2016_dfs

Processing folder: cbioprotal_download_km/MSK, Nat Genet 2016_overall

Processing folder: cbioprotal_download_km/MSK, Neuro Oncol 2017

Processing folder: cbioprotal_download_km/National University of Singapore, Nat Genet 2012

Processing folder: cbioprotal_download_km/TARGET GDC, 2025

Processing folder: cbioprotal_download_km/TCGA, Firehose Legacy

Processing folder: cbioprotal_download_km/TCGA, GDC

Processing folder: cbioprotal_download_km/TCGA, Nature 2014

Proces