In [1]:
SparkContext

pyspark.context.SparkContext

In [2]:
!pwd

/home/almalinux/eda1-coursework/src/merizo_pipeline


In [3]:
!pip3 install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting torch==2.0.1+cpu
  Using cached https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp39-cp39-linux_x86_64.whl (195.4 MB)


In [7]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [6]:
sc.master

'local[4]'

In [7]:
os.getcwd()

'/home/almalinux/eda1-coursework/src/merizo_pipeline'

In [8]:
def read_dir(input_dir, output_dir):
    """
    Read file paths from HDFS using SparkContext.
    """
    # Use SparkContext to read files from HDFS
    # If files are located in a directory on HDFS, you can use textFile 
    file_rdd = sc.wholeTextFile(input_dir + "/AF-Q46839-F1-model_v4.pdb") 
    file_paths = file_rdd.collect()  # This retrieves the file paths as a list

    # Create a list of tuples with file path, id, and output directory
    return [(file_path, os.path.basename(file_path), output_dir) for file_path in file_paths]


In [9]:
input_dir = "/UP000000625_83333_ECOLI_v4/"

In [10]:
file_rdd = sc.wholeTextFiles(input_dir + "*.pdb")
file_paths_rdd = file_rdd.map(lambda x: (x[0], os.path.basename(x[0])))

In [11]:
from pyspark import SparkFiles
local_file_path = SparkFiles.get(input_dir + "/AF-Q46839-F1-model_v4.pdb")
local_file_path

'/UP000000625_83333_ECOLI_v4/AF-Q46839-F1-model_v4.pdb'

In [12]:
!ls

merizo_pipeline  pipeline_playground.ipynb  results_parser.py
pipeline_job.py  requirements.txt	    setup.py


In [54]:
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
import os

In [57]:
input_dir = "/UP000000625_83333_ECOLI_v4/"
file_rdd = sc.binaryFiles(input_dir + "*.pdb")
file_rdd = file_rdd.sample(withReplacement=False, fraction=0.005)
file_content_rdd = file_rdd.map(lambda x: (os.path.basename(x[0]), x[1]))

In [65]:
def delete_local_file(file_path):
    try:
        os.remove(file_path)
        print(f"{file_path} local file has been deleted.")
    except FileNotFoundError:
        print(f"{file_path} does not exist.")
    except PermissionError:
        print(f"Permission denied to delete {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [66]:
def upload_file_to_hdfs(local_file_path, hdfs_file_path):
    hdfs_put_cmd = ['hdfs', 'dfs', '-put', local_file_path, hdfs_file_path]
    print(f'STEP 3: UPLOADING ANALYSIS OUTPUT TO HDFS: {" ".join(hdfs_put_cmd)}')
    p = Popen(hdfs_put_cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # Decode the byte output to string
    print("Output:")
    print(out.decode("utf-8"))  # Decode and print the standard output
    
    if err:
        print("Error:")
        print(err.decode("utf-8"))  # Decode and print the standard  

In [67]:
def run_parser(input_file):
    """
    Run the results_parser.py over the hhr file to produce the output summary
    """
    search_file = input_file+"_search.tsv"
    print("search_file: ", search_file)
    cmd = ['python3', './results_parser.py', search_file]
    print(f'STEP 2: RUNNING PARSER: {" ".join(cmd)}')
    p = Popen(cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # Decode the byte output to string
    print("Output:")
    print(out.decode("utf-8"))  # Decode and print the standard output
        
    if err:
        print("Error:")
        print(err.decode("utf-8"))  # Decode and print the standard error

In [68]:
def run_merizo_search(file_name, file_content):
    print(f"File Name: {file_name}")
    # Create a temporary file to hold the content
    with NamedTemporaryFile(delete=True, mode='wb') as temp_file:
        temp_file.write(file_content)
        temp_file_path = temp_file.name
        cmd = ['python3',
           '/home/almalinux/merizo_search/merizo_search/merizo.py',
           'easy-search',
           temp_file_path,
           '/home/almalinux/data/cath-4.3-foldclassdb',
           file_name,
           'tmp',
           '--iterate',
           '--output_headers',
           '-d',
           'cpu',
           '--threads',
           '1'
          ]
        print(f'STEP 1: RUNNING MERIZO: {" ".join(cmd)}')
        p = Popen(cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        # Decode the byte output to string
        print("Output:")
        print(out.decode("utf-8"))  # Decode and print the standard output
        
        if err:
            print("Error:")
            print(err.decode("utf-8"))  # Decode and print the standard 


In [69]:
def upload_analysis_outputs_to_hdfs(file_name):
    # upload anaylsis output files to hdfs and clean local files
    local_files_paths = [ file_name + '_segment.tsv', file_name + '_search.tsv', file_name + '.parsed']
    hdfs_file_path = '/analysis_outputs/'
    for local_file_path in local_files_paths:
        upload_file_to_hdfs(local_file_path, hdfs_file_path)
        delete_local_file(local_file_path)

In [70]:
def pipeline(file_tuple):
    file_name, file_content = file_tuple
    # STEP 1
    run_merizo_search(file_name, file_content)
    # STEP 2
    run_parser(file_name)
    # STEP 3
    upload_analysis_outputs_to_hdfs(file_name)

In [71]:
file_content_rdd.map(pipeline).collect()

File Name: AF-P77374-F1-model_v4.pdb
STEP 1: RUNNING MERIZO: python3 /home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmpfiifo97y /home/almalinux/data/cath-4.3-foldclassdb AF-P77374-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 1
File Name: AF-P25740-F1-model_v4.pdb
STEP 1: RUNNING MERIZO: python3 /home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmp2xt8zzle /home/almalinux/data/cath-4.3-foldclassdb AF-P25740-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 1
File Name: AF-P05052-F1-model_v4.pdb
STEP 1: RUNNING MERIZO: python3 /home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmpq3j445lk /home/almalinux/data/cath-4.3-foldclassdb AF-P05052-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 1
Output:

Error:
  backends.update(_get_backends("networkx.backends"))
2024-12-07 14:08:59,181 | INFO | Starting easy-search with command: 

/home/almalinux/merizo_search/merizo_search/merizo

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]