In [422]:
SparkContext

pyspark.context.SparkContext

In [423]:
!pwd

/home/almalinux/eda1-coursework/src/merizo_pipeline


In [424]:
!pip3 install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting torch==2.0.1+cpu
  Using cached https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp39-cp39-linux_x86_64.whl (195.4 MB)


In [425]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [426]:
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [427]:
sc.master

'local[4]'

In [428]:
os.getcwd()

'/home/almalinux/eda1-coursework/src/merizo_pipeline'

In [429]:
def read_dir(input_dir, output_dir):
    """
    Read file paths from HDFS using SparkContext.
    """
    # Use SparkContext to read files from HDFS
    # If files are located in a directory on HDFS, you can use textFile 
    file_rdd = sc.wholeTextFile(input_dir + "/AF-Q46839-F1-model_v4.pdb") 
    file_paths = file_rdd.collect()  # This retrieves the file paths as a list

    # Create a list of tuples with file path, id, and output directory
    return [(file_path, os.path.basename(file_path), output_dir) for file_path in file_paths]


In [430]:
input_dir = "/UP000000625_83333_ECOLI_v4/"

In [431]:
file_rdd = sc.wholeTextFiles(input_dir + "*.pdb")
file_paths_rdd = file_rdd.map(lambda x: (x[0], os.path.basename(x[0])))

In [432]:
from pyspark import SparkFiles
local_file_path = SparkFiles.get(input_dir + "/AF-Q46839-F1-model_v4.pdb")
local_file_path

'/UP000000625_83333_ECOLI_v4/AF-Q46839-F1-model_v4.pdb'

In [433]:
!ls

AF-P00811-F1-model_v4.pdb.parsed  remove_requirements.txt
AF-P37095-F1-model_v4.pdb.parsed  requirements.txt
AF-P39368-F1-model_v4.pdb.parsed  results_parser.py
AF-P76079-F1-model_v4.pdb.parsed  setup.py
merizo_pipeline			  test.bin
pipeline_job.py			  test_requirements.txt
pipeline_playground.ipynb


In [434]:
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
import os

In [435]:
input_dir = "/UP000000625_83333_ECOLI_v4/"
# input_dir = "/"
# Failed example : AF-P0DSE5-F1-model_v4.pdb
# Success example: AF-P75975-F1-model_v4.pdb
file_rdd = sc.binaryFiles(input_dir + "*.pdb")
file_rdd = file_rdd.sample(withReplacement=False, fraction=0.001)
file_content_rdd = file_rdd.map(lambda x: (os.path.basename(x[0]), x[1]))

In [436]:
def delete_local_file(file_path):
    try:
        os.remove(file_path)
        print(f"{file_path} local file has been deleted.")
    except FileNotFoundError:
        print(f"{file_path} does not exist.")
    except PermissionError:
        print(f"Permission denied to delete {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [437]:
def upload_file_to_hdfs(local_file_path, hdfs_file_path):
    hdfs_put_cmd = ['hdfs', 'dfs', '-put', local_file_path, hdfs_file_path]
    print(f'STEP 3: UPLOADING ANALYSIS OUTPUT TO HDFS: {" ".join(hdfs_put_cmd)}')
    p = Popen(hdfs_put_cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # Decode the byte output to string
    print("Output:")
    print(out.decode("utf-8"))  # Decode and print the standard output
    
    if err:
        print("Error:")
        print(err.decode("utf-8"))  # Decode and print the standard  

In [438]:
def run_parser(input_file):
    """
    Run the results_parser.py over the hhr file to produce the output summary
    """
    search_file = input_file+"_search.tsv"
    print("search_file: ", search_file)
    cmd = ['python3', './results_parser.py', search_file]
    print(f'STEP 2: RUNNING PARSER: {" ".join(cmd)}')
    p = Popen(cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # Decode the byte output to string
    print("Output:")
    print(out.decode("utf-8"))  # Decode and print the standard output
        
    if err:
        print("Error:")
        print(err.decode("utf-8"))  # Decode and print the standard error

In [439]:
def run_merizo_search(file_name, file_content):
    print(f"File Name: {file_name}")
    # Create a temporary file to hold the content
    with NamedTemporaryFile(delete=True, mode='wb') as temp_file:
        temp_file.write(file_content)
        temp_file_path = temp_file.name
        cmd = ['python3',
           '/home/almalinux/merizo_search/merizo_search/merizo.py',
           'easy-search',
           temp_file_path,
           '/home/almalinux/data/cath-4.3-foldclassdb',
           file_name,
           'tmp',
           '--iterate',
           '--output_headers',
           '-d',
           'cpu',
           '--threads',
           '2'
          ]
        print(f'STEP 1: RUNNING MERIZO: {" ".join(cmd)}')
        p = Popen(cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        # Decode the byte output to string
        print("Output:")
        print(out.decode("utf-8"))  # Decode and print the standard output
        
        if err:
            print("Error:")
            print(err.decode("utf-8"))  # Decode and print the standard 


In [406]:
from fs.memoryfs import MemoryFS

def run_merizo_search_vfs(file_name, file_content):
    print(f"File Name: {file_name}")
    # Initialize an in-memory virtual filesystem
    virtual_fs = MemoryFS()
    # Create a temporary file to hold the content
    with virtual_fs.open(file_name, mode='wb') as virtual_file:
        virtual_file.write(file_content)
        if virtual_fs.exists(file_name):
            print(f"File '{file_name}' successfully written to the virtual filesystem.")
            with open(file_name, 'r') as pdb_file:
                print(pdb_file.readlines())
        # cmd = ['python3',
        #    '/home/almalinux/merizo_search/merizo_search/merizo.py',
        #    'easy-search',
        #    file_name,
        #    '/home/almalinux/data/cath-4.3-foldclassdb',
        #    file_name,
        #    'tmp',
        #    '--iterate',
        #    '--output_headers',
        #    '-d',
        #    'cpu',
        #    '--threads',
        #    '2'
        #   ]
        # print(f'STEP 1: RUNNING MERIZO: {" ".join(cmd)}')
        # p = Popen(cmd, stdin=PIPE,stdout=PIPE, stderr=PIPE)
        # out, err = p.communicate()
        # # Decode the byte output to string
        # print("Output:")
        # print(out.decode("utf-8"))  # Decode and print the standard output
        
        # if err:
        #     print("Error:")
        #     print(err.decode("utf-8"))  # Decode and print the standard 
    
    # Clean up the virtual filesystem
    virtual_fs.close()

In [407]:
def upload_analysis_outputs_to_hdfs(file_name):
    # upload anaylsis output files to hdfs and clean local files
    local_files_paths = [ file_name + '_segment.tsv', file_name + '_search.tsv', file_name + '.parsed']
    hdfs_file_path = '/analysis_outputs/'
    for local_file_path in local_files_paths:
        upload_file_to_hdfs(local_file_path, hdfs_file_path)
        delete_local_file(local_file_path)

In [440]:
def pipeline(file_tuple):
    file_name, file_content = file_tuple
    # STEP 1
    run_merizo_search(file_name, file_content)
    # STEP 2
    run_parser(file_name)
    # STEP 3
    # parsed_output_dict = read_parsed_file(file_name)
    upload_analysis_outputs_to_hdfs(file_name)

In [441]:
file_content_rdd.map(pipeline).collect()

File Name: AF-P0AGE6-F1-model_v4.pdb
STEP 1: RUNNING MERIZO: python3 /home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmp765gvs3u /home/almalinux/data/cath-4.3-foldclassdb AF-P0AGE6-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 2
File Name: AF-P51024-F1-model_v4.pdb
STEP 1: RUNNING MERIZO: python3 /home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmp64f4ejqs /home/almalinux/data/cath-4.3-foldclassdb AF-P51024-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 2
Output:

Error:
2024-12-19 11:42:25,262 | INFO | Starting easy-search with command: 

/home/almalinux/merizo_search/merizo_search/merizo.py easy-search /tmp/tmp765gvs3u /home/almalinux/data/cath-4.3-foldclassdb AF-P0AGE6-F1-model_v4.pdb tmp --iterate --output_headers -d cpu --threads 2

2024-12-19 11:42:32,554 | INFO | Finished easy-search in 7.291 seconds.

search_file:  AF-P0AGE6-F1-model_v4.pdb_search.tsv
STEP 2: RUNNING PARSER: python3 ./results_parser

[None, None, None]

In [442]:
!cat AF-P00811-F1-model_v4.pdb.parsed

#AF-P00811-F1-model_v4.pdb_search.tsv Results. mean plddt: 98.4244
cath_id,count
3.40.710.10,1


In [443]:
test_str = "#AF-P00811-F1-model_v4.pdb_search.tsv Results. mean plddt: 98.4244" 

In [444]:
float(test_str.split("mean plddt: ")[1].strip())

98.4244

In [445]:
def parse_parsed_file(file_path):
    """
    Reads a .parsed file and extracts:
    1. A dictionary with 'cath_id' as keys and their counts as values.
    2. The mean pLDDT value.
    
    Args:
        file_path (str): Path to the .parsed file.
        
    Returns:
        tuple: (dict of cath_id counts, mean pLDDT value)
    """
    cath_counts = {}
    mean_plddt = 0
    
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        # Extract mean pLDDT from the header
        if len(lines) < 1:
            return cath_counts, mean_plddt

        first_line = lines[0]
        if "mean plddt:" in first_line:
            mean_plddt = float(first_line.split("mean plddt:")[1].strip())
        
        # Skip the header and process the data rows
        for line in lines[2:]:  # Assuming data rows start from the 3rd line
            if not line.strip():
                continue # Ignore empty lines
            cath_id, count = line.strip().split(',')
            cath_counts[cath_id] = int(count)
    
    return cath_counts, mean_plddt

# Apply the function to the uploaded file
file_path = "AF-P00811-F1-model_v4.pdb.parsed"
cath_counts, mean_plddt = parse_parsed_file(file_path)
cath_counts, mean_plddt


({'3.40.710.10': 1}, 98.4244)

In [446]:
def combine_parsed_dict_and_mean(analysis_output1, analysis_output2):
    cath_counts_dict1, mean_plddt1 = analysis_output1
    cath_counts_dict2, mean_plddt2 = analysis_output2

    for key in cath_counts_dict2:
        if key in cath_counts_dict1:
            cath_counts_dict1[key] +=cath_counts_dict2[key]
        else:
            cath_counts_dict1[key] = cath_counts_dict2[key]

    mean_plddt = (mean_plddt1 + mean_plddt2) / 2.0

    return cath_counts_dict1, mean_plddt

analysis_output1 = ({'3.40.710.10': 1}, 98.4244)
analysis_output2 = ({'3.40.710.10': 3, '2.30.810.20': 2}, 88.4244)

combine_parsed_dict_and_mean(analysis_output1, analysis_output2)

({'3.40.710.10': 4, '2.30.810.20': 2}, 93.4244)

In [447]:
from fs.memoryfs import MemoryFS

# Initialize an in-memory virtual filesystem
virtual_fs = MemoryFS()

# Path of the binary file to read
binary_file_path = 'test.bin'

# Path to write the file in the virtual filesystem
virtual_file_path = 'virtual_test.bin'

# Read the binary file and write its content to the virtual filesystem
with open(binary_file_path, 'rb') as local_file:
    binary_content = local_file.read()
    with virtual_fs.open(virtual_file_path, 'wb') as virtual_file:
        virtual_file.write(binary_content)

# Verify the file exists in the virtual filesystem
if virtual_fs.exists(virtual_file_path):
    print(f"File '{virtual_file_path}' successfully written to the virtual filesystem.")
    with open(virtual_file_path, 'r') as file:
        print(file.readlines())
else:
    print("Failed to write the file.")




# # Clean up the virtual filesystem
# virtual_fs.close()


File 'virtual_test.bin' successfully written to the virtual filesystem.


FileNotFoundError: [Errno 2] No such file or directory: 'virtual_test.bin'

In [448]:
results = sc.parallelize([1,2,3])

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/spark-7b6032d9-bfc3-4f74-8bd8-8040d65ec3b4/pyspark-d645a6d8-e77e-4674-9d32-f886bf77eb95/tmp3tvlpmna'

In [417]:
import math

def combine_results(acc, new_data):
    """
    Combines two tuples of (cath_counts, mean_pLDDT_stats) into one.
    
    Args:
        acc (tuple): Accumulated results (combined_dict, (sum_means, count, sum_squared_diffs)).
        new_data (tuple): New data to combine (dict, mean_pLDDT).
        
    Returns:
        tuple: Combined results.
    """
    combined_dict, mean_stats = acc
    new_dict, new_mean = new_data
    
    # Combine dictionaries by summing counts
    for key, value in new_dict.items():
        if key in combined_dict:
            combined_dict[key] += value
        else:
            combined_dict[key] = value
    
    # Update mean stats
    sum_means, count, sum_squared_diffs = mean_stats
    count += 1
    sum_means += new_mean
    new_mean_diff_squared = (new_mean - (sum_means / count)) ** 2
    sum_squared_diffs += new_mean_diff_squared
    
    return combined_dict, (sum_means, count, sum_squared_diffs)

def finalize_results(combined_results):
    """
    Finalizes the combined results to calculate the overall mean and standard deviation.
    
    Args:
        combined_results (tuple): (combined_dict, (sum_means, count, sum_squared_diffs)).
        
    Returns:
        tuple: (combined_dict, overall_mean, std_dev)
    """
    combined_dict, (sum_means, count, sum_squared_diffs) = combined_results
    overall_mean = sum_means / count
    std_dev = math.sqrt(sum_squared_diffs / count)
    return combined_dict, overall_mean, std_dev

# Simulate the map step
parsed_results = sc.parallelize([
    ({"CATH1": 10, "CATH2": 5}, 80.0),
    ({"CATH1": 20, "CATH3": 15}, 75.0),
    ({"CATH2": 8, "CATH3": 12}, 85.0)
])

# Initial values: (empty dict, (sum_means, count, sum_squared_diffs))
initial_value = ({}, (0.0, 0, 0.0))

# Reduce step
combined_results = parsed_results.reduce(combine_results)

# Finalize results to compute overall mean and standard deviation
final_result = finalize_results(combined_results)
print("Final Result:", final_result)


FileNotFoundError: [Errno 2] No such file or directory: '/tmp/spark-7b6032d9-bfc3-4f74-8bd8-8040d65ec3b4/pyspark-d645a6d8-e77e-4674-9d32-f886bf77eb95/tmp_u6_n6jk'