In [2]:
import csv
import re

def read_tsv(file_path):
    """Load a tab-separated file and return its rows.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one entry per record, each entry being the list of
        string fields the csv reader produced for that record.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(file_path, newline='') as handle:
        return list(csv.reader(handle, delimiter='\t'))

def write_tsv(file_path, data):
    """Write rows to a tab-separated file, replacing any existing file.

    Args:
        file_path: Path of the file to create or overwrite.
        data: Iterable of rows; each row is an iterable of field values.
    """
    # newline='' lets the csv writer emit its own line terminators.
    with open(file_path, 'w', newline='') as out:
        emitter = csv.writer(out, delimiter='\t')
        for record in data:
            emitter.writerow(record)

def is_desired_location_format(location):
    """Check a location string against the '<digits>:<digits>-<digits>' shape.

    Example of an accepted value: '19:55494631-55494631'.

    Args:
        location: The location string to test.

    Returns:
        The ``re.Match`` object when the string has the expected shape,
        otherwise ``None`` (so the result is usable as a truth value).
    """
    location_re = re.compile(r'^\d+:\d+-\d+$')
    return location_re.match(location)

def process_files(source_path, data_path, output_file_path):
    """Write rsID/location pairs from the data file in source-file order.

    The source file must start with a header row containing an 'rsID'
    column. The data file is read positionally: column 0 is taken as the
    rsID and column 1 as the location. For each rsID only the first row
    whose location passes ``is_desired_location_format`` is kept. The
    output is a TSV with an 'rsID'/'Location' header followed by one row
    per source rsID that has a kept location, in source order (source
    duplicates are repeated).

    Args:
        source_path: TSV file whose 'rsID' column defines the output order.
        data_path: TSV file providing (rsID, location) rows.
        output_file_path: Path of the TSV file to write.

    Raises:
        IndexError: If the source file contains no rows at all.
        ValueError: If the source header has no 'rsID' column.
    """
    source_data = read_tsv(source_path)
    data = read_tsv(data_path)

    # The first source row is the header; it tells us which column holds
    # the rsIDs.
    rsid_index = source_data[0].index('rsID')

    # Start after the header so the literal 'rsID' is never looked up, and
    # skip blank or truncated rows that have no value in the rsID column
    # (mirrors how short rows in the data file are skipped below).
    source_rsids = [
        row[rsid_index] for row in source_data[1:] if len(row) > rsid_index
    ]

    # Map each rsID to its first row with a well-formed location.
    data_dict = {}
    for row in data:
        if len(row) < 2:
            continue
        rsid, location = row[0], row[1]
        if rsid not in data_dict and is_desired_location_format(location):
            data_dict[rsid] = [rsid, location]

    # Header first, then the matched rows in source order.
    ordered_data = [['rsID', 'Location']]
    ordered_data.extend(
        data_dict[rsid] for rsid in source_rsids if rsid in data_dict
    )

    write_tsv(output_file_path, ordered_data)
    


# Files used by this run: the source list supplies the rsID order, the data
# file supplies the rsID/location rows, and the result goes to the output file.
source_path = 'snpList_SNP.tsv'
data_path = 'VEP_rsid_SNP.txt'
output_file_path = 'output_results_SNP.tsv'

# Build the ordered rsID/location table and write it out.
process_files(
    source_path=source_path,
    data_path=data_path,
    output_file_path=output_file_path,
)