In [16]:
# import libraries 
import json
import numpy as np
import pandas as pd

In [87]:
# Run configuration: pick exactly one value for each switch.
# Both values are embedded in the input/output paths used by the cells below
# (Output/Generated_Sequences/<normalised>/.../<mutation_type>/...).

# mutation_type = 'Amino'
mutation_type = 'Nucleotide'


normalised = 'Reference'
# normalised = 'Normalised'

# Fail fast on a typo: the later cells branch on these exact strings and would
# otherwise fall through every branch without loading or writing anything.
if mutation_type not in ('Amino', 'Nucleotide'):
    raise ValueError("mutation_type must be 'Amino' or 'Nucleotide', got {!r}".format(mutation_type))
if normalised not in ('Reference', 'Normalised'):
    raise ValueError("normalised must be 'Reference' or 'Normalised', got {!r}".format(normalised))


In [88]:
# Function to ensure each mutation pair appears only once, in the form a|b and not also as b|a.
# It checks whether a reverse key b|a exists. If it does, the list for b|a is concatenated onto the list for a|b and the b|a key is not stored.
# The merged data is written to a new JSON file.

def merge_reverse_keys(mutations_file, output_file=None):
    """Merge mirrored mutation-pair keys (``a|b`` / ``b|a``) and save the result as JSON.

    The input JSON maps ``'|'``-separated keys to lists. Keys are visited in
    file order: the first orientation encountered is kept, and the list stored
    under its reversed key (if present) is appended to it. The reversed key
    itself is not written out.

    Parameters
    ----------
    mutations_file : str
        Path of the JSON file to read.
    output_file : str, optional
        Path of the JSON file to write. When omitted, the path is built from
        the notebook-level ``normalised`` and ``mutation_type`` settings, which
        is the location this function has always written to.

    Returns
    -------
    None
        The merged mapping is only written to disk.
    """
    # Read mutations from the input JSON file
    with open(mutations_file) as f:
        data = json.load(f)

    merged_data = {}
    processed_keys = set()  # reverse keys already folded into their partner

    for key, values in data.items():
        if key in processed_keys:
            continue

        reverse_key = '|'.join(reversed(key.split('|')))
        # A key that reads the same in both directions (e.g. 'x|x') is its own
        # reverse; merging it with itself would duplicate every entry.
        if reverse_key != key and reverse_key in data:
            merged_data[key] = values + data[reverse_key]  # concatenate the lists
            processed_keys.add(reverse_key)
        else:
            merged_data[key] = values

    if output_file is None:
        output_file = "Output/Generated_Sequences/"+normalised+"/JSON_Tree/"+mutation_type+"/cooccurrence_mutations_merged_"+mutation_type+".json"

    # Write merged data to a new JSON file
    with open(output_file, "w") as outfile:
        json.dump(merged_data, outfile)

# Locate the raw co-occurrence JSON for the configured run and merge its
# mirrored keys.
mutations_file = (
    "Output/Generated_Sequences/{norm}/JSON_Tree/{kind}/cooccurrence_mutations_{kind}.json"
    .format(norm=normalised, kind=mutation_type)
)
merge_reverse_keys(mutations_file)


In [89]:
# Function to read in the mutations JSON file and return a dataframe containing the mutation pairs, their frequency and their mean mutation distance.
# A mutation distance of 1 means the mutations occur at the same time.

def mutation_parser(mutationfile):
    """Summarise a mutations JSON file as one dataframe row per key.

    Every value in each key's list is shifted up by one before it is
    summarised.

    Parameters
    ----------
    mutationfile : str
        Path of a JSON file mapping mutation-pair keys to lists of numbers.

    Returns
    -------
    pandas.DataFrame
        Columns ``mutation_pairs`` (the key), ``frequency`` (number of entries
        in the key's list) and ``mean`` (mean of the shifted entries).
    """
    with open(mutationfile) as handle:
        distances_by_pair = json.load(handle)

    pairs = list(distances_by_pair)
    # Shift every stored value by one, keeping the file's key order.
    shifted = [[distance + 1 for distance in distances_by_pair[pair]] for pair in pairs]

    return pd.DataFrame({
        'mutation_pairs': pairs,
        'frequency': [len(values) for values in shifted],
        'mean': [np.mean(values) for values in shifted],
    })

# Parse the merged co-occurrence JSON written by the merge step above.
mutation_dataframe = mutation_parser(
    "Output/Generated_Sequences/{norm}/JSON_Tree/{kind}/cooccurrence_mutations_merged_{kind}.json"
    .format(norm=normalised, kind=mutation_type)
)

In [90]:
# Co-occurrence tool output file for each (mutation_type, normalised) combination.
wesme_result_files = {
    ('Nucleotide', 'Normalised'): 'wsampling/results/SARS_smut_co_pvs_0.05.txt',
    ('Nucleotide', 'Reference'): 'wsampling/results/SARS_REF_smut_co_pvs_0.05.txt',
    ('Amino', 'Normalised'): 'wsampling/results/SARS_AMINO_smut_co_pvs_0.05.txt',
    ('Amino', 'Reference'): 'wsampling/results/SARS_REF_AMINO_smut_co_pvs_0.05.txt',
}

# An unsupported combination used to fall through every branch, leaving `wesme`
# undefined (or silently reusing the one from a previous run of this cell).
if (mutation_type, normalised) not in wesme_result_files:
    raise ValueError(
        "No co-occurrence results configured for mutation_type={!r}, normalised={!r}"
        .format(mutation_type, normalised)
    )

# read in co-occurrence tool output
wesme = pd.read_csv(wesme_result_files[(mutation_type, normalised)], sep="\t")


def _mutation_label(annotation):
    """Reduce one '|'-separated gene1/gene2 entry to the label used in mutation_pairs.

    Amino: field 6, a colon, then fields 0-2 joined in reverse order.
    Nucleotide: field 0 only.
    """
    fields = annotation.split('|')
    if mutation_type == 'Amino':
        return fields[6] + ":" + "".join(reversed(fields[0:3]))
    return fields[0]


# create a column of mutation pairs
# format will match the mutations dataframe mutation_pairs column
gene1 = [_mutation_label(x) for x in wesme.gene1.to_list()]
gene2 = [_mutation_label(x) for x in wesme.gene2.to_list()]
mutations_list = ['{}|{}'.format(a, b) for a, b in zip(gene1, gene2)]
wesme['mutation_pairs'] = mutations_list

# merge the two dataframes together
# NOTE(review): the join is on the exact 'a|b' string, so a pair that appears as
# b|a on the other side will not match - confirm both inputs use the same
# orientation for every pair.
merged_df = pd.merge(mutation_dataframe, wesme, on='mutation_pairs', how='inner')
merged_df

Unnamed: 0,mutation_pairs,frequency,mean,gene1,gene2,jaccard index,pv (fisher),pv (ws)
0,T670G|C2790T,262,1.435115,T670G|R|135|S|7|NSP1|NSP1|orf1ab,C2790T|I|842|T|43|NSP3-Ubl1|NSP3|orf1ab,0.988060,0.000000,0.00000
1,C21618T|T21633C,384,1.302083,C21618T|I|19|T|1|NTD|S|S,T21633C|S|24|L|2|NTD|S|S,0.997006,0.000000,0.00000
2,T21633C|C21635-,385,1.301299,T21633C|S|24|L|2|NTD|S|S,C21635-|-|25|P|2|NTD|S|S,1.000000,0.000000,0.00000
3,C21635-|C21636-,385,1.301299,C21635-|-|25|P|2|NTD|S|S,C21636-|-|25|P|2|NTD|S|S,1.000000,0.000000,0.00000
4,C21636-|C21637-,385,1.301299,C21636-|-|25|P|2|NTD|S|S,C21637-|-|25|P|2|NTD|S|S,1.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...
11113,G25352T|T22917A,1,2.000000,G25352T|L|1264|V|64|CD|S|S,T22917A|Q|452|L|23|RBD|S|S,0.086957,0.002507,0.01140
11114,G521T|C2306T,1,1.000000,G521T|F|86|V|5|NSP1|NSP1|orf1ab,C2306T|F|681|L|35|NSP2|NSP2|orf1ab,0.250000,0.003014,0.00714
11115,C5730T|C6401T,1,1.000000,C5730T|I|1822|T|92|NSP3-PL2pro|NSP3|orf1ab,C6401T|S|2046|P|103|NSP3-Beta-SM|NSP3|orf1ab,0.111111,0.010521,0.02316
11116,A10323G|C10775T,1,1.000000,A10323G|R|3353|K|168|NSP5-Mpro|NSP5|orf1ab,C10775T|S|3504|P|176|NSP5-Mpro|NSP5|orf1ab,0.031250,0.016080,0.03460


In [91]:
# Output file name for each (mutation_type, normalised) combination.
annotation_file_names = {
    ('Nucleotide', 'Normalised'): 'SARS_CO_Heights_Annotations.tsv',
    ('Nucleotide', 'Reference'): 'SARS_REF_CO_Heights_Annotations.tsv',
    ('Amino', 'Normalised'): 'SARS_AMINO_CO_Heights_Annotations.tsv',
    ('Amino', 'Reference'): 'SARS_REF_AMINO_CO_Heights_Annotations.tsv',
}

# An unsupported combination used to skip the export without any message.
if (mutation_type, normalised) not in annotation_file_names:
    raise ValueError(
        "No export file configured for mutation_type={!r}, normalised={!r}"
        .format(mutation_type, normalised)
    )

# Write the merged table as a tab-separated file, without the index column.
merged_df.to_csv(
    'Output/Generated_Sequences/'+normalised+'/Parser/'+mutation_type+'/'+annotation_file_names[(mutation_type, normalised)],
    sep='\t',
    index=False,
)
