In [None]:
import pandas as pd
import rich_click as click
from pathlib import Path
from datetime import datetime
import numpy as np

def _read_str_table(path):
    """Load one STR export into a DataFrame, choosing the reader from the file suffix."""
    suffix = path.suffix.lower()
    if suffix == ".xlsx":
        return pd.read_excel(path)
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix in (".tsv", ".txt"):
        return pd.read_csv(path, sep="\t")
    # Fail loudly: otherwise `df` would be unbound, or worse, silently reuse
    # the frame read from the previous path in the loop.
    raise ValueError(
        f"Unsupported STR file type '{path.suffix}' for {path}; "
        "expected .xlsx, .csv, .tsv or .txt."
    )


def _join_alleles(values):
    """Collapse one row of allele cells into a comma-separated string without blanks."""
    alleles = []
    for value in values:
        if pd.isna(value) or str(value) == "nan":
            continue
        # A numeric column containing blanks is read as float, so allele 12
        # arrives as 12.0; emit "12" so it matches the same allele elsewhere.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        # A cell may itself hold several comma-separated alleles.
        alleles.extend(part.strip() for part in str(value).split(","))
    return ",".join(allele for allele in alleles if allele)


def str_ingress(paths, f_format, sample_col, marker_col, sample_map=None):
    """
    Reads in a list of paths and returns a pandas DataFrame of STR alleles,
    one row per sample and one column per marker.

    Args:
        paths: Iterable of files to read (``pathlib.Path`` or path strings).
            Supported suffixes are .xlsx, .csv, .tsv and .txt (tab-separated).
        f_format: Currently unused; kept so existing callers keep working.
        sample_col: Name of the column holding the sample identifier.
        marker_col: Name of the column holding the marker name.
        sample_map: Optional DataFrame whose first column holds sample names
            as they appear in the files and whose second column holds the
            replacement names.

    Returns:
        DataFrame indexed by "Sample". Each cell is a comma-separated string
        of alleles, or "" when the sample has no entry for that marker.

    Raises:
        ValueError: If a file has an unsupported suffix, a table has no
            column containing "Allele", or sample names are not unique.
        KeyError: If ``sample_col`` or ``marker_col`` is missing from a table.
    """

    samps_dicts = []

    for path in paths:
        path = Path(path)
        df = _read_str_table(path)

        # Trim stray whitespace from every string cell and from the headers.
        # (Column-wise Series.map replaces the deprecated DataFrame.applymap.)
        df = df.apply(
            lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x)
        )
        df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

        missing = [c for c in (sample_col, marker_col) if c not in df.columns]
        if missing:
            raise KeyError(f"Column(s) {missing} not found in {path}")

        # Collapse allele columns for each marker into a single column if in wide format.
        allele_cols = df.filter(like="Allele")
        if allele_cols.shape[1] == 0:
            raise ValueError(f"No column containing 'Allele' found in {path}")
        # itertuples keeps each column's own dtype; a row-wise apply would
        # upcast integer alleles to float and render them as "12.0".
        df["Alleles"] = [
            _join_alleles(row)
            for row in allele_cols.itertuples(index=False, name=None)
        ]

        # Group and collect dict from each sample for markers and alleles.
        for samp, samp_df in df.groupby(sample_col):
            samps_dict = dict(zip(samp_df[marker_col], samp_df["Alleles"]))
            samps_dict["Sample"] = samp
            samps_dicts.append(samps_dict)

    # Nothing was read: return an empty, correctly indexed frame.
    if not samps_dicts:
        return pd.DataFrame(index=pd.Index([], name="Sample"))

    allele_df = pd.DataFrame(samps_dicts)

    # Replace sample names with sample map if provided. The lookup is applied
    # in one pass so a map such as A->B, B->C does not rename A all the way to C.
    if sample_map is not None:
        rename = {
            (old.strip() if isinstance(old, str) else old): str(new).strip()
            for old, new in zip(sample_map.iloc[:, 0], sample_map.iloc[:, 1])
        }
        allele_df["Sample"] = allele_df["Sample"].map(
            lambda name: rename.get(name, name)
        )

    # Set index to sample name; duplicate names raise ValueError.
    allele_df = allele_df.set_index("Sample", verify_integrity=True)

    # Markers absent for a sample become empty strings rather than NaN.
    allele_df = allele_df.fillna("")

    return allele_df


def _parse_profile(profile):
    """Turn {marker: "a,b"} into {marker: {"a", "b"}}, dropping markers with no alleles."""
    parsed = {}
    for marker, value in profile.items():
        # Skip missing values (None, or NaN which is the only value != itself).
        if value is None or value != value:
            continue
        alleles = {allele.strip() for allele in str(value).split(",")}
        alleles.discard("")
        if alleles:
            parsed[marker] = alleles
    return parsed


def score_query(query, reference, use_amel=False, amel_col = "AMEL"):
    """
    Calculates the Tanabe and Masters scores for a query sample against a reference sample.

    Only markers that have at least one allele in both profiles are compared.

    Args:
        query (dict): Maps marker name to a comma-separated allele string
            for the query sample.
        reference (dict): Same structure for the reference sample.
        use_amel (bool): If False, the marker named ``amel_col`` is excluded
            from the comparison.
        amel_col (str): Name of the amelogenin marker.

    Returns:
        dict: Keys "n_markers", "n_shared_alleles", "n_query_alleles",
        "n_reference_alleles", "tanabe_score", "masters_q_score" and
        "masters_r_score". Scores are percentages; each is 0.0 when its
        denominator is zero (for example, no shared markers).
    """

    # Convert allele values to sets, removing markers with no alleles.
    query = _parse_profile(query)
    reference = _parse_profile(reference)

    # Markers present in both query and reference.
    markers = set(query) & set(reference)

    # Remove amelogenin if requested; discard() tolerates it not being shared.
    if not use_amel:
        markers.discard(amel_col)

    n_shared_markers = len(markers)

    # Allele counts over the shared markers only.
    n_q_alleles = sum(len(query[m]) for m in markers)
    n_r_alleles = sum(len(reference[m]) for m in markers)
    n_shared_alleles = sum(len(query[m] & reference[m]) for m in markers)

    def _percent(numerator, denominator):
        # Guard against ZeroDivisionError when nothing is comparable.
        return 100 * (numerator / denominator) if denominator else 0.0

    # Calculate the scores.
    tanabe_score = _percent(2 * n_shared_alleles, n_q_alleles + n_r_alleles)
    masters_q_score = _percent(n_shared_alleles, n_q_alleles)
    masters_r_score = _percent(n_shared_alleles, n_r_alleles)

    out = {"n_markers": n_shared_markers, "n_shared_alleles": n_shared_alleles,
           "n_query_alleles": n_q_alleles, "n_reference_alleles": n_r_alleles,
           "tanabe_score": tanabe_score, "masters_q_score": masters_q_score,
           "masters_r_score": masters_r_score}

    return out

In [None]:
# Input STR workbooks and the headerless two-column sample-renaming table.
strs = [Path(file_name) for file_name in ("ExampleSTR.xlsx", "ExampleSTR2.xlsx")]
smap = pd.read_csv("SampleMap_exp.csv", header=None)

# Read every workbook into one frame of allele strings indexed by sample.
df = str_ingress(
    paths=strs,
    f_format="wide",
    sample_col="Sample Name",
    marker_col="Marker",
    sample_map=smap,
)

In [None]:
# Per-sample allele profiles keyed by sample name: {sample: {marker: "a,b"}}.
samps = df.to_dict(orient = "index")

# Name of a sample of interest; kept as a plain string.
q = "Sample1"

for s in samps.keys():
    # Look the query profile up by the loop key. Previously `q = samps[q]`
    # rebound `q` from a name to a dict, so the second iteration raised
    # "TypeError: unhashable type: 'dict'".
    query = samps[s]

    for sa in samps.keys():
        if sa != s:
            # Every other sample acts as a reference for the current query.
            # NOTE(review): scoring of (query, r) is not implemented yet.
            r = samps[sa]
            
            