This code was used to generate the consistency_output files, which record whether all of the inbred parents of the nine hybrids (or of the expanded set of hybrids) carry the same allele at a given SNP; such a SNP would not be able to differentiate the hybrids. Note that this does not cover the scenario in which every hybrid is heterozygous and all of them show both the cut and uncut bands. This code was mainly generated by ChatGPT, with minimal editing. The input chr_loc_file contains the SNPs that were filtered as being in non-repetitive regions and as having high similarity across multiple inbreds. LH244 and NKH8431 are not included in this code because they are not present in the hmp_file.

The following code was used to generate the initial consistency_output file:

In [None]:
import pandas as pd

# === FILE PATHS ===
# Tab-separated HMP genotype table; it is looked up by its 'rs#' column below.
hmp_file = "C:/Users/Lundquist Lab/OneDrive - Michigan State University/Documents/Jannis/AgSpec/GWAS SNPs/all_1034_snps.hmp.txt"
# Candidate SNP list (.xlsx or .csv); must provide 'Chromosome' and 'Location' columns.
chr_loc_file = "C:/Users/Lundquist Lab/OneDrive - Michigan State University/Documents/Jannis/AgSpec/GWAS SNPs/SNPs_filtered_6_2025.xlsx"

# === HYBRID GROUPS ===
# Each name must be a column of the HMP table; these columns are compared per SNP.
nine_hybrids = ["B73", "Mo17", "LH195", "PHN82", "PHK76", "PHB47"]
expanded_set = ["B73", "Mo17", "LH195", "PHN82", "PHK76", "PHB47", "PHZ51", "PHG29", "PHG47",
                "LH145", "LH82", "PHJ40", "PH207", "PHG72", "PHW65", "B84", "PHG50", "PHN11", "CR1HT", "PHG35", "Seagull Seventeen", "W604S"]

# === READ HMP FILE ===
print("Loading HMP file...")
hmp_df = pd.read_csv(hmp_file, sep="\t", low_memory=False)

# === READ Chromosome/Location FILE ===
print("Loading Chromosome/Location file...")
# Compare the extension case-insensitively so ".XLSX" / ".CSV" are accepted too.
if chr_loc_file.lower().endswith(".xlsx"):
    chr_df = pd.read_excel(chr_loc_file)
elif chr_loc_file.lower().endswith(".csv"):
    chr_df = pd.read_csv(chr_loc_file)
else:
    raise ValueError("Input file must be .xlsx or .csv")

# === Create rs# ===
# Build the ID column-wise instead of with apply(axis=1): a row-wise apply
# hands the lambda a Series with a single common dtype, so integer
# chromosome/position values can be upcast to float (giving "chr4.0_123.0")
# when the sheet also holds float columns, and then never match the HMP rs#.
chr_df['rs#'] = (
    "chr" + chr_df['Chromosome'].astype(str) + "_" + chr_df['Location'].astype(str)
)

# === Function to Check Consistency ===
def is_consistent(row, columns):
    """Return True when the selected entries of *row* share one distinct value.

    Args:
        row: pandas Series holding one SNP record, indexed by column name.
        columns: list of labels in *row* to compare.

    Returns:
        True if exactly one distinct non-missing value occurs among the
        selected entries, otherwise False.

    Note:
        Missing values (NaN) are ignored by ``nunique``, so a mix of one
        genotype plus NaN counts as consistent, and an all-NaN selection
        counts as not consistent.
    """
    genotype_calls = row[columns]
    # dropna=True is the pandas default; spelled out to make the NaN rule visible.
    distinct_calls = genotype_calls.nunique(dropna=True)
    return distinct_calls == 1

# === Main Loop ===
# For every SNP in chr_df, look up its genotype row in the HMP table and record
# whether the parent lines in each group all share one call.

# Index the HMP table by rs# once so each SNP is a direct lookup instead of a
# full-column scan per SNP. keep='first' mirrors using the first matching row
# when an rs# occurs more than once.
hmp_by_rs = hmp_df.drop_duplicates(subset='rs#', keep='first').set_index('rs#')

results = []
# Iterate the three columns directly; unlike iterrows(), this never coerces the
# values of a row to one shared dtype.
for rs_id, chr_num, pos in zip(chr_df['rs#'], chr_df['Chromosome'], chr_df['Location']):
    if rs_id in hmp_by_rs.index:
        data_row = hmp_by_rs.loc[rs_id]
        nine_consistent = is_consistent(data_row, nine_hybrids)
        expanded_consistent = is_consistent(data_row, expanded_set)
    else:
        # SNP absent from the HMP table: flag both groups explicitly.
        nine_consistent = expanded_consistent = "Not found"

    results.append({
        "Chromosome": chr_num,
        "Position": pos,
        "rs#": rs_id,
        "nine_hybrids_consistent": nine_consistent,
        "expanded_set_consistent": expanded_consistent,
    })

# === Export to Excel ===
# Fix the column order explicitly so an empty SNP list still yields a sheet
# with headers.
output_df = pd.DataFrame(
    results,
    columns=[
        "Chromosome",
        "Position",
        "rs#",
        "nine_hybrids_consistent",
        "expanded_set_consistent",
    ],
)
output_file = "consistency_output.xlsx"
output_df.to_excel(output_file, index=False)

print(f"✅ Done! Results written to: {output_file}")


This code was used as a positive control, as the chr4_249258130 SNP has the same allele for all of the parents in both the set of nine hybrids and the expanded set of hybrids:

In [None]:
import pandas as pd

# === INPUTS ===
# HMP genotype table and the SNP used as the positive control.
hmp_file = "C:/Users/Lundquist Lab/OneDrive - Michigan State University/Documents/Jannis/AgSpec/GWAS SNPs/all_1034_snps.hmp.txt"
positive_control_rs = "chr4_249258130"

# Parent-line columns to compare; the expanded set starts with the same six
# lines as nine_hybrids and then adds the remaining ones.
nine_hybrids = ["B73", "Mo17", "LH195", "PHN82", "PHK76", "PHB47"]
expanded_set = [
    *nine_hybrids,
    "PHZ51", "PHG29", "PHG47", "LH145", "LH82", "PHJ40", "PH207", "PHG72",
    "PHW65", "B84", "PHG50", "PHN11", "CR1HT", "PHG35", "Seagull Seventeen",
    "W604S",
]

# === LOAD HMP FILE ===
df = pd.read_csv(hmp_file, sep="\t", low_memory=False)

# === FILTER FOR POSITIVE CONTROL SNP ===
match = df.loc[df['rs#'] == positive_control_rs]

if not match.empty:
    # Only the first matching record is evaluated.
    row = match.iloc[0]

    # A group is consistent when its columns hold exactly one distinct value.
    nine_values = row[nine_hybrids]
    expanded_values = row[expanded_set]
    nine_consistent = nine_values.nunique() == 1
    expanded_consistent = expanded_values.nunique() == 1

    # One-row summary table for the control SNP.
    result_df = pd.DataFrame({
        "rs#": [positive_control_rs],
        "nine_hybrids_consistent": [nine_consistent],
        "expanded_set_consistent": [expanded_consistent],
    })

    # Save to Excel
    result_df.to_excel("positive_control_consistency.xlsx", index=False)
    print("✅ Done! Output saved to: positive_control_consistency.xlsx")
else:
    print(f"❌ SNP {positive_control_rs} not found in file.")


This code was added later; its output also contains the alleles, which are important to know when designing CAPS assays.

In [None]:
import pandas as pd

# === FILE PATHS ===
# Tab-separated HMP genotype table; it is looked up by its 'rs#' column below.
hmp_file = "C:/Users/Lundquist Lab/OneDrive - Michigan State University/Documents/Jannis/AgSpec/GWAS SNPs/all_1034_snps.hmp.txt"
# Candidate SNP list (.xlsx or .csv); must provide 'Chromosome' and 'Location' columns.
chr_loc_file = "C:/Users/Lundquist Lab/OneDrive - Michigan State University/Documents/Jannis/AgSpec/GWAS SNPs/SNPs_filtered_6_2025.xlsx"

# === HYBRID GROUPS ===
# Each name must be a column of the HMP table; these columns are compared per SNP.
nine_hybrids = ["B73", "Mo17", "LH195", "PHN82", "PHK76", "PHB47"]
expanded_set = ["B73", "Mo17", "LH195", "PHN82", "PHK76", "PHB47", "PHZ51", "PHG29", "PHG47",
                "LH145", "LH82", "PHJ40", "PH207", "PHG72", "PHW65", "B84", "PHG50", "PHN11", "CR1HT", "PHG35", "Seagull Seventeen", "W604S"]

# === READ HMP FILE ===
print("Loading HMP file...")
hmp_df = pd.read_csv(hmp_file, sep="\t", low_memory=False)

# === READ Chromosome/Location FILE ===
print("Loading Chromosome/Location file...")
# Compare the extension case-insensitively so ".XLSX" / ".CSV" are accepted too.
if chr_loc_file.lower().endswith(".xlsx"):
    chr_df = pd.read_excel(chr_loc_file)
elif chr_loc_file.lower().endswith(".csv"):
    chr_df = pd.read_csv(chr_loc_file)
else:
    raise ValueError("Input file must be .xlsx or .csv")

# === Create rs# ===
# Build the ID column-wise instead of with apply(axis=1): a row-wise apply
# hands the lambda a Series with a single common dtype, so integer
# chromosome/position values can be upcast to float (giving "chr4.0_123.0")
# when the sheet also holds float columns, and then never match the HMP rs#.
chr_df['rs#'] = (
    "chr" + chr_df['Chromosome'].astype(str) + "_" + chr_df['Location'].astype(str)
)

# === Function to Check Consistency ===
def is_consistent(row, columns):
    """Return True when the selected entries of *row* share one distinct value.

    Args:
        row: pandas Series holding one SNP record, indexed by column name.
        columns: list of labels in *row* to compare.

    Returns:
        True if exactly one distinct non-missing value occurs among the
        selected entries, otherwise False.

    Note:
        Missing values (NaN) are ignored by ``nunique``, so a mix of one
        genotype plus NaN counts as consistent, and an all-NaN selection
        counts as not consistent.
    """
    genotype_calls = row[columns]
    # dropna=True is the pandas default; spelled out to make the NaN rule visible.
    distinct_calls = genotype_calls.nunique(dropna=True)
    return distinct_calls == 1

# === Main Loop ===
# For every SNP in chr_df, look up its genotype row in the HMP table and record
# whether the parent lines in each group all share one call, together with the
# SNP's 'alleles' entry.

# Index the HMP table by rs# once so each SNP is a direct lookup instead of a
# full-column scan per SNP. keep='first' mirrors using the first matching row
# when an rs# occurs more than once.
hmp_by_rs = hmp_df.drop_duplicates(subset='rs#', keep='first').set_index('rs#')

results = []
# Iterate the three columns directly; unlike iterrows(), this never coerces the
# values of a row to one shared dtype.
for rs_id, chr_num, pos in zip(chr_df['rs#'], chr_df['Chromosome'], chr_df['Location']):
    if rs_id in hmp_by_rs.index:
        data_row = hmp_by_rs.loc[rs_id]
        nine_consistent = is_consistent(data_row, nine_hybrids)
        expanded_consistent = is_consistent(data_row, expanded_set)
        # Falls back to "N/A" when the HMP table has no 'alleles' column.
        alleles = data_row.get("alleles", "N/A")
    else:
        # SNP absent from the HMP table: flag every derived field explicitly.
        nine_consistent = expanded_consistent = alleles = "Not found"

    results.append({
        "Chromosome": chr_num,
        "Position": pos,
        "rs#": rs_id,
        "nine_hybrids_consistent": nine_consistent,
        "expanded_set_consistent": expanded_consistent,
        "alleles": alleles,
    })

# === Export to Excel ===
# Fix the column order explicitly so an empty SNP list still yields a sheet
# with headers.
output_df = pd.DataFrame(
    results,
    columns=[
        "Chromosome",
        "Position",
        "rs#",
        "nine_hybrids_consistent",
        "expanded_set_consistent",
        "alleles",
    ],
)
output_file = "consistency_output_alleles.xlsx"
output_df.to_excel(output_file, index=False)

print(f"✅ Done! Results written to: {output_file}")
