In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Reading in 23andMe data

In [10]:
def load_23andme(path, gt_col):
    df = pd.read_csv(
        path,
        header=None,
        names=["id", "chrom", "pos", gt_col],
        sep="\t",            # IMPORTANT if these are tab-delimited
        comment="#",         # IMPORTANT if there are header/comment lines
        dtype={"chrom": "string"},
        na_values=["", "NA", "NaN"]
    )

    # normalize chrom labels
    df["chrom"] = (
        df["chrom"]
        .str.strip()
        .str.replace("^chr", "", regex=True)   # chr1 -> 1, chrX -> X
        .str.replace("^M$", "MT", regex=True)  # M -> MT (if present)
    )

    # ensure pos is numeric, drop bad rows
    df["pos"] = pd.to_numeric(df["pos"], errors="coerce").astype("Int64")

    # clean genotype
    df[gt_col] = df[gt_col].astype("string").str.strip()

    # autosomes only + keep only standard diploid calls
    df = df[
        df["chrom"].isin([str(i) for i in range(1, 23)]) &
        df[gt_col].notna() &
        ~df[gt_col].isin(["--", "II", "DD", "DI"])
    ].dropna(subset=["pos"])

    # if you truly want int (non-null), convert at end
    df["pos"] = df["pos"].astype(int)

    return df

josh = load_23andme("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/genome_Joshua_Yoakem_v5_Full_20250129211749.txt.corrected.txt", "josh_gt")
user5105 = load_23andme("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5105_file3621_yearofbirth_unknown_sex_XX.23andme.txt.corrected.txt", "user5105_gt")
user5107 = load_23andme("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5107_file3623_yearofbirth_1986_sex_XX.23andme.txt.corrected.txt", "user5107_gt")
user5902 = load_23andme("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5902_file4372_yearofbirth_1983_sex_XX.23andme.txt.corrected.txt", "user5902_gt")
user5917 = load_23andme("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5917_file4386_yearofbirth_1979_sex_XX.23andme.txt.corrected.txt", "user5917_gt")

In [None]:
# Individual genotyping
josh = pd.read_csv("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/genome_Joshua_Yoakem_v5_Full_20250129211749.txt.corrected.txt", header = None, names=["id", "chrom", "pos", "josh_gt"]).query('chrom not in ["X", "Y", "MT"]').astype({'chrom': 'str', 'pos': 'int'}).query('josh_gt not in ["--", "II", "DD", "DI"]')
user5105 = pd.read_csv("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5105_file3621_yearofbirth_unknown_sex_XX.23andme.txt.corrected.txt", header = None, names=["id", "chrom", "pos", "user5105_gt"]).query('chrom not in ["X", "Y", "MT"]').astype({'chrom': 'str', 'pos': 'int'}).query('user5105_gt not in ["--", "II", "DD", "DI"]')
user5107 = pd.read_csv("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5107_file3623_yearofbirth_1986_sex_XX.23andme.txt.corrected.txt", header = None, names=["id", "chrom", "pos", "user5107_gt"]).query('chrom not in ["X", "Y", "MT"]').astype({'chrom': 'str', 'pos': 'int'}).query('user5107_gt not in ["--", "II", "DD", "DI"]')
user5902 = pd.read_csv("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5902_file4372_yearofbirth_1983_sex_XX.23andme.txt.corrected.txt", header = None, names=["id", "chrom", "pos", "user5902_gt"]).query('chrom not in ["X", "Y", "MT"]').astype({'chrom': 'str', 'pos': 'int'}).query('user5902_gt not in ["--", "II", "DD", "DI"]')
user5917 = pd.read_csv("/Users/lanceoconnor/Downloads/5-PGP-genotype-files/user5917_file4386_yearofbirth_1979_sex_XX.23andme.txt.corrected.txt", header = None, names=["id", "chrom", "pos", "user5917_gt"]).query('chrom not in ["X", "Y", "MT"]').astype({'chrom': 'str', 'pos': 'int'}).query('user5917_gt not in ["--", "II", "DD", "DI"]')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer: Error while type casting for column 'pos'

## Reformatting data to a matrix

In [11]:
print(josh.head())

            id chrom     pos josh_gt
0  rs548049170     1   69869      TT
1    rs9283150     1  565508      AA
2  rs116587930     1  727841      GG
3    rs3131972     1  752721      GG
4   rs12184325     1  754105      CC


# Reading in Seldin data

In [7]:
df = pd.read_csv('/Users/lanceoconnor/Downloads/Seldin_AIMS_SNPs.csv')
print(df.head)

<bound method NDFrame.head of           NCBI       SNP_Assay   Strand VIC FAM  \
0   rs10108270  C__30263561_10  Forward   A   C   
1   rs10236187  C____328256_10  Forward   A   C   
2    rs1040045  C___8767011_10  Forward   A   G   
3    rs1040404  C___2985471_10  Reverse   A   G   
4   rs10496971  C__30021395_20  Forward   G   T   
..         ...             ...      ...  ..  ..   
88    rs948028  C___8799834_10  Reverse   A   C   
89   rs9522149  C__30502208_20  Reverse   C   T   
90   rs9530435  C__27192660_10  Reverse   C   T   
91   rs9809104  C__30049893_10  Reverse   C   T   
92   rs9845457  C___1478361_10  Reverse   A   G   

                                              Context  Chr  Assembly  \
0   ACAATTCTATTAAAGCCAATCCTGA[A/C]GCTAAGTCCTCACCTG...    8        36   
1   GAACGGCAGACAAAGCCTCACATTA[A/C]GCATCTCTTTAGTAAA...    7        36   
2   TCTTGGGGGTCCTGCTCCATGCTGC[A/G]TTACCCCAATCCCCAT...    6        36   
3   GCTGAGCATTTTGTAGTGAAATTAG[A/G]TGTGGTAGAAAATAGT...    1        36 