## Inputs

In [1]:
# Directory containing the input IDAT files
indir = "sample_idats"

In [2]:
# Output path for the CSV file containing the beta values
outcsv_beta = "betas.csv"
# Output path for the CSV file containing the prediction results
outcsv_pred = "prediction.csv"

## Process IDATs

In [3]:
# R script that is run on the IDAT directory to produce the betas CSV
rscript = "../DNAm_deconv/process_array.R"
# RData file handed to the R script as its third argument
ref = "../DNAm_deconv/ref_sample.RData"

In [4]:
import os
import subprocess

## Uses meth_atlas processing script (https://github.com/nloyfer/meth_atlas/tree/master/pre_process)
# The arguments are passed as a list (no shell involved), so paths containing
# spaces or shell metacharacters reach Rscript unchanged.
# check=True raises subprocess.CalledProcessError when Rscript exits with a
# non-zero status, so the following cells never read a stale or missing
# betas file after a failed run.
completed = subprocess.run(
    ["Rscript", rscript, indir, outcsv_beta, ref],
    check=True,
)
# Display the exit status as the cell output (0 on success)
completed.returncode

Loading required package: BiocGenerics

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min

Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: S4Vectors

Attaching package: ‘S4Vectors’

The following object is masked from ‘package:utils’:

    findMatches

The following objects are masked from ‘package:base’:

    expand.grid, I, unname

Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading require

[1] "1. process_array script - READING ARRAY - 2024-06-15 13:46:35.155617"
[1] "2. process_array script - P_VALUES CALCULATION - 2024-06-15 13:46:39.607545"


Loading required package: IlluminaHumanMethylationEPICmanifest


[1] "3. process_array script - ILLUMINA - 2024-06-15 13:46:46.201213"


[convertArray] Casting as IlluminaHumanMethylation450k
Loading required package: IlluminaHumanMethylation450kmanifest


[1] "4. process_array script - CONTEXT_REMOVAL - 2024-06-15 13:46:58.03009"
[1] "5. process_array script - CONVERT TO RATIOSET - 2024-06-15 13:46:58.206205"
[1] "6. process_array script - CONVERT TO GENOMICRATIOSET - 2024-06-15 13:46:58.247501"


Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19


[1] "7. process_array script - REMOVE SNPs - 2024-06-15 13:46:59.154387"
[1] "8. process_array script - BETA VALUES CALCULATION - 2024-06-15 13:46:59.995857"
[1] "13. process_array script - WRITING ARRAY TO CSV - 2024-06-15 13:47:03.657825"
[1] "14. process_array script - FINISHED - 2024-06-15 13:47:05.544365"


0

## Load betas file

In [5]:
import pandas as pd # tested with pandas version 2.1.1

In [6]:
# Read the betas CSV using its first column as row labels, then transpose it
# (the preview below shows sample IDs as rows and CpG IDs as columns).
df = (
    pd.read_csv(outcsv_beta, index_col=0)
    .transpose()
)
df.head()

Unnamed: 0,cg20367788,cg01847620,cg08450501,cg06848514,cg16045340,cg08036346,cg26702958,cg26361646,cg21774865,cg12465710,...,cg06969287,cg15545692,cg08989942,cg18293633,cg24076348,cg17754680,cg09460231,cg22502319,cg02078370,cg18105767
201868500109_R01C01,0.71381,0.97018,0.02561,0.05913,0.77954,0.95897,0.09149,,0.93369,0.40153,...,0.07831,0.77114,0.7088,0.59292,0.807,0.71371,0.60444,0.03165,0.10808,0.11553


## Load CpG sites

In [8]:
# CpG sites: the identifiers are the first column (index) of each manifest CSV
low_cgs = (
    pd.read_csv("../neural_group_classification/low_manifest_all.csv", index_col=0)
    .index
    .tolist()
)
high_cgs = (
    pd.read_csv("../neural_group_classification/high_manifest_all.csv", index_col=0)
    .index
    .tolist()
)

# Combined list: "low" manifest sites first, followed by "high" manifest sites
cgs = [*low_cgs, *high_cgs]
print(len(cgs))

1289


## Check overlap of CpGs

In [9]:
# CpGs that appear both as columns of the betas table and in the manifest list
common = list(set(df.columns).intersection(set(cgs)))
print(len(common))

1289


In [10]:
# Subset: keep only the manifest CpG columns, in the order given by `cgs`
df = df.loc[:, cgs]

### If needed, impute missing values with the saved simple imputer (each missing value is replaced by that CpG's mean value from the training data)

In [11]:
## flag the CpGs (columns) that contain at least one missing value
has_missing = df.isna().any(axis=0)
missing_cgs = df.columns[has_missing]
print(len(missing_cgs))

0


In [12]:
import joblib

# Saved imputer object; its statistics_ and feature_names_in_ attributes are
# read in the next cell.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
imputer = joblib.load(filename="../neural_group_classification/imputer.pkl")

In [13]:
# One row per imputer feature name, holding the statistic the imputer stores for it
df_fill_values = pd.DataFrame(
    {"fill_value": imputer.statistics_},
    index=imputer.feature_names_in_,
)
df_fill_values.head()

Unnamed: 0,fill_value
cg20367788,0.640331
cg01847620,0.933521
cg08450501,0.062221
cg06848514,0.069198
cg16045340,0.689996


In [14]:
import numpy as np
# Display the installed NumPy version as the cell output
# (presumably recorded for reproducibility — confirm).
np.__version__

'1.26.3'

In [15]:
# Fill missing beta values with the per-CpG fill value taken from the imputer.
# Only the columns flagged in `missing_cgs` are touched, and only their NaN cells.
# The label-based lookup replaces `df_fill_values.loc[col][0]`, whose use of an
# integer as a position on a string-labelled Series is deprecated in pandas 2.1.
# The column-wise fillna replaces the per-cell Python loop.
if len(missing_cgs) > 0:
    # Raises KeyError if a CpG with missing values is unknown to the imputer
    fill_values = df_fill_values.loc[missing_cgs, "fill_value"]
    # fillna with a Series matches its labels against the column names
    df[missing_cgs] = df[missing_cgs].fillna(fill_values)

## Load model and make prediction

In [16]:
# Model
import joblib

# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
clf = joblib.load(filename="../neural_group_classification/logregCV_allCpG.pkl")
n_model_features = clf.coef_.shape[1]  # number of features
print(n_model_features)

1289


In [17]:
# Convert the betas table to a plain array once and reuse it for both calls
feature_matrix = np.array(df)

# Predicted class for each sample (row)
preds = clf.predict(feature_matrix)
# Highest class probability for each sample
class_probabilities = clf.predict_proba(feature_matrix)
preds_score = class_probabilities.max(axis=1)

In [20]:
# One row per sample: the predicted class and its probability
df_preds = pd.DataFrame(
    {"Prediction": preds, "Prediction score": preds_score},
    index=df.index,
)
# Store the predicted class as an integer
df_preds["Prediction"] = df_preds["Prediction"].astype(int)
df_preds

Unnamed: 0,Prediction,Prediction score
201868500109_R01C01,0,0.992182


In [21]:
# Write the prediction table (sample IDs as the first column) to the output CSV
df_preds.to_csv(path_or_buf=outcsv_pred)

-----