# Пайплайн с неполным пересчётом и перезаписью матрицы

In [1]:
import hail as hl

In [2]:
hl.init()
hl.default_reference('GRCh38')

Running on Apache Spark version 3.5.5
SparkUI available at http://192.168.0.115:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.134-952ae203dbbe
LOGGING: writing to /home/julia/Downloads/gnomADru/hail-20250427-2206-0.2.134-952ae203dbbe.log
2025-04-27 22:06:39.999 Hail: INFO: Reading table without type imputation
  Loading field 'ID' as type str (user-supplied)
  Loading field 'sex' as type str (user-supplied)
  Loading field 'profile' as type str (not specified)
2025-04-27 22:06:48.070 Hail: INFO: scanning VCF for sortedness...
2025-04-27 22:22:29.619 Hail: INFO: Reading table without type imputation
  Loading field 'ID' as type str (user-supplied)
  Loading field 'sex' as type str (user-supplied)
  Loading field 'profile' as type str (not specified)
2025-04-27 22:22:40.417 Hail: INFO: scanning VCF for sortedness...
2025-04-27 22:30:35.046 Hail: INFO: scanning VCF for sortedness...
2025-04-27 22:30:43.989 Hail: INFO: VCF is

In [3]:
import glob
import os

In [4]:
# конфигурация
# VCF_DIR = '/home/julia/Downloads/gnomADru/vcf/'  # папка с VCF
VCF_DIR = '/home/julia/Downloads/gnomADru/vcf_test/'  # папка с VCF без одного варианта

VCF_DIR_NEW = '/home/julia/Downloads/gnomADru/vcf_new/' # папка с VCF для добавления
# SEX_TABLE_PATH = '/home/julia/Downloads/gnomADru/sids.csv' # файл с полом
SEX_TABLE_PATH = '/home/julia/Downloads/gnomADru/vcf_test/sids_test.csv' # файл с полом - тестовый, с одним "лишним" образцом
BASE_DATA_PATH = '/home/julia/Downloads/gnomADru/cache/combined.mt' # первый пул данных

In [5]:
# старые файлы

vcf_files = glob.glob(VCF_DIR + '*.vcf.gz')
print(vcf_files)

['/home/julia/Downloads/gnomADru/vcf_test/000007000020.vcf.gz', '/home/julia/Downloads/gnomADru/vcf_test/000007000040.vcf.gz', '/home/julia/Downloads/gnomADru/vcf_test/000007000070.vcf.gz', '/home/julia/Downloads/gnomADru/vcf_test/000007000030.vcf.gz', '/home/julia/Downloads/gnomADru/vcf_test/000007000060.vcf.gz']


In [18]:
# новые файлы
new_vcf_files = glob.glob(VCF_DIR_NEW + '*.vcf.gz')
print(new_vcf_files)

['/home/julia/Downloads/gnomADru/vcf_new/000007000050.vcf.gz']


In [7]:
# добавление пола

def set_sex(mt, sex_table):
    # преобразуем пол в is_female (True для 'ж'/'f')
    sex_table = sex_table.annotate(
        is_female = (
            (sex_table.sex.lower() == 'ж') | 
            (sex_table.sex.lower() == 'f')
        )
    )
    
    # добавляем is_female к образцам (простое соединение)
    mt = mt.annotate_cols(
        is_female = sex_table[mt.s].is_female  # mt.s - ID образца
    )

    return mt

In [8]:
# нормализация гемизигот у мужчин

def gemizygote_normalize(mt):
    return mt.annotate_entries(
        GT = hl.if_else(
            (~mt.is_female) & ((mt.locus.contig == "chrX") | (mt.locus.contig == "chrY")),
            hl.if_else(
                mt.GT.is_hom_ref(),  # гомозигота по REF 0/0 → 0
                hl.call(0),
                hl.if_else(
                    mt.GT.is_hom_var(),  # гомозигота по ALT 1/1 → 1
                    hl.call(1),
                    hl.if_else(
                        mt.VAF[0] > 0.3,  # для гетерозигот - если VAF > 30% → 1 считаем гомозиготой по ALT
                        hl.call(1),
                        hl.call(0)     # иначе → 0 считаем гомозиготой по REF
                    )
                )
            ),
            mt.GT  # Для женщин и аутосом оставляем без изменений
        )
    )

In [9]:
# фильтрация по глубине

def filter_variants_by_DP(combined_mt_all, dp):

    # отсекаем варианты, если нет ни одного образца с DP больше порога
    filtered_mt = combined_mt_all.filter_rows(
        hl.agg.count_where(
            (hl.is_defined(combined_mt_all.DP)) & 
            (combined_mt_all.DP >= dp)
        ) >= 1
    )

    # корректируем генотипы - варианты с DP меньше порога исключаем из расчёта частот, помечая как NA
    return filtered_mt.annotate_entries(
        GT = hl.if_else(
            (hl.is_defined(filtered_mt.DP)) & 
            (filtered_mt.DP >= dp),
            filtered_mt.GT,
            hl.missing(hl.tcall)
        )
    )


In [10]:
#препроцессинг до фильтрации включительно

def preprocessing(vcf_files, sex_table):
    # комбайн
    mts_all = []
    for vcf in vcf_files: 
        mt = hl.import_vcf(vcf, force_bgz=True, array_elements_required=False)
        mt = set_sex(mt, sex_table)
        mt = gemizygote_normalize(mt)    
        mts_all.append(mt)

    # Объединение MatrixTable по колонкам (образцам)
    combined_mt_all = mts_all[0]
    if len(mts_all) > 1:
        for mt in mts_all[1:]:
            combined_mt_all = combined_mt_all.union_cols(mt, row_join_type='outer')

    #фильтрация по глубине
    return filter_variants_by_DP(combined_mt_all, 3)

In [11]:
# расчёт частот - 4 версия.попытаться ускорить.пока ориентировочно лучший вариант

def mt_AF_calculated(mt):
    freq_mt_all = mt.annotate_rows(
    call_stats=hl.agg.call_stats(mt.GT, mt.alleles)
    )

    # извлечение частот аллелей
    return freq_mt_all.annotate_rows(
        allele_frequencies=freq_mt_all.call_stats.AF  # AF — это массив частот аллелей, включая мультиаллели
    )

In [12]:
# функция сбора первых данных

def save_base_data(vcf_files, sex_table):    
    #препроцессированные данные - установка пола, нормализация гемизигот, фильтрация по глубине
    mt_combined = preprocessing(vcf_files, sex_table)
    
    #расчёт частот
    mt_af = mt_AF_calculated(mt_combined)

    #сохранение
    mt_af.write(BASE_DATA_PATH)
    return mt_af


## Пайплайн с добавлением новых данных - отсюда и до конца

In [19]:
# определение пола
sex_table = hl.import_table(SEX_TABLE_PATH,
        delimiter=',',
        types={'ID': hl.tstr, 'sex': hl.tstr},
        key='ID'
    )

In [None]:
# не нужен, если данные уже сохранены
# mt = save_base_data(vcf_files, sex_table)

#3m56.9s



In [None]:
mt.show(n_cols=6, n_rows=10)



Unnamed: 0_level_0,Unnamed: 1_level_0,'000007000020','000007000020','000007000020','000007000020','000007000020','000007000020','000007000020','000007000020','000007000040','000007000040','000007000040','000007000040','000007000040','000007000040','000007000040','000007000040','000007000070','000007000070','000007000070','000007000070','000007000070','000007000070','000007000070','000007000070','000007000030','000007000030','000007000030','000007000030','000007000030','000007000030','000007000030','000007000030','000007000060','000007000060','000007000060','000007000060','000007000060','000007000060','000007000060','000007000060'
locus,alleles,GT,GQ,DP,MIN_DP,AD,VAF,PL,MED_DP,GT,GQ,DP,MIN_DP,AD,VAF,PL,MED_DP,GT,GQ,DP,MIN_DP,AD,VAF,PL,MED_DP,GT,GQ,DP,MIN_DP,AD,VAF,PL,MED_DP,GT,GQ,DP,MIN_DP,AD,VAF,PL,MED_DP
locus<GRCh38>,array<str>,call,int32,int32,int32,array<int32>,array<float64>,array<int32>,int32,call,int32,int32,int32,array<int32>,array<float64>,array<int32>,int32,call,int32,int32,int32,array<int32>,array<float64>,array<int32>,int32,call,int32,int32,int32,array<int32>,array<float64>,array<int32>,int32,call,int32,int32,int32,array<int32>,array<float64>,array<int32>,int32
chr1:10177,"[""A"",""AC""]",,,,,,,,,,,,,,,,,1/1,4.0,3.0,,"[0,2]",[6.67e-01],"[1,13,0]",,,,,,,,,,,,,,,,,
chr1:10230,"[""AC"",""A""]",,,,,,,,,,,,,,,,,,,,,,,,,,13.0,5.0,,"[3,2]",[4.00e-01],"[0,21,13]",,,,,,,,,
chr1:10241,"[""T"",""C""]",,,,,,,,,,,,,,,,,,,,,,,,,,18.0,5.0,,"[3,2]",[4.00e-01],"[0,21,19]",,,,,,,,,
chr1:10291,"[""C"",""T""]",,,,,,,,,,,,,,,,,,,,,,,,,,18.0,6.0,,"[4,2]",[3.33e-01],"[0,20,20]",,,,,,,,,
chr1:10315,"[""C"",""T""]",,,,,,,,,,,,,,,,,,,,,,,,,,14.0,5.0,,"[2,3]",[6.00e-01],"[0,18,15]",,,,,,,,,
chr1:10333,"[""CT"",""C""]",,,,,,,,,,,,,,,,,,,,,,,,,,13.0,5.0,,"[2,2]",[4.00e-01],"[0,16,16]",,,,,,,,,
chr1:10407,"[""T"",""C""]",,5.0,3.0,,"[1,2]",[6.67e-01],"[0,8,5]",,,,,,,,,,,,,,,,,,,3.0,4.0,,"[2,2]",[5.00e-01],"[0,8,1]",,,3.0,6.0,,"[2,4]",[6.67e-01],"[0,6,0]",
chr1:10417,"[""C"",""G""]",,6.0,4.0,,"[2,2]",[5.00e-01],"[0,5,12]",,,,,,,,,,,,,,,,,,,6.0,3.0,,"[1,2]",[6.67e-01],"[0,14,5]",,,4.0,6.0,,"[2,4]",[6.67e-01],"[0,14,1]",
chr1:10428,"[""CCCTAA"",""C""]",0/1,3.0,4.0,,"[1,3]",[7.50e-01],"[0,0,7]",,,,,,,,,,,,,,,,,,,4.0,5.0,,"[3,2]",[4.00e-01],"[0,14,2]",,,4.0,7.0,,"[3,4]",[5.71e-01],"[0,15,1]",
chr1:10433,"[""A"",""AC""]",,,,,,,,,,,,,,,,,,,,,,,,,1/1,9.0,3.0,,"[0,3]",[1.00e+00],"[11,12,0]",,,,,,,,,


In [20]:
%%time

# если начальные данные уже сохранены - достаём их, если нет - сохраняем
if os.path.exists(BASE_DATA_PATH):
    old_data = hl.read_matrix_table(BASE_DATA_PATH)
else:
    old_data = save_base_data(vcf_files, sex_table)


# запись:
#CPU times: user 262 ms, sys: 23.5 ms, total: 286 ms
#Wall time: 3min 53s

# достать из кеша:
#CPU times: user 7.7 ms, sys: 1.96 ms, total: 9.66 ms
#Wall time: 135 ms

CPU times: user 14 ms, sys: 1.16 ms, total: 15.2 ms
Wall time: 123 ms


In [24]:
%%time

# препроцессинг новых данных
new_data = preprocessing(new_vcf_files, sex_table)

CPU times: user 104 ms, sys: 1.34 ms, total: 105 ms
Wall time: 144 ms


In [25]:
%%time

# поиск уникальных вариантов
# new_variants = new_data.rows().key_by("locus", "alleles")
new_variants = new_data.rows()

# фильтруем старые данные: оставляем только варианты, которые есть в новых образцах
old_mt_to_update = old_data.filter_rows(
    hl.is_defined(new_variants[old_data.row_key])
)

CPU times: user 52 ms, sys: 0 ns, total: 52 ms
Wall time: 50.6 ms


In [26]:
print(old_mt_to_update.count(), old_data.count(), new_data.count())

[Stage 12:>                                                         (0 + 1) / 1]

(5341786, 5) (10918243, 5) (5341787, 1)


In [27]:
%%time

# объединение старых и новых данных с пересчётом частот PART 1

# Объединяем старые и новые данные по этим вариантам
updated_mt = old_mt_to_update.union_cols(new_data, row_join_type="outer")

# Пересчитываем AF для обновлённых вариантов
updated_mt = mt_AF_calculated(updated_mt)

# Получаем список всех колонок из обновлённой матрицы
# cols_updated = updated_mt.col_key.keys()

CPU times: user 21.1 ms, sys: 2.47 ms, total: 23.5 ms
Wall time: 22.8 ms


In [28]:
# old_data и final_mt - найти один вариант и сравнить AF

In [29]:
%%time

# объединяем пересчитанные данные со старыми

old_mt = old_data.filter_rows(
    ~hl.is_defined(new_variants[old_data.row_key])
)
new_mt = updated_mt

# Объединяем таблицы
combined_mt = hl.experimental.full_outer_join_mt(old_mt, new_mt)

# 1. Сначала создаем новое неключевое поле с объединенными sample IDs
combined_mt = combined_mt.annotate_cols(
    combined_s = hl.coalesce(combined_mt.right_col.s, combined_mt.left_col.s),
    is_female = hl.coalesce(combined_mt.right_col.is_female, combined_mt.left_col.is_female)
)

# 2. Переключаем ключ столбца на новое поле
combined_mt = combined_mt.key_cols_by('combined_s')

# # 3. Теперь можем переименовать поле обратно в 's' если нужно
combined_mt = combined_mt.drop('s', 'left_col', 'right_col')
combined_mt = combined_mt.rename({'combined_s': 's'})


# Собираем поля для локуса и аллелей
combined_mt = combined_mt.annotate_rows(
    combined_locus = hl.coalesce(combined_mt.right_row.locus, combined_mt.left_row.locus),
    combined_alleles = hl.coalesce(combined_mt.right_row.alleles, combined_mt.left_row.alleles),
    rsid = hl.coalesce(combined_mt.right_row.rsid, combined_mt.left_row.rsid),
    qual = hl.coalesce(combined_mt.right_row.qual, combined_mt.left_row.qual),
    call_stats = hl.coalesce(combined_mt.right_row.call_stats, combined_mt.left_row.call_stats),
    allele_frequencies = hl.coalesce(combined_mt.right_row.allele_frequencies, combined_mt.left_row.allele_frequencies),
)

# переключаем ключи строк на новые поля
combined_mt = combined_mt.key_rows_by('combined_locus', 'combined_alleles')

combined_mt = combined_mt.drop('locus', 'alleles', 'right_row', 'left_row')
combined_mt = combined_mt.rename({'combined_locus': 'locus', 'combined_alleles': 'alleles'})

# # Объединяем записи (entry fields)
combined_mt = combined_mt.annotate_entries(
    # Используем записи из правой таблицы, если они есть, иначе из левой
    GT = hl.coalesce(combined_mt.right_entry.GT, combined_mt.left_entry.GT),
    DP = hl.coalesce(combined_mt.right_entry.DP, combined_mt.left_entry.DP),
    AD = hl.coalesce(combined_mt.right_entry.AD, combined_mt.left_entry.AD),
    VAF = hl.coalesce(combined_mt.right_entry.VAF, combined_mt.left_entry.VAF),
)

# # Убираем временные поля
combined_mt = combined_mt.drop('left_entry', 'right_entry')

combined_mt.describe()
# combined_mt.count()

# combined_mt.write(BASE_DATA_PATH, overwrite = True)
#Wall time: 3min 53s

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'is_female': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'call_stats': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        homozygote_count: array<int32>
    }
    'allele_frequencies': array<float64>
----------------------------------------
Entry fields:
    'GT': call
    'DP': int32
    'AD': array<int32>
    'VAF': array<float64>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
CPU times: user 244 ms, sys: 0 ns, total: 244 ms
Wall time: 243 ms


In [25]:
combined_mt.show(n_cols = 10)



Unnamed: 0_level_0,Unnamed: 1_level_0,'000007000020','000007000020','000007000020','000007000020','000007000030','000007000030','000007000030','000007000030','000007000040','000007000040','000007000040','000007000040','000007000050','000007000050','000007000050','000007000050','000007000060','000007000060','000007000060','000007000060','000007000070','000007000070','000007000070','000007000070'
locus,alleles,GT,DP,AD,VAF,GT,DP,AD,VAF,GT,DP,AD,VAF,GT,DP,AD,VAF,GT,DP,AD,VAF,GT,DP,AD,VAF
locus<GRCh38>,array<str>,call,int32,array<int32>,array<float64>,call,int32,array<int32>,array<float64>,call,int32,array<int32>,array<float64>,call,int32,array<int32>,array<float64>,call,int32,array<int32>,array<float64>,call,int32,array<int32>,array<float64>
chr1:10177,"[""A"",""AC""]",,,,,,,,,,,,,,,,,,,,,1/1,3.0,"[0,2]",[6.67e-01]
chr1:10230,"[""AC"",""A""]",,,,,,5.0,"[3,2]",[4.00e-01],,,,,,,,,,,,,,,,
chr1:10241,"[""T"",""C""]",,,,,,5.0,"[3,2]",[4.00e-01],,,,,,,,,,,,,,,,
chr1:10291,"[""C"",""T""]",,,,,,6.0,"[4,2]",[3.33e-01],,,,,,,,,,,,,,,,
chr1:10315,"[""C"",""T""]",,,,,,5.0,"[2,3]",[6.00e-01],,,,,,,,,,,,,,,,
chr1:10333,"[""CT"",""C""]",,,,,,5.0,"[2,2]",[4.00e-01],,,,,,,,,,,,,,,,
chr1:10407,"[""T"",""C""]",,3.0,"[1,2]",[6.67e-01],,4.0,"[2,2]",[5.00e-01],,,,,,6.0,"[2,4]",[6.67e-01],,6.0,"[2,4]",[6.67e-01],,,,
chr1:10417,"[""C"",""G""]",,4.0,"[2,2]",[5.00e-01],,3.0,"[1,2]",[6.67e-01],,,,,,6.0,"[2,4]",[6.67e-01],,6.0,"[2,4]",[6.67e-01],,,,
chr1:10428,"[""CCCTAA"",""C""]",0/1,4.0,"[1,3]",[7.50e-01],,5.0,"[3,2]",[4.00e-01],,,,,,7.0,"[3,4]",[5.71e-01],,7.0,"[3,4]",[5.71e-01],,,,
chr1:10433,"[""A"",""AC""]",,,,,1/1,3.0,"[0,3]",[1.00e+00],,,,,,,,,,,,,,,,


In [35]:
variant_mt = combined_mt.filter_rows(
    (combined_mt.locus.contig == "22") & 
    (combined_mt.locus.position == 50818468)
)

variant_mt.count()




(0, 6)

In [29]:
new_mt_find = new_mt.filter_rows(
    (new_mt.locus.contig == "22") & 
    (new_mt.locus.position == 50818468)
)

new_mt_find.count()



(0, 6)

In [31]:
# Варианты, которые есть в new_data, но нет в old_mt
missing_variants = new_data.anti_join_rows(old_mt_to_update.rows())
missing_variants.show()

[Stage 29:>                                                         (0 + 1) / 1]

locus,alleles
locus<GRCh38>,array<str>
chr22:50818468,"[""A"",""T""]"


In [30]:
combined_mt.count()



(10918244, 6)

In [37]:
old_af = old_mt_to_update.filter_rows(
    (old_mt_to_update.locus.contig == "22") & 
    (old_mt_to_update.locus.position == 50808270)
)

old_af.rows().show(n_rows=10)



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,call_stats,call_stats,call_stats,call_stats,Unnamed: 10_level_0
locus,alleles,rsid,qual,filters,END,AC,AF,AN,homozygote_count,allele_frequencies
locus<GRCh38>,array<str>,str,float64,set<str>,int32,array<int32>,array<float64>,int32,array<int32>,array<float64>


In [39]:
rows = old_af.rows().collect()
for row in rows:
    print(row)

[Stage 65:>                                                         (0 + 1) / 1]

KeyboardInterrupt: 

In [None]:
final_af = combined_mt.filter_rows(
    (combined_mt.locus.contig == "22") & 
    (combined_mt.locus.position == 50808270)
)

final_af.rows().show()

In [40]:
hl.stop()

Exception in thread "RemoteBlock-temp-file-clean-thread" java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager$$Lambda$985/0x0000000100489040.get$Lambda(Unknown Source)
	at java.base/java.lang.invoke.DirectMethodHandle$Holder.invokeStatic(DirectMethodHandle$Holder)
	at java.base/java.lang.invoke.Invokers$Holder.linkToTargetMethod(Invokers$Holder)
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager.org$apache$spark$storage$BlockManager$RemoteBlockDownloadFileManager$$keepCleaning(BlockManager.scala:2228)
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager$$anon$2.run(BlockManager.scala:2194)
