In [1]:
import pandas as pd

from sprime_mapping import map_sprime_segments
from sprime_info import compute_sprime_info

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Two input files are required:
#   - vcf_file:   the original VCF file
#   - score_file: the SPrime output (score file)
vcf_file = "archie.1_biallelic.vcf.gz"
score_file = "sprime.1src.out.100000.score"


In [3]:
# To obtain only the introgressed tracts of the target individuals, we also provide a list
# of the individual names we are interested in (they have to correspond to the names in the VCF file).
# If no list is provided, the introgression status is checked for all individuals in the VCF file
# (including reference individuals etc.); this is usually not desired, but can be useful as a check.
tgt_individuals_file = "archie.1.tgt.ind.list"

In [4]:
# if we want to write the result to a file, an out_file also has to be specified
# NOTE(review): the output names use the "archie.0" prefix while the input files use "archie.1" — confirm this is intended
#out_file_wo_filtering = "archie.0.sprime.100000.inferred_wo_filtering.bed"
#out_file_every_snp = "archie.0.sprime.100000.inferred_every_snp.bed"
#out_file_frac_50 = "archie.0.sprime.100000.inferred_fraction_050.bed"
out_file_frac_100 = "archie.0.sprime.100000.inferred_fraction_100.bed"




In [5]:
# open file with tgt individuals

In [6]:
# Read one individual name per line: trim surrounding whitespace and skip blank lines.
with open(tgt_individuals_file) as f:
    tgt_individuals = [name for name in map(str.strip, f) if name]


In [7]:
# Show the parsed names of the target individuals.
tgt_individuals

['tsk_50',
 'tsk_51',
 'tsk_52',
 'tsk_53',
 'tsk_54',
 'tsk_55',
 'tsk_56',
 'tsk_57',
 'tsk_58',
 'tsk_59']

In [8]:
### Let's also check the sprime output file

In [9]:
# Load the tab-separated SPrime score file.
df = pd.read_csv(score_file, sep="\t")

# Number of SNPs (rows of the score file) per SEGMENT, most frequent segment first.
snp_counts = df["SEGMENT"].value_counts()

# Turn the counts into a two-column table; naming the index and the value column
# explicitly avoids overwriting the column labels by position afterwards.
snp_counts_df = snp_counts.rename_axis("SEGMENT").reset_index(name="SNP_COUNT")

print(snp_counts_df)
# Total number of SNPs over all segments.
print(snp_counts_df["SNP_COUNT"].sum())

    SEGMENT  SNP_COUNT
0         0         75
1         1         47
2         4         42
3         5         36
4         2         31
5         3         30
6        11         29
7        10         28
8         6         27
9         7         24
10        9         23
11        8         22
12       13         21
13       12         20
455


## We assume that only matches of full segments are acceptable, i.e. each archaic SNP of the segment in the score file has to be present in the target individual

## This is done by setting segment_fraction=1, i.e. the full segment without deviations must be matched

### With these settings, the first and the last SNP matching the archaic SNPs of the segment are used as the borders of the segment

### In this run of the function, we create an output-file (bed-file)



In [10]:
# Map the SPrime segments onto the target individuals, requiring a complete match
# (segment_fraction=1), and write the result to out_file_frac_100.
df_100 = map_sprime_segments(
    score_file,
    vcf_file,
    out_file_frac_100,
    target_individuals=tgt_individuals,
    segment_fraction=1,
    phased=True,
    only_tract_output=True,
    return_full_records=False,
)

### we can also show the full records, including the nsnps in each segment, the fraction of archaic snps in the segment for a specific individual and the snps present
### out_file is here set to None, so no new out_file is created

In [11]:
# Same mapping with a complete-match requirement (segment_fraction=1), but returning the
# full records; None is passed in place of an output file name.
df_100_full = map_sprime_segments(
    score_file,
    vcf_file,
    None,
    target_individuals=tgt_individuals,
    segment_fraction=1,
    phased=True,
    only_tract_output=True,
    return_full_records=True,
)

In [12]:
# the number of fragments in the dataframe is now considerably smaller than for other approaches that accept partial matches
print(len(df_100_full))

10


# We see that - because we require that full segments, i.e. all SNPs of a segment, have to be matched - not all segments are matched

In [13]:
# Show the full records of the completely matched segments.
df_100_full

Unnamed: 0,chrom,start,end,individual,haplotype,nsnps,segment_fraction,segment,all_snps_present,archaic_snps_present
0,1,238504,314341,tsk_50,1,28,1.0,10,"[238504, 251513, 255766, 257368, 257723, 25802...","[238504, 251513, 255766, 257368, 257723, 25802..."
1,1,238504,314341,tsk_57,1,28,1.0,10,"[238504, 251513, 255766, 257368, 257723, 25802...","[238504, 251513, 255766, 257368, 257723, 25802..."
2,1,8413208,8484292,tsk_56,2,29,1.0,11,"[8413208, 8418944, 8423046, 8430926, 8432462, ...","[8413208, 8418944, 8423046, 8430926, 8432462, ..."
3,1,9931398,10038950,tsk_53,1,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ..."
4,1,9931398,10038950,tsk_54,2,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ..."
5,1,9931398,10038950,tsk_55,2,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ..."
6,1,11390420,11569377,tsk_53,2,42,1.0,4,"[11390420, 11394987, 11398157, 11407918, 11427...","[11390420, 11394987, 11398157, 11407918, 11427..."
7,1,17813068,17870703,tsk_52,2,36,1.0,5,"[17813068, 17815969, 17816939, 17817828, 17819...","[17813068, 17815969, 17816939, 17817828, 17819..."
8,1,20647984,20693924,tsk_50,2,21,1.0,13,"[20647984, 20648821, 20649517, 20649859, 20650...","[20647984, 20648821, 20649517, 20649859, 20650..."
9,1,20647984,20693924,tsk_51,1,21,1.0,13,"[20647984, 20648821, 20649517, 20649859, 20650...","[20647984, 20648821, 20649517, 20649859, 20650..."


# Computation of averages

### We now additionally compute the length of the regions as well as the segments per individual
## afterwards, we can average over these values and get the desired results

In [14]:
# Derive the summary tables from the full records. The result is accessed by key in the
# following cells: "total_length_ind_hap", "total_length_ind_hap_seg", "segments_per_ind", "df".
res_frac100 = compute_sprime_info(df_100_full)



In [15]:
# total length of the introgressed regions, one row per individual/haplotype combination
res_frac100["total_length_ind_hap"]


Unnamed: 0,individual_haplotype,total_length,individual,haplotype
0,tsk_50_1,75837,tsk_50,1
1,tsk_50_2,45940,tsk_50,2
2,tsk_51_1,45940,tsk_51,1
3,tsk_52_2,57635,tsk_52,2
4,tsk_53_1,107552,tsk_53,1
5,tsk_53_2,178957,tsk_53,2
6,tsk_54_2,107552,tsk_54,2
7,tsk_55_2,107552,tsk_55,2
8,tsk_56_2,71084,tsk_56,2
9,tsk_57_1,75837,tsk_57,1


In [16]:
# total length of the introgressed regions, one row per individual/haplotype combination and segment
res_frac100["total_length_ind_hap_seg"]

Unnamed: 0,individual_haplotype,segment,total_length,individual,haplotype
0,tsk_50_1,10,75837,tsk_50,1
1,tsk_50_2,13,45940,tsk_50,2
2,tsk_51_1,13,45940,tsk_51,1
3,tsk_52_2,5,57635,tsk_52,2
4,tsk_53_1,6,107552,tsk_53,1
5,tsk_53_2,4,178957,tsk_53,2
6,tsk_54_2,6,107552,tsk_54,2
7,tsk_55_2,6,107552,tsk_55,2
8,tsk_56_2,11,71084,tsk_56,2
9,tsk_57_1,10,75837,tsk_57,1


In [17]:
# segments per individual/haplotype (list of segment ids and their number, nr_segments)
res_frac100["segments_per_ind"]

Unnamed: 0,individual_haplotype,segments,nr_segments,individual,haplotype
0,tsk_50_1,[10],1,tsk_50,1
1,tsk_50_2,[13],1,tsk_50,2
2,tsk_51_1,[13],1,tsk_51,1
3,tsk_52_2,[5],1,tsk_52,2
4,tsk_53_1,[6],1,tsk_53,1
5,tsk_53_2,[4],1,tsk_53,2
6,tsk_54_2,[6],1,tsk_54,2
7,tsk_55_2,[6],1,tsk_55,2
8,tsk_56_2,[11],1,tsk_56,2
9,tsk_57_1,[10],1,tsk_57,1


In [18]:
# the records now also carry the combined individual_haplotype label and the interval_length
# (in the output shown for this cell, interval_length equals end - start)
res_frac100["df"]

Unnamed: 0,chrom,start,end,individual,haplotype,nsnps,segment_fraction,segment,all_snps_present,archaic_snps_present,individual_haplotype,interval_length
0,1,238504,314341,tsk_50,1,28,1.0,10,"[238504, 251513, 255766, 257368, 257723, 25802...","[238504, 251513, 255766, 257368, 257723, 25802...",tsk_50_1,75837
1,1,238504,314341,tsk_57,1,28,1.0,10,"[238504, 251513, 255766, 257368, 257723, 25802...","[238504, 251513, 255766, 257368, 257723, 25802...",tsk_57_1,75837
2,1,8413208,8484292,tsk_56,2,29,1.0,11,"[8413208, 8418944, 8423046, 8430926, 8432462, ...","[8413208, 8418944, 8423046, 8430926, 8432462, ...",tsk_56_2,71084
3,1,9931398,10038950,tsk_53,1,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ...",tsk_53_1,107552
4,1,9931398,10038950,tsk_54,2,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ...",tsk_54_2,107552
5,1,9931398,10038950,tsk_55,2,27,1.0,6,"[9931398, 9939286, 9945995, 9946088, 9951559, ...","[9931398, 9939286, 9945995, 9946088, 9951559, ...",tsk_55_2,107552
6,1,11390420,11569377,tsk_53,2,42,1.0,4,"[11390420, 11394987, 11398157, 11407918, 11427...","[11390420, 11394987, 11398157, 11407918, 11427...",tsk_53_2,178957
7,1,17813068,17870703,tsk_52,2,36,1.0,5,"[17813068, 17815969, 17816939, 17817828, 17819...","[17813068, 17815969, 17816939, 17817828, 17819...",tsk_52_2,57635
8,1,20647984,20693924,tsk_50,2,21,1.0,13,"[20647984, 20648821, 20649517, 20649859, 20650...","[20647984, 20648821, 20649517, 20649859, 20650...",tsk_50_2,45940
9,1,20647984,20693924,tsk_51,1,21,1.0,13,"[20647984, 20648821, 20649517, 20649859, 20650...","[20647984, 20648821, 20649517, 20649859, 20650...",tsk_51_1,45940


## Average number of segments per individual

In [19]:
# Add up nr_segments over the haplotypes of each individual; the named aggregation
# produces the sum_nr_segments column directly.
sum_per_individual = (
    res_frac100["segments_per_ind"]
    .groupby("individual")["nr_segments"]
    .agg(sum_nr_segments="sum")
    .reset_index()
)


In [20]:
# Show the number of segments summed per individual.
sum_per_individual


Unnamed: 0,individual,sum_nr_segments
0,tsk_50,2
1,tsk_51,1
2,tsk_52,1
3,tsk_53,2
4,tsk_54,1
5,tsk_55,1
6,tsk_56,1
7,tsk_57,1


In [21]:
# Mean of the per-individual segment counts.
# NOTE(review): sum_per_individual appears to contain only individuals with at least one
# matched segment (8 rows in its displayed output vs. 10 names in tgt_individuals), so
# individuals without any segment do not enter the mean as zeros — confirm this is intended.
avg_segments_per_individual = sum_per_individual["sum_nr_segments"].mean()
print(avg_segments_per_individual)

1.25


## or per chromosome, i.e. per individual haplotype

In [22]:
# Add up nr_segments for every (individual, haplotype) pair; the named aggregation
# produces the sum_nr_segments column directly.
sum_per_individual_and_haplotype = (
    res_frac100["segments_per_ind"]
    .groupby(["individual", "haplotype"])["nr_segments"]
    .agg(sum_nr_segments="sum")
    .reset_index()
)

In [23]:
# Show the number of segments summed per individual and haplotype.
sum_per_individual_and_haplotype

Unnamed: 0,individual,haplotype,sum_nr_segments
0,tsk_50,1,1
1,tsk_50,2,1
2,tsk_51,1,1
3,tsk_52,2,1
4,tsk_53,1,1
5,tsk_53,2,1
6,tsk_54,2,1
7,tsk_55,2,1
8,tsk_56,2,1
9,tsk_57,1,1


In [24]:
# Mean of the per-haplotype segment counts.
# NOTE(review): presumably only haplotypes with at least one matched segment appear in
# sum_per_individual_and_haplotype (10 rows in its displayed output), so haplotypes without
# any segment do not enter the mean as zeros — confirm this is intended.
avg_segments_per_individual_haplotype = sum_per_individual_and_haplotype["sum_nr_segments"].mean()
print(avg_segments_per_individual_haplotype)

1.0
