In [1]:
from Bio import SeqIO
# To be able to modify sequences
from Bio.Seq import MutableSeq
# To import the databases directly
from database_io import load_db

In [2]:
databases_path = '/home/msr/Documents/Experimentos/Databases/'
reference_genome_path = databases_path + 'GCA_000001405.28/GCA_000001405.28_GRCh38.p13_genomic.fna'
reference_genome = SeqIO.index(reference_genome_path,"fasta")

ensembl_db = load_db(databases_path+'Ensembl_database_ready.csv')

In [3]:
chromosome_21_record = reference_genome["CM000683.2"] # The key is the GenBank identifier
chromosome_21_sequence = chromosome_21_record.seq
# Convert it into a mutable sequence
mutable_chromosome_21 = MutableSeq(chromosome_21_sequence)
# Import the ensembl database through a personalized function and extract the chromosome 21 variants
# In this line we import the database
chromosome_21_variants = ensembl_db.loc[ensembl_db['seqid']=='21']
# Consider that the loci counts start from 1 and Python starts counting from 0

In [4]:
chromosome_21_snvs = chromosome_21_variants.loc[chromosome_21_variants['type']=='SNV']
test = chromosome_21_snvs.iloc[1] # The first element (0) of the dataframe has two alleles

In [5]:
print(test)

seqid                                                                   21
source                                                               dbSNP
type                                                                   SNV
start                                                             14144627
end                                                               14144627
attributes               ID=957930;clinical_significance=benign;Referen...
ID                                                                  957930
Variant_seq                                                              T
Reference_seq                                                            C
Dbxref                                                 dbSNP_154:rs2822432
clinical_significance                                               benign
Name: 676119, dtype: object


In [6]:
ref_seq_test = mutable_chromosome_21

In [7]:
test['start']

'14144627'

In [9]:
ref_seq_test[int(test['start'])-6:int(test['start'])+5]

MutableSeq('cctTTCTGGGT')

In [10]:
ref_subseq_test=ref_seq_test[int(test['start'])-6:int(test['start'])+5]

In [11]:
ref_subseq_test

MutableSeq('cctTTCTGGGT')

In [12]:
ref_subseq_test[0]

'c'

In [13]:
print(test)

seqid                                                                   21
source                                                               dbSNP
type                                                                   SNV
start                                                             14144627
end                                                               14144627
attributes               ID=957930;clinical_significance=benign;Referen...
ID                                                                  957930
Variant_seq                                                              T
Reference_seq                                                            C
Dbxref                                                 dbSNP_154:rs2822432
clinical_significance                                               benign
Name: 676119, dtype: object


In [15]:
ref_subseq_test

MutableSeq('cctTTCTGGGT')

In [18]:
ref_seq_test[14144626]

'C'

In [19]:
muted_seq = ref_seq_test

In [20]:
muted_seq==ref_seq_test

True

In [21]:
muted_seq[14144626]=test['Variant_seq']

In [22]:
muted_subseq = muted_seq[14144621:14144632]

In [23]:
muted_subseq

MutableSeq('cctTTTTGGGT')

In [24]:
muted_seq==ref_seq_test

True

In [25]:
type(muted_seq)

Bio.Seq.MutableSeq

In [26]:
str(muted_seq)==str(ref_seq_test)

True

In [27]:
ref_subseq_test==muted_subseq

False

In [28]:
print(ref_subseq_test+'\n'+muted_subseq)

cctTTCTGGGT
cctTTTTGGGT
