# Exploring Covid19

In this notebook we consider some string operations on the genomes of Covid19 (the SARS-CoV-2 virus.) The strings used were download from the National Library of Medicine website (https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/). 

In [4]:
def read_FASTA_file(file):
    # preprocessing
    dna_list = file.readlines()

    dict = {}

    curr_key = ''
    curr_string = ''

    for item in dna_list:
        # Line contains label
        if item[0] == '>':
            # store current_key with current_string
            if len(curr_string) != 0:
                dict[curr_key.replace('\n','').replace('>','')] = curr_string.replace('\n','')
                curr_key = ''
                curr_string = ''

            curr_key = item
        # line contains piece of string. 
        else:
            curr_string = curr_string + item

    # Add last pair to dict.
    dict[curr_key.replace('\n', '').replace('>','')] = curr_string.replace('\n', '')
    file.close()
    return dict

In [16]:
# Read strings of file into dict.
file_loc = "C:/Users/Kyle/Downloads/sequences.fasta"

d = read_FASTA_file(open(file_loc,'r'))

In [25]:
def window_motif(seq, window_size):
    """
        Given a sequence and window_size, slide window along seq and count the
        number of times each subsequence appears. 
        
        Return dict with key/value pairs are subseq/probability. 
    """
    d = dict()
    # Count subsequences
    n = len(seq)
    for i in range(n-window_size+1):
        subseq = seq[i:i+window_size]

        if subseq in d:
            d[subseq] = d.get(subseq) + 1
        else:
            d[subseq] = 1

    # normalize    
    for entry in d.items():
        d[entry[0]] = entry[1] / (n-window_size+1)

    return d

In [27]:
results = dict()
for k,v in d.items():
    results[k] = window_motif(v, 3)

In [29]:
for entry in results.items():
    
# Print sorted by decreasing order of probability. 
print(sorted(d.items(), key=lambda count: count[1], reverse=True))

dict_values([{'AGA': 0.019883119500235105, 'GAT': 0.014576476120104789, 'ATC': 0.011083495667360785, 'TCT': 0.01743131591321287, 'CTG': 0.016322966346476793, 'TGT': 0.02821253442600927, 'GTT': 0.02297306374689326, 'TTC': 0.016860347954591254, 'CTC': 0.009370591791495936, 'CTA': 0.018438906428427486, 'TAA': 0.023745549808557802, 'AAA': 0.029320883992745347, 'AAC': 0.020051051252770874, 'ACG': 0.00544098878215893, 'CGA': 0.003157116947672466, 'GAA': 0.017532074964734332, 'ACT': 0.022234164035735877, 'CTT': 0.02408141331362934, 'TTT': 0.0332169006515752, 'TTA': 0.028615570632095116, 'AAT': 0.02502183112782965, 'GTG': 0.01820380197487741, 'TGG': 0.018438906428427486, 'GGC': 0.007456169812588164, 'GCT': 0.017296970511184256, 'GTC': 0.008866796533888627, 'TCA': 0.018170215624370257, 'CAC': 0.015281789480755021, 'TCG': 0.003728084906294082, 'CGG': 0.0025525626385436958, 'TGC': 0.018136629273863104, 'GCA': 0.012393363337139786, 'CAT': 0.016155034593941024, 'ATG': 0.02391348156109357, 'TAG': 0.