# Sequences Lab

### by Jonathan Fischer and Courtney Rauchman

In [None]:
from datascience import *
import numpy as np
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
import scipy.stats as stats
import scipy as scipy
import pandas as pd
plt.style.use('fivethirtyeight')
from client.api.notebook import Notebook

## Reference Assembly

In [None]:
# Suppose an experiment has produced the 8 reads listed below, and that we know they
# cover the coding sequence of a gene that is 45 nucleotides long.

reads = [
    'ATGCTAGAAA',
    'GGAGACTGCT',
    'AGTTAGTCAT',
    'CCATAGCTGA',
    'AGAAACGGCT',
    'CGGCTAGTTA',
    'CTGCTCCATA',
    'GTCATGGAGA',
]

In [None]:
# We want to use these reads to obtain the actual sequence of the entire gene. Let's assume that consecutive
# reads overlap by exactly 5 bp, and that each overlap involves the last 5 nt of one read and the first 5 nt
# of the next. By stitching the reads together at these overlaps, we can thus obtain the entire gene.

# Assemble the reads to yield the sequence of the gene. Call this value full_seq

# First extract the first 5 bp (prefix) and the last 5 bp (suffix) of each raw read and populate the respective lists
prefixes = [? for ? in ?]
suffixes = [? for ? in ?]

# Initialize a matrix of zeros in which to store which reads overlap
# (one row per prefix, one column per suffix)
match_mat = np.zeros((len(prefixes), len(suffixes)))
# Iterate over prefixes and suffixes to identify when the prefix of read i equals the suffix of read j
for i in np.arange(?):
    for j in np.arange(?):
        ?[i,j] = ?
        
print(match_mat)

# NOTE(review): np.argwhere returns a 2-D array of indices, so int(...) below only works when exactly
# one entry satisfies the condition; newer NumPy releases also deprecate int() on arrays with
# ndim > 0 -- TODO confirm against the NumPy version used for this lab.
ind = int(np.argwhere(?)) # find the read whose prefix doesn't match any suffixes
full_seq = ?              # initialize the full sequence with that read
# Iterate over the number of remaining sequences, identifying which read's prefix matches the current suffix
for i in np.arange(?):
    ind = int(np.argwhere(?))
    full_seq += ?
    
# Display the assembled sequence as the cell's output
full_seq

## Read mapping

In [None]:
# Now let's assume we already have the reference sequences for two different genes and want to quantify
# their expression levels. We perform an experiment and get a set of reads, some of which come from these 
# genes. We can estimate each gene's expression by counting the number of reads which match a subsequence
# of the respective reference. We only want to count reads which uniquely align to one gene or the other.

# Load the data

# Reference sequences (45 nt each)
gene_1 = 'ATGCTAGAAACGGCTAGTTATTCATGGAGACTGCTCCATAGCTGA'
gene_2 = 'ATGTATGATCTCCAGGTATTCACGCAGTGCTCGCCTTACTTATAG'

# Reads from experiment (each read is 5 nt long; the same read may appear more than once)
experimental_reads = [
 'TCATG','CGGCT','TGGAG','GCTCC','ACTGC','TTATT','TATTC','AGAAA','AACGG','AGCTG','CTAGT','GCTCC','ATGGA',
 'AACGG','TAGAA','ACGGC','ATAGC','GGAGA','TAGTT','CGGCT','ACTGC','GGCTA','GCTAG','GCTAG','AGTTA','TGCTA',
 'TTATT','TGCTA','GCTAG','AGAAA','AACGG','ATAGC','TTATT','TCCAG','TACTT','TGATC','CTTAT','TGATC','TTATA',
 'TCGCC','CTCCA','TGATC','CAGTG','TTATA','GCAGT','AGGTA','GCAGT','TGTAT','GCCTT','ATTCA','ATGAT','CTCCA',
 'CCTTA','GATCT','CAGGT','TCCAG','ACTTA','AATAT','ATGAA','TGTGG','GGTGC','GCAAG','CTACC','CACCG','TGGGG',
 'TAGGT','CTGCC','AGGCT','GGTAG','GGCGT','AGTGA','AATGT','CGTCG','CGTCC','GTACC','AGGGA','ATCGG','CTCTT',
 'CCGGA','AATGG','CTTTC','TAAAT','TATAA','GGTTA','AAAGG','TCAGG','GCATT']


In [None]:
# Write a function (read_map_check) that takes two sequences, one test and one reference, as input and 
# checks whether the test sequence aligns to the reference, i.e. whether it matches some subsequence of it.
#
# Parameters:
#   test_seq - the sequence (e.g. an experimental read) to look for
#   ref_seq  - the reference sequence to search within
# Returns:
#   map_flag - the result of the check; intended to be truthy when test_seq matches a
#              subsequence of ref_seq and falsy otherwise

def read_map_check(test_seq, ref_seq):
    map_flag = ?        # Does test_seq match any subsequence of ref_seq?
    return(map_flag)

In [None]:
# Now apply your function to the set of experimental reads to produce the expression counts for each gene.
# Remember that only reads mapping uniquely to a given sequence will get counted: a read that maps to
# both genes, or to neither, should leave both counters unchanged.

# Initialize counters at zero
gene_1_hits = 0
gene_2_hits = 0

# For each experimental read, check whether it maps to a sequence in gene_1 or gene_2.
# If it maps uniquely to one gene, increase that gene's count by 1.
# The two `if` statements below are independent; each one handles one gene.
for i in ?:
    m1 = ?(?, ?)             # does the read map to gene 1?
    m2 = ?(?, ?)             # does the read map to gene 2?
    if ? and not ?:          # if unique to gene 1
        gene_1_hits += ?     # update gene 1 counts
    if ? and not ?:          # if unique to gene 2
        gene_2_hits += ?     # update gene 2 counts

# Display both counts as the cell's output
gene_1_hits, gene_2_hits  

## Comparing genetic sequences

In [None]:
# A simple way to compare the similarity of genetic sequences is to compute a quantity known as the Hamming distance.
# This is the number of positions at which two sequences of equal length differ; e.g., d_H('ACG', 'ATG') = 1, 
# d_H('TAG', 'TAG') = 0, etc. 

# We will now compare the differences in the same gene observed in two "populations". First, load the provided data.
# Each population is a list of 5 sequences.

data_pop_1 = ['ATGCTAGAAAGGGCTAGTTAGACATGGAGACGGCTCCATAGCTGA', 'ATGCTAGTAACGGCTAGTTTGTCATGGAGACTGCTCCATACCTGA', 'ATGCTAGATACGGCTAGTTACTCATGGAGAGAGCTCCATAGCTGA', 'ATGCTAGTTACGGCTACTTAGTCATCGAGACTGCACCATAGCTGA', 'ATGCTAGAATCGGGTAGATAGTCATGGAGACAGCTCCATAGCTGA']
data_pop_2 = ['ATGCAAGCAACGGCTTGTTTTTCATGGACACTGATCCATTGCTGA', 'ATGCATGGAAGGGCTAGTTATTCTTGCAGACTGATCCATTGCTGA', 'ATGCAAGCAACGGCTAGTTATTCTTGGACACTGATGCATTGCTGA', 'ATGCAAGGAACGGCAAGTTAATCATGGTGACTCATCCATTGCTGA', 'ATGCAAGGAAGGGCTAGATATTCGTGGAGAGTGATCGATTGCTGA']


In [None]:
# Write a function which takes two sequences as input and outputs the Hamming distance between them.
#
# Parameters:
#   seq_1, seq_2 - the two sequences to compare
# Returns:
#   d - the accumulated count of differing positions when the length check passes.
#       NOTE: when the length check fails, the function does not raise; it returns the
#       string 'Error: sequences are not of same length' instead of a number, so callers
#       that expect a numeric distance should only pass sequences of equal length.

def compute_hamming_dist(seq_1, seq_2):
    # check that they are the same length
    if ?:
        d = 'Error: sequences are not of same length'
    else:
        # initialize the distance to 0
        d = ?
        # count how many locations differ between the two
        for n in np.arange(?):
            d += ?
    return(d)

In [None]:
# Construct a matrix D in which D_{ij} = d_H(s_i, s_j) for sequences s_i and s_j,
# where d_H is the Hamming distance (see compute_hamming_dist).

# Combine the data from the different populations into a single collection of sequences
merged_data = ?

# Initialize a distance matrix of all zeros (one row and one column per sequence)
D = np.zeros( (?, ?) )

# Iterate over the sequences to find the Hamming distances between each pair
for i in np.arange(?):
    for j in np.arange(?):
        d_ij = ?
        D[?,?] = ?
    
# Display the distance matrix as the cell's output
D

In [None]:
# Use MDS to examine whether the individuals cluster in any noticeable way

# You don't need to edit this part
# D is passed as a precomputed dissimilarity matrix; random_state is fixed so the
# projection is reproducible, and n_components=2 gives two coordinates per individual.
from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state = 100200300, dissimilarity="precomputed")
proj = mds.fit_transform(D)

# Now make a scatter plot of the MDS projection (proj) with the points colored by population.
# The legend labels below are listed in the order 'Pop 1', 'Pop 2', so population 1 must be plotted first.
plt.scatter(?, ?) # plot the points corresponding to population 1
plt.scatter(?, ?) # plot the points corresponding to population 2
plt.xlabel('Coordinate 1')
plt.ylabel('Coordinate 2')
plt.title('MDS of individuals based on genetic sequences')
plt.legend(['Pop 1', 'Pop 2'], loc = 'lower right')
plt.show()

## To submit

In [None]:
# Create the Notebook object (imported from client.api.notebook at the top of this file)
# from this lab's .ok configuration file, then authenticate.
# NOTE(review): assigning the result to `_` presumably just keeps it from being echoed as cell output -- confirm.
ok = Notebook('Lab03_Sequences.ok')
_ = ok.auth(inline=True)

In [None]:
# Submit the assignment. This uses the `ok` object created in the cell above, so run that cell first.
_ = ok.submit()