## Nucloetide Skew

In [1]:
def skew(genome:String):List[Int] = {
    genome.foldLeft(List(0)){ case(m, x) => 
        val ret = if (x == 'G') m :+ (m.last+1) 
            else if (x == 'C') m :+ (m.last-1) 
            else m
        ret
    }
}

defined [32mfunction[39m [36mskew[39m

## Hamming distance

In [2]:
def hamming(s1: String, s2: String): Int = s1.zip(s2).count(c => c._1 != c._2)

defined [32mfunction[39m [36mhamming[39m

## Approx pattern matching

In [3]:
def approxMatch(pattern:String, input:String, k:Int=3):List[Int] = {
    input.sliding(pattern.length).zipWithIndex.
        filter{ case(kmer, i) => hamming(kmer, pattern) <= k }.
        map{ _._2 }.toList
}

defined [32mfunction[39m [36mapproxMatch[39m

## Frequent k-mers with mismatches

In [4]:
def neighbours(x:String):List[String] = {
    val bases = "ACTG"
    (0 to x.length-1).map{ i =>
        bases.filter{ _ != x(i) }
            .map{ b => 
                val y = x.toCharArray 
                y(i) = b
                y.mkString("")
            }.toList
    }.toList.flatten.distinct
}
def neighbours(x:String, d:Int):List[String] = if (d == 1) neighbours(x) else {
    neighbours(x).map{ neighbours(_, d-1) }.flatten.distinct
}

defined [32mfunction[39m [36mneighbours[39m
defined [32mfunction[39m [36mneighbours[39m

In [5]:
def approxKmers(x:String, k:Int, d:Int):List[(String, Int)] = {
    val km = x.toList.sliding(k)
    
    km.map{ x => x.mkString("") }.
        map{ kmer => neighbours(kmer, d) ++ List(kmer) }.flatten.toList.
        groupBy{ identity }.map{ case(k, v) => (k, v.length) }.toList
}

defined [32mfunction[39m [36mapproxKmers[39m

In [6]:
def reverseCompliments(x:String):String = x.map{ base => 
    base match {
        case 'A' => 'T'
        case 'T' => 'A'
        case 'G' => 'C'
        case 'C' => 'G'
    }
}.reverse

defined [32mfunction[39m [36mreverseCompliments[39m

## Frequent k-mers with mismatches and reverse compliments

In [7]:
def approxKmersC(x:String, k:Int, d:Int):List[(String, Int)] = {
    val km = x.toList.sliding(k)
    
    km.map{ x => x.mkString("") }.
        map{ kmer => 
            neighbours(kmer, d) ++ List(kmer) ++
            neighbours(kmer, d).map{ reverseCompliments(_) }
        }.flatten.toList.
        foldLeft(Map[String, Int]()) { case(m, x) =>
            m ++ Map(x -> (m.get(x).getOrElse(0) + 1))
        }.toList
}

defined [32mfunction[39m [36mapproxKmersC[39m

## Motif Enumeration in a bunch of DNA strings

In [8]:
def motifEnumeration(dnas:List[String], k:Int, d:Int) = {
    dnas.map{ dna =>
        dna.sliding(k).map{ pattern => neighbours(pattern, d).toSet }.toSet.flatten
    }.reduce{ (a, b) => a.intersect(b) }
}

defined [32mfunction[39m [36mmotifEnumeration[39m

## Median string problem

In [9]:
def kmerGenerator(k:Int, str:String = ""):List[String] = k match {
    case 1 => "ACTG".toList.map{ str + _.toString }
    case k => "ACTG".toList.map{ str + _.toString }.map{ x => kmerGenerator(k-1, x) }.flatten
}

defined [32mfunction[39m [36mkmerGenerator[39m

In [10]:
def medianString(dnas:List[String], k:Int) = {
    kmerGenerator(k).map{ pattern =>
        (
            pattern,
            dnas.map{ dna => dna.sliding(k).map{ kmer => hamming(pattern, kmer) }.min }.sum
        )
    }.sortBy{ _._2 }
}

defined [32mfunction[39m [36mmedianString[39m

## Profile most probable kmer

In [11]:
import scala.math.{ BigDecimal => Big, log10 }
type Matrix = List[List[Big]]

[32mimport [39m[36mscala.math.{ BigDecimal => Big, log10 }
[39m
defined [32mtype[39m [36mMatrix[39m

In [12]:
def profileCreator(x:String):Matrix = {
    x.split("\n").map{ _.split(" ").map{ Big(_) }.toList }.toList
}

defined [32mfunction[39m [36mprofileCreator[39m

In [13]:
def profileKmer(profile:Matrix, n:Int=1):(String, Big) = {
    val prof = "ACGT".zip(profile)
    (0 to n-1).
        map{ c => 
            prof.map{ case(base, prob) => (base, prob(c)) }.
                foldLeft(('x', Big(0.0d))){ case(mem, x) => if (mem._2 > x._2) mem else x }
        }.toList.foldLeft(("", Big(0.0d))){ case(m, x) => (m._1 + x._1, m._2 * x._2) }
}

defined [32mfunction[39m [36mprofileKmer[39m

In [14]:
def kmerProbability(kmer:String, profile:Matrix):Big = {
    profile.transpose.zip(kmer).map{ 
        case(prob, 'A') => prob(0)
        case(prob, 'C') => prob(1)
        case(prob, 'G') => prob(2)
        case(prob, 'T') => prob(3)
    }.reduce{ _ * _ }
}

defined [32mfunction[39m [36mkmerProbability[39m

In [15]:
def profileMostProbableKmer(dna:String, prof:String, k:Int):List[(String, Big)] = {
    val profile = profileCreator(prof)

    dna.sliding(k).map{ kmer =>
        (kmer, kmerProbability(kmer, profile))
    }.toList.sortBy( -_._2 )
}

def profileMostProbableKmer(dna:String, profile:Matrix, k:Int):List[(String, Big)] = {
    dna.sliding(k).map{ kmer =>
        (kmer, kmerProbability(kmer, profile))
    }.toList.sortBy( -_._2 )
}

defined [32mfunction[39m [36mprofileMostProbableKmer[39m
defined [32mfunction[39m [36mprofileMostProbableKmer[39m

## Greedy motif search

In [16]:
def formProfile(x:List[String]):Matrix = {
    x.transpose.
        map{ x => 
            List(
                Big(x.count(_ == 'A').toDouble/4.0d), 
                Big(x.count(_ == 'C').toDouble/4.0d), 
                Big(x.count(_ == 'G').toDouble/4.0d), 
                Big(x.count(_ == 'T').toDouble/4.0d))  
        }.transpose
}

defined [32mfunction[39m [36mformProfile[39m

In [17]:
def score(x:List[String]):Big = {
    val consensus = profileKmer(formProfile(x), x.head.length)._1
    x.map{ motif => Big(hamming(consensus, motif)) }.sum
}

defined [32mfunction[39m [36mscore[39m

In [18]:
def greedyMotifSearch(dnas:List[String], k:Int) = {
    val bestMotifs = dnas.map{ _.take(k) }.toList

    dnas.head.sliding(k).foldLeft(bestMotifs){ case(mm, kmer) =>
        val bestKmers = dnas.tail.foldLeft( List(kmer) ){ case(motifs, dna) =>
            val next = profileMostProbableKmer(dna, formProfile(motifs), k).head._1
            motifs ++ List(next)
        }
        if (score(bestKmers) < score(mm)) bestKmers else mm
    }
}

defined [32mfunction[39m [36mgreedyMotifSearch[39m

## Greedy motif search with Laplacian smoothing

In [19]:
def formProfileLaplacian(x:List[String]):Matrix = {
    x.transpose.
        map{ x => 
            List(
                Big((x.count(_ == 'A').toDouble + 1)/4.0d),
                Big((x.count(_ == 'C').toDouble + 1)/4.0d),
                Big((x.count(_ == 'G').toDouble + 1)/4.0d),
                Big((x.count(_ == 'T').toDouble + 1)/4.0d))
        }.transpose
}

defined [32mfunction[39m [36mformProfileLaplacian[39m

In [20]:
def greedyMotifSearchLaplacian(dnas:List[String], k:Int):List[String] = {
    val bestMotifs = dnas.map{ _.take(k) }.toList

    dnas.head.sliding(k).foldLeft(bestMotifs){ case(mm, kmer) =>
        val bestKmers = dnas.tail.foldLeft( List(kmer) ){ case(motifs, dna) =>
            val next = profileMostProbableKmer(dna, formProfileLaplacian(motifs), k).head._1
            motifs ++ List(next)
        }
        if (score(bestKmers) < score(mm)) bestKmers else mm
    }
}

defined [32mfunction[39m [36mgreedyMotifSearchLaplacian[39m

## Randomized motif search

In [21]:
def genMotifs(dnas:List[String], profile:Matrix, k:Int) = dnas.map{ dna => profileMostProbableKmer(dna, profile, k).head._1 }

defined [32mfunction[39m [36mgenMotifs[39m

In [22]:
def scoreCounts(x:List[String]) = {
    val ret = x.map{ _.toList }.transpose.
        map{ y =>
            "ACGT".map{ b => Big(y.count(_ == b)) }
        }.map{ x => (x.sum - x.max) }

    ret
}.sum

defined [32mfunction[39m [36mscoreCounts[39m

In [23]:
def recursiveMotifSearch(best:List[String], dnas:List[String], k:Int, itr:Int=0):List[String] = {
    val profile = formProfileLaplacian(best)
    val newMotifs = genMotifs(dnas, profile, k)

    if (scoreCounts(newMotifs) >= scoreCounts(best) && itr > 0) best
    else recursiveMotifSearch(newMotifs, dnas, k, itr + 1)
}

defined [32mfunction[39m [36mrecursiveMotifSearch[39m

In [24]:
def randomInit(dnas:List[String], k:Int) = dnas.map{ dna => dna.drop(scala.util.Random.nextInt(dna.length-k)).take(k) }.toList

defined [32mfunction[39m [36mrandomInit[39m

In [25]:
def randomizedMotifSearch(dnas:List[String], k:Int):List[String] = {
    val bestMotifs = randomInit(dnas, k)
    recursiveMotifSearch(bestMotifs, dnas, k)
}

defined [32mfunction[39m [36mrandomizedMotifSearch[39m

In [26]:
def repeatedrandomizedMotifSearch(dnas:List[String], k:Int, best:Big = Big(Int.MaxValue), bestMotifs:List[String] = List[String](), itr:Int=0):List[String] = {
    val motifs = randomizedMotifSearch(dnas, k)
    val s = scoreCounts(motifs)

    print(best.toString + ", ")
    if (s > best && itr >= 100) bestMotifs
    else if (s < best) repeatedrandomizedMotifSearch(dnas, k, s, motifs, 0)
    else repeatedrandomizedMotifSearch(dnas, k, best, bestMotifs, itr+1)
}

defined [32mfunction[39m [36mrepeatedrandomizedMotifSearch[39m

## Gibbs sampling

In [27]:
def gibbs(dnas:List[String], k:Int, bestM:List[String]=List[String](), N:Int):List[String] = {
    val bestMotifs:List[String] = if (bestM.isEmpty) randomInit(dnas, k) else bestM

    val ignore = scala.util.Random.nextInt(dnas.length)
    val motifsWith1Ignored = bestMotifs.zipWithIndex.filter{ case(m, i) => i != ignore }.map{ _._1 }
    val ignProfile = formProfileLaplacian(motifsWith1Ignored)
    val ignMotif = genMotifs(List(dnas(ignore)), ignProfile, k).head
    val motifs = bestMotifs.zipWithIndex.map{ case(m, i) => if (i == ignore) ignMotif else m }
    val best = scoreCounts(bestMotifs)
    val s = scoreCounts(motifs)
    
    if (s >= best && N <= 0) bestMotifs
    else if (s < best) gibbs(dnas, k, motifs, N-1)
    else gibbs(dnas, k, bestMotifs, N-1)
}

defined [32mfunction[39m [36mgibbs[39m

In [28]:
def repeatedGibbs(dnas:List[String], k:Int, bestM:List[String] = List[String](), N:Int=100):List[String] = {
    val bestMotifs:List[String] = if (bestM.isEmpty) randomInit(dnas, k) else bestM
    val motifs = gibbs(dnas, k, N=100)
    val s = scoreCounts(motifs)
    val best = scoreCounts(bestMotifs)
    
    print(best.toString + ", ")
    if (s >= best && N <= 0) bestMotifs
    else if (s < best) repeatedGibbs(dnas, k, motifs, N=N-1)
    else repeatedGibbs(dnas, k, bestMotifs, N=N-1)
}

defined [32mfunction[39m [36mrepeatedGibbs[39m

## String composition

In [29]:
def composition(x:String, k:Int):List[String] = x.sliding(k).toList.sorted

defined [32mfunction[39m [36mcomposition[39m

## Genome construction form genome path

In [30]:
def overlapCoef(x:String, y:String):Int = {
    (0 to x.length).map{ i => if (x.drop(i) == y.take(y.length - i)) i else 0 }.find{ _ > 0 }.getOrElse(0)
}

defined [32mfunction[39m [36moverlapCoef[39m

In [31]:
def fromGenomePath(path:List[String]):String = {
    val overlap = overlapCoef(path.head, path(1))
    val start = path.head.take(path.head.length-overlap)
    start + path.map{ x => x.takeRight(overlap) }.mkString("")
}

defined [32mfunction[39m [36mfromGenomePath[39m

## Overlap graph

In [32]:
def kmerOverlap(x:String, y:String):Int = {
    if (x == y) 0
    else {
        (0 to x.length).map{ i =>
            if ( y.take(i) == x.take(i) || y.takeRight(i) == x.take(i) ) i else 0 }.max
    }
}

defined [32mfunction[39m [36mkmerOverlap[39m

In [33]:
type AdjacencyMatrix = List[List[Int]]

defined [32mtype[39m [36mAdjacencyMatrix[39m

In [34]:
def overlapGraph(kmers:List[String]):AdjacencyMatrix = {
    val overlaps = kmers.par.map{ kmer => kmers.par.map{ kmer2 => kmerOverlap(kmer, kmer2) }.toList }.toList
    val maxOverlap = overlaps.par.map{ _.par.filter{ _ != overlaps.length }.toList.max }.toList.max
    println(maxOverlap)
    overlaps.par.map{ x => x.par.map{ y => if (y >= maxOverlap) 1 else 0 }.toList }.toList
}

defined [32mfunction[39m [36moverlapGraph[39m

In [35]:
def adjacencyList(kmers:List[String]):List[(String, String)] = 
    kmers.map{ k1 => 
        kmers.map{ k2 => (k1, k2) } }.
            flatten.zip(overlapGraph(kmers).flatten).
            filter{ case((k1, k2), o) => k1 != k2 && o > 0 }.
            map{ case((k1, k2), o) => (k2 -> k1) }

defined [32mfunction[39m [36madjacencyList[39m

In [36]:
type AdjacencyList = List[(String, String)]

defined [32mtype[39m [36mAdjacencyList[39m

## De Bruijn graph

In [37]:
def debruijn(motif:String, k:Int) = {
    val kmers = motif.sliding(k).toList
    val nodes = List(kmers.head.take(k-1)) ++ kmers.map{ _.takeRight(k-1) }
    nodes.sliding(2).map{ case List(a, b) => (a -> b) }.toList.
        groupBy{ _._1 }.map{ case(k, v) => (k, v.map{ _._2 }) }
}

defined [32mfunction[39m [36mdebruijn[39m

In [38]:
def debruijn(kmers:List[String]) = {
    val k = kmers.head.length
    val nodes = List(kmers.head.take(k-1)) ++ kmers.map{ _.takeRight(k-1) }.distinct
    kmers.map{ kmer => (kmer.take(k-1), kmer.takeRight(k-1)) }.
        groupBy{ _._1 }.map{ case(a,b) => (a, b.map{ _._2 }) }
}

defined [32mfunction[39m [36mdebruijn[39m

## DPChange

In [39]:
def dpchange(total:Int, values:List[Int], state:List[Int] = List(0)):Int = {
    (1 to total).foldLeft(Map(0 -> 0)){ case(mem, m) =>
        val ret:Int = values.foldLeft(Int.MaxValue){ case(minv, value) =>
            if (m >= value) {
                val v = mem(m - value) + 1
                if (v < mem.get(m).getOrElse(Int.MaxValue) && v < minv) v
                else minv
            } else minv
        }
        mem ++ Map(m -> ret)
    }.toList.sortBy{ _._1 }.last._2
}

defined [32mfunction[39m [36mdpchange[39m

## Manhattan tourist problem

In [40]:
def southOrEast(i:Int, j:Int, weights:Matrix):Big = if (i==0 && j==0) 0 else {
    val x = if (i > 0) southOrEast(i-1, j, weights) + weights(i)(j) else Big(-9999999)
    val y = if (j > 0) southOrEast(i, j-1, weights) + weights(i)(j) else Big(-9999999)
    List(x, y).max
}

defined [32mfunction[39m [36msouthOrEast[39m

In [41]:
import scala.collection.mutable.ArrayBuffer

def manhattanTourist(n:Int, m:Int, down:Matrix, right:Matrix) = {
    val s = ArrayBuffer.fill(n+1, m+1)(Big(0))
    (1 to n).map{ i => s(i)(0) = s(i-1)(0) + down(i-1)(0) }
    (1 to m).map{ i => s(0)(i) = s(0)(i-1) + right(0)(i-1) }

    (1 to n).map{ i => (1 to m).map{ j =>
        s(i)(j) = List( s(i-1)(j) + down(i-1)(j), s(i)(j-1) + right(i)(j-1) ).max
    } }
    s(n)(m)
}

[32mimport [39m[36mscala.collection.mutable.ArrayBuffer

[39m
defined [32mfunction[39m [36mmanhattanTourist[39m

## Backtracking

In [42]:
import scala.collection.mutable.ArrayBuffer

def lcsBacktrack(v:String, w:String):Matrix = {
    val s = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))
    val backtrack = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))

    (1 to v.length).foreach{ i => (1 to w.length).foreach{ j =>
        s(i)(j) = List( s(i-1)(j), s(i)(j-1), s(i-1)(j-1) + (if (v(i-1) == w(j-1)) 1 else 0) ).max

        if (s(i)(j) == s(i-1)(j)) backtrack(i)(j) = Big(1)
        else if (s(i)(j) == s(i)(j-1)) backtrack(i)(j) = Big(-1)
        else if (s(i)(j) == s(i-1)(j-1) + 1 && v(i-1) == w(j-1)) backtrack(i)(j) = Big(0)
    } }
    backtrack.map{ _.toList }.toList
}

[32mimport [39m[36mscala.collection.mutable.ArrayBuffer

[39m
defined [32mfunction[39m [36mlcsBacktrack[39m

## Longest Common Subsequence

In [43]:
def longestCommonSubsequence(backtrack:Matrix, v:String, i:Int, j:Int, mem:String=""):String = {
    if (i == 0 || j == 0) mem.reverse
    else {
        (backtrack(i)(j).toInt) match {
            case 1 => longestCommonSubsequence(backtrack, v, i-1, j, mem)
            case -1 => longestCommonSubsequence(backtrack, v, i, j-1, mem)
            case _ => longestCommonSubsequence(backtrack, v, i-1, j-1, (mem + v(i-1)))
        }
    }
}

defined [32mfunction[39m [36mlongestCommonSubsequence[39m

In [44]:
def lcs(v:String, w:String):String = 
    longestCommonSubsequence(lcsBacktrack(v, w), v, v.length, w.length)

defined [32mfunction[39m [36mlcs[39m

## Eulerian Cycles

In [45]:
def toAdjacencyList(x:String):AdjacencyList = x.
    split("\n").
    map{ x => 
        val r = x.replace(" ", "").split("->")
        r.last.split(",").map{ x => (r.head, x) }
    }.
    flatten.
    toList.
    map{ x => (x._1, x._2.toString) }

defined [32mfunction[39m [36mtoAdjacencyList[39m

In [46]:
import scala.annotation.tailrec
@tailrec
def eulerianCycle(
    graph:AdjacencyList, 
    cur:String="", 
    stack:List[String]=List[String](),
    circuit:List[String]=List[String]()
):List[String] = {

    val curr = if (cur.isEmpty) graph.head._1 else cur
    val neighbours = graph.filter{ case(a,b) => a == curr }
    val hasNeighbours = !neighbours.isEmpty

    if (neighbours.length == 0 && stack.isEmpty) circuit
    else  {
        val circuit2 = if (!hasNeighbours) circuit :+ curr else circuit
        val curr2 = if (!hasNeighbours) stack.last else neighbours.head._2
        val stack2 = if (!hasNeighbours) stack.dropRight(1) else stack :+ curr
        val graph2 = if (!hasNeighbours) graph else graph.filter{ _ != neighbours.head }
        eulerianCycle(graph2, curr2, stack2, circuit2)
    }
}

[32mimport [39m[36mscala.annotation.tailrec
[39m
defined [32mfunction[39m [36meulerianCycle[39m

## Make a DAG balanced

In [47]:
def makeBalanced(graph:AdjacencyList):(AdjacencyList, (String,String)) = {
    val nodes = graph.map{ case(a,b) => List(a, b) }.flatten.distinct
    val ins = graph.groupBy{ _._1 }.map{ case(a,b) => (a -> b.length) }.toMap
    val outs = graph.groupBy{ _._2 }.map{ case(a,b) => (a -> b.length) }.toMap

    if (ins == outs) (graph, ("", ""))
    else {
        val insNeeded = nodes.map{ n => List.fill(outs.get(n).getOrElse(0) - ins.get(n).getOrElse(0))(n) }.flatten
        val outsNeeded = nodes.map{ n => List.fill(ins.get(n).getOrElse(0) - outs.get(n).getOrElse(0))(n) }.flatten
        (graph ++ insNeeded.zip(outsNeeded), insNeeded.zip(outsNeeded).head)
    }
}

defined [32mfunction[39m [36mmakeBalanced[39m

## Eulerian Path

In [48]:
def eulerianPath(graph:AdjacencyList):List[String] = {
    val (balanced, addedEdge) = makeBalanced(graph)
    val cycle = eulerianCycle(balanced).reverse

    val post = cycle.takeWhile(_ != addedEdge._1) :+ addedEdge._1
    val pre = List(addedEdge._2) ++ cycle.reverse.takeWhile(_ != addedEdge._2).reverse
    (pre ++ post)
}

defined [32mfunction[39m [36meulerianPath[39m

## String reconstruction

In [49]:
def reconstruct(kmers:List[String]):String = {
    val graph = debruijn(kmers).map{ case(a, b) => b.map{ bb => (a, bb) }.toList }.flatten.toList
    val path = eulerianPath(graph)
    println(path)
    fromGenomePath(path)
}

defined [32mfunction[39m [36mreconstruct[39m

## Universal string

In [50]:
def binaryKmers(k:Int):List[String] = 
    (0 to k).
        map{ i => (List.fill(i)("0") ++ List.fill(k-i)("1")).permutations.toList.map{ _.mkString("") } }.flatten.toList

defined [32mfunction[39m [36mbinaryKmers[39m

In [51]:
def universalString(k:Int) = {
    val graph = debruijn(binaryKmers(k)).map{ case(a, b) => b.map{ bb => (a, bb) }.toList }.flatten.toList
    val path = eulerianCycle(graph).reverse
    path.foldLeft(""){ case(mem, k) => mem + k.last }
}

defined [32mfunction[39m [36muniversalString[39m

## Global alignment

In [52]:
val aminoAcids = List("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
val blosum62 = """ 4  0 -2 -1 -2  0 -2 -1 -1 -1 -1 -2 -1 -1 -1  1  0  0 -3 -2
 0  9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2
-2 -3  6  2 -3 -1 -1 -3 -1 -4 -3  1 -1  0 -2  0 -1 -3 -4 -3
-1 -4  2  5 -3 -2  0 -3  1 -3 -2  0 -1  2  0  0 -1 -2 -3 -2
-2 -2 -3 -3  6 -3 -1  0 -3  0  0 -3 -4 -3 -3 -2 -2 -1  1  3
 0 -3 -1 -2 -3  6 -2 -4 -2 -4 -3  0 -2 -2 -2  0 -2 -3 -2 -3
-2 -3 -1  0 -1 -2  8 -3 -1 -3 -2  1 -2  0  0 -1 -2 -3 -2  2
-1 -1 -3 -3  0 -4 -3  4 -3  2  1 -3 -3 -3 -3 -2 -1  3 -3 -1
-1 -3 -1  1 -3 -2 -1 -3  5 -2 -1  0 -1  1  2  0 -1 -2 -3 -2
-1 -1 -4 -3  0 -4 -3  2 -2  4  2 -3 -3 -2 -2 -2 -1  1 -2 -1
-1 -1 -3 -2  0 -3 -2  1 -1  2  5 -2 -2  0 -1 -1 -1  1 -1 -1
-2 -3  1  0 -3  0  1 -3  0 -3 -2  6 -2  0  0  1  0 -3 -4 -2
-1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2  7 -1 -2 -1 -1 -2 -4 -3
-1 -3  0  2 -3 -2  0 -3  1 -2  0  0 -1  5  1  0 -1 -2 -2 -1
-1 -3 -2  0 -3 -2  0 -3  2 -2 -1  0 -2  1  5 -1 -1 -3 -3 -2
 1 -1  0  0 -2  0 -1 -2  0 -2 -1  1 -1  0 -1  4  1 -2 -3 -2
 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1  0 -1 -1 -1  1  5  0 -2 -2
 0 -1 -3 -2 -1 -3 -3  3 -2  1  1 -3 -2 -2 -3 -2  0  4 -3 -1
-3 -2 -4 -3  1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11  2
-2 -2 -3 -2  3 -3  2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1  2  7""".
    split("\n").
    map{ _.trim.split("\\s+").map{ Big(_) }.toList.zip(aminoAcids) }.toList.zip(aminoAcids).
    map{ case(row, aa) => row.map{ case(score, aa2) => ((aa2, aa), score) } }.flatten.toMap

[36maminoAcids[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"A"[39m,
  [32m"C"[39m,
  [32m"D"[39m,
  [32m"E"[39m,
  [32m"F"[39m,
  [32m"G"[39m,
  [32m"H"[39m,
  [32m"I"[39m,
  [32m"K"[39m,
  [32m"L"[39m,
  [32m"M"[39m,
[33m...[39m
[36mblosum62[39m: [32mMap[39m[([32mString[39m, [32mString[39m), [32mBigDecimal[39m] = [33mMap[39m(
  ([32m"A"[39m, [32m"N"[39m) -> -2,
  ([32m"Q"[39m, [32m"V"[39m) -> -2,
  ([32m"Y"[39m, [32m"H"[39m) -> 2,
  ([32m"H"[39m, [32m"T"[39m) -> -2,
  ([32m"L"[39m, [32m"V"[39m) -> 1,
  ([32m"N"[39m, [32m"T"[39m) -> 0,
  ([32m"Q"[39m, [32m"C"[39m) -> -3,
  ([32m"H"[39m, [32m"H"[39m) -> 8,
  ([32m"H"[39m, [32m"A"[39m) -> -2,
  ([32m"N"[39m, [32m"N"[39m) -> 6,
  ([32m"W"[39m, [32m"I"[39m) -> -3,
[33m...[39m

In [53]:
val aminoAcids = List("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
val pam250 = """ 2 -2  0  0 -3  1 -1 -1 -1 -2 -1  0  1  0 -2  1  1  0 -6 -3
-2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4  0 -2 -2 -8  0
 0 -5  4  3 -6  1  1 -2  0 -4 -3  2 -1  2 -1  0  0 -2 -7 -4
 0 -5  3  4 -5  0  1 -2  0 -3 -2  1 -1  2 -1  0  0 -2 -7 -4
-3 -4 -6 -5  9 -5 -2  1 -5  2  0 -3 -5 -5 -4 -3 -3 -1  0  7
 1 -3  1  0 -5  5 -2 -3 -2 -4 -3  0  0 -1 -3  1  0 -1 -7 -5
-1 -3  1  1 -2 -2  6 -2  0 -2 -2  2  0  3  2 -1 -1 -2 -3  0
-1 -2 -2 -2  1 -3 -2  5 -2  2  2 -2 -2 -2 -2 -1  0  4 -5 -1
-1 -5  0  0 -5 -2  0 -2  5 -3  0  1 -1  1  3  0  0 -2 -3 -4
-2 -6 -4 -3  2 -4 -2  2 -3  6  4 -3 -3 -2 -3 -3 -2  2 -2 -1
-1 -5 -3 -2  0 -3 -2  2  0  4  6 -2 -2 -1  0 -2 -1  2 -4 -2
 0 -4  2  1 -3  0  2 -2  1 -3 -2  2  0  1  0  1  0 -2 -4 -2
 1 -3 -1 -1 -5  0  0 -2 -1 -3 -2  0  6  0  0  1  0 -1 -6 -5
 0 -5  2  2 -5 -1  3 -2  1 -2 -1  1  0  4  1 -1 -1 -2 -5 -4
-2 -4 -1 -1 -4 -3  2 -2  3 -3  0  0  0  1  6  0 -1 -2  2 -4
 1  0  0  0 -3  1 -1 -1  0 -3 -2  1  1 -1  0  2  1 -1 -2 -3
 1 -2  0  0 -3  0 -1  0  0 -2 -1  0  0 -1 -1  1  3  0 -5 -3
 0 -2 -2 -2 -1 -1 -2  4 -2  2  2 -2 -1 -2 -2 -1  0  4 -6 -2
-6 -8 -7 -7  0 -7 -3 -5 -3 -2 -4 -4 -6 -5  2 -2 -5 -6 17  0
-3  0 -4 -4  7 -5  0 -1 -4 -1 -2 -2 -5 -4 -4 -3 -3 -2  0 10""".
    split("\n").
    map{ _.trim.split("\\s+").map{ Big(_) }.toList.zip(aminoAcids) }.toList.zip(aminoAcids).
    map{ case(row, aa) => row.map{ case(score, aa2) => ((aa2, aa), score) } }.flatten.toMap

[36maminoAcids[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"A"[39m,
  [32m"C"[39m,
  [32m"D"[39m,
  [32m"E"[39m,
  [32m"F"[39m,
  [32m"G"[39m,
  [32m"H"[39m,
  [32m"I"[39m,
  [32m"K"[39m,
  [32m"L"[39m,
  [32m"M"[39m,
[33m...[39m
[36mpam250[39m: [32mMap[39m[([32mString[39m, [32mString[39m), [32mBigDecimal[39m] = [33mMap[39m(
  ([32m"A"[39m, [32m"N"[39m) -> 0,
  ([32m"Q"[39m, [32m"V"[39m) -> -2,
  ([32m"Y"[39m, [32m"H"[39m) -> 0,
  ([32m"H"[39m, [32m"T"[39m) -> -1,
  ([32m"L"[39m, [32m"V"[39m) -> 2,
  ([32m"N"[39m, [32m"T"[39m) -> 0,
  ([32m"Q"[39m, [32m"C"[39m) -> -5,
  ([32m"H"[39m, [32m"H"[39m) -> 6,
  ([32m"H"[39m, [32m"A"[39m) -> -1,
  ([32m"N"[39m, [32m"N"[39m) -> 2,
  ([32m"W"[39m, [32m"I"[39m) -> -5,
[33m...[39m

In [54]:
import scala.collection.mutable.ArrayBuffer

def lcsScoredBacktrack(v:String, w:String, score:Map[(String, String), BigDecimal], indelPenalty:Big):Matrix = {
    val s = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))
    (0 to v.length).foreach{ i => s(i)(0) = -1*i*indelPenalty }
    (0 to w.length).foreach{ i => s(0)(i) = -1*i*indelPenalty }
    val backtrack = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))

    (1 to v.length).foreach{ i => (1 to w.length).foreach{ j =>
        s(i)(j) = List(
            s(i-1)(j) - indelPenalty,
            s(i)(j-1) - indelPenalty,
            s(i-1)(j-1) + score((v(i-1).toString, w(j-1).toString))
        ).max

        if (s(i)(j) + indelPenalty == s(i-1)(j)) backtrack(i)(j) = Big(0)
        else if (s(i)(j) + indelPenalty == s(i)(j-1)) backtrack(i)(j) = Big(1)
        else backtrack(i)(j) = Big(2)
    } }
    println(s(v.length)(w.length))

    backtrack.map{ _.toList }.toList
}

[32mimport [39m[36mscala.collection.mutable.ArrayBuffer

[39m
defined [32mfunction[39m [36mlcsScoredBacktrack[39m

In [55]:
def longestCommonSubsequenceScored(backtrack:Matrix, v:String, w:String, i:Int, j:Int, memV:String="", memW:String=""):(String, String) = {
    if (i == 0 && j == 0) { println(i, j); (("-" * i ) +  memV.reverse, ("-" * j ) +  memW.reverse) }
    else {
        (backtrack(i)(j).toInt) match {
            case 0 => longestCommonSubsequenceScored(backtrack, v, w, i-1, j, (memV + v(i-1)), memW+"-")
            case 1 => longestCommonSubsequenceScored(backtrack, v, w, i, j-1, memV+"-", (memW + w(j-1)))
            case 2 => longestCommonSubsequenceScored(backtrack, v, w, i-1, j-1, (memV + v(i-1)), (memW + w(j-1)) )
        }
    }
}

defined [32mfunction[39m [36mlongestCommonSubsequenceScored[39m

In [56]:
def globalAlignment(v:String, w:String, indel:Int):(String, String) = {
    val graph = lcsScoredBacktrack(v, w, blosum62, indel)
    longestCommonSubsequenceScored(graph, v, w, v.length, w.length)
}

defined [32mfunction[39m [36mglobalAlignment[39m

## Local Alignment

In [57]:
import scala.collection.mutable.ArrayBuffer

def lcsScoredLocalBacktrack(v:String, w:String, score:Map[(String, String), BigDecimal], indelPenalty:Big):(Matrix, (Int, Int)) = {
    val s = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))
//     (0 to v.length).foreach{ i => s(i)(0) = -1*i*indelPenalty }
//     (0 to w.length).foreach{ i => s(0)(i) = -1*i*indelPenalty }
    val backtrack = ArrayBuffer.fill(v.length+1, w.length+1)(Big(0))

    (1 to v.length).foreach{ i => (1 to w.length).foreach{ j =>
        s(i)(j) = List(
            Big(0),
            s(i-1)(j) - indelPenalty,
            s(i)(j-1) - indelPenalty,
            s(i-1)(j-1) + score((v(i-1).toString, w(j-1).toString))
        ).max

        backtrack(i)(j) = 
            if (s(i)(j) + indelPenalty == s(i-1)(j)) Big(0)
            else if (s(i)(j) + indelPenalty == s(i)(j-1)) Big(1)
            else if (s(i)(j) == s(i-1)(j-1) + score((v(i-1).toString, w(j-1).toString))) Big(2)
            else Big(3)
    } }

    val positions = s.map{ _.zipWithIndex }.zipWithIndex.map{ case(r, i) => r.map{ case(d, j) => (d, (i,j)) } }.flatten
    val maxScore = s.map{ _.max }.max
    println(maxScore)
    val startingPositions = positions.filter{ case(x, (i,j)) => x == maxScore }.map{ _._2 }
    
    println(startingPositions)
    (backtrack.map{ _.toList }.toList, startingPositions.head)
}

[32mimport [39m[36mscala.collection.mutable.ArrayBuffer

[39m
defined [32mfunction[39m [36mlcsScoredLocalBacktrack[39m

In [58]:
def longestCommonSubsequenceScoredLocal(backtrack:Matrix, v:String, w:String, i:Int, j:Int, memV:String="", memW:String=""):(String, String) = {
    if (i == 0 || j == 0 || backtrack(i)(j) == 3) { println(i, j); (("-" * (i-1) ) +  memV.reverse, ("-" * (j-1) ) +  memW.reverse) }
    else {
        (backtrack(i)(j).toInt) match {
            case 0 => longestCommonSubsequenceScoredLocal(backtrack, v, w, i-1, j, (memV + v(i-1)), memW+"-")
            case 1 => longestCommonSubsequenceScoredLocal(backtrack, v, w, i, j-1, memV+"-", (memW + w(j-1)))
            case 2 => longestCommonSubsequenceScoredLocal(backtrack, v, w, i-1, j-1, (memV + v(i-1)), (memW + w(j-1)) )
        }
    }
}

defined [32mfunction[39m [36mlongestCommonSubsequenceScoredLocal[39m

In [59]:
def localAlignment(v:String, w:String, indel:Int):(String, String) = {
    val (graph, pos) = lcsScoredLocalBacktrack(v, w, pam250, indel)
//     graph.map{ println }
    longestCommonSubsequenceScoredLocal(graph, v, w, pos._1, pos._2)
}

defined [32mfunction[39m [36mlocalAlignment[39m

In [60]:
val x = "MDIEHHANYWPRCMPAQWTGAQYVASPPRSPMSMGKPMTKKKHRLHMGDFILKFMPYSKMVCLYFRTVWSGFDHQEYDMIKTDMGMMDATQRFHNWEWAKEQPQTRDPHLAISIMWTFMVCSDAFAYAKYYGKTSHGSATAVIENKTSVPQSCTFRREMHDQRITDFNYDPKIMKTLVELDIAYYVPQYPNCREIGEVSSMEKQDPLNTNHKKGWDEGGREPPCYEYALTVHCYMFKASNSDKDLKKQCQECRYQEGREIPPNLTIGCWMFRAKNKMNSADCTPCQYKWSFIRDECYNTFDEQSQYAIICVCHDAPHWTWDGPMVKWCEVIHTESMAHDSFCNHLYSADTEQHFTVVDHGVDKNRYDLHVHPVRAAWHRFSTMEFSCVNEYLPVAMDLWGLGNGHMFMCFPSIRWQNDYCVHRLVIGNSVWNWHITAGIIWLKKHVGERPWCTFHTSHIHFMALYFVLDMSCGYCDIWMSDKYPRYHTIRHSAMNACRGLYWWHVIFARGDKHDTMYADSIISNKRTKEEGGCFNYMKFMARRFVDMYTMEREESKTYVYMQSFHWAYPVKQCCQGHKMASWPYQCKGHVEPSCCFQIYAKPPQHFEWMSWFLKITVFFQALILKWGYLDSTNCWRWRHLYVADVTGIWGYPGTPCWCDHNDGGSQSYSDQEKLIHSWDYLCLHVEKCTFVNYALHTCMCMHFGNSQHIVIRQHQGCHYFVAYGFNPMLQMGWDGKPGDTGVYNMISDLMIDDQDIYWLHNPRKAYTHYAEALLTNVTFLTGAGMHVFTARSQYWKWPMSCSPMEMFMNTAPSELEQCMSQNQQDSGDNQKSRSHDACHLEHEGSAHGCAYDTAIHDPRKMPQWYVVYRNCCSLNRPGAMRTCIWTPQWDINHSTYLPSQ"
val y = "KNKEHHTAMNNIASWQTLYDYMSEVEFSGPPWSQYNFVLEMQKDFKNHKAFFYQMVPIDPYDYAQGCVDISVPKTLNWHASDPIITRAYCCYPGKTGNLHNYQQFWPEMTHWEPQVRECHWACIVAHWYPVWHIDQHILYKVEGAKHMMDCGPPMKIYKRPVPCDVERRASPIPYILYAWNWPKIMPWGNMTYGQKLPERDIYGISIPIANINNRNIQIDPEHRSCKGLVGMDYKWAMQCMTWVLDSNSDKDLKKQCQYCRGLQEIGENDYNNHSMNSADCTPTVFFQNFIRDECYNTDTMKIFYAIICGRFHCNCHDAPHIVRKACNCITWDGPMVITDSFSNHLYDTEQIFTVVDHNHGWPVDKNRIRDLHVHPVRAPQWHRFSQMEFSCVNEYLPNNMFRAMWLWGLGNYHAFMNDWHVGNFPRIRWINDDCVHPDQSTPLVIGNWNWAAIIWLKGNFSRSHIHFVDTINWALYGEGPIVLDCDYRWMSWHKYPRYQTILHGTNYPRNAIPMQNITLSRGMYWWHVIFARGDLHDTMYDIWTMQFSNKRNKEWGGCHNYARRFVDAADCYTSTEREESKTQSFHWAYPVHCENPQNPYCLDECWSQGHKMASWPYQCKGHVEPSCCEWMSWFLKITVFFQALILKWLSWVCLAIYVDSFNCWRWRHLYVNAFAEFVLFKCNYDHIQSACIWMKMDENQQHEYYMVNKCGARSPKQAEEQGCSFTIHCRASRGTFFTCSQKKWFIFNATIEDLMFHATSEKIILSEYIFQRHVDTQHGDAQTERYRVDYCDDHNTRHPHQVRICMQKHHDHCTKHVQLLTTKMACDKTPEVNLDIIRKSESTFINVQKHKSPMEEMNMRAMNGADCRKVRYLTMHDGHCYMDNRFTAPCLLWVTKSPPSGIPWGIVNWLNCYSFMLGWEDISQKP"
localAlignment(x, y, 5)

1261
ArrayBuffer((874,916))
(1,7)


[36mx[39m: [32mString[39m = [32m"MDIEHHANYWPRCMPAQWTGAQYVASPPRSPMSMGKPMTKKKHRLHMGDFILKFMPYSKMVCLYFRTVWSGFDHQEYDMIKTDMGMMDATQRFHNWEWAKEQPQTRDPHLAISIMWTFMVCSDAFAYAKYYGKTSHGSATAVIENKTSVPQSCTFRREMHDQRITDFNYDPKIMKTLVELDIAYYVPQYPNCREIGEVSSMEKQDPLNTNHKKGWDEGGREPPCYEYALTVHCYMFKASNSDKDLKKQCQECRYQEGREIPPNLTIGCWMFRAKNKMNSADCTPCQYKWSFIRDECYNTFDEQSQYAIICVCHDAPHWTWDGPMVKWCEVIHTESMAHDSFCNHLYSADTEQHFTVVDHGVDKNRYDLHVHPVRAAWHRFSTMEFSCVNEYLPVAMDLWGLGNGHMFMCFPSIRWQNDYCVHRLVIGNSVWNWHITAGIIWLKKHVGERPWCTFHTSHIHFMALYFVLDMSCGYCDIWMSDKYPRYHTIRHSAMNACRGLYWWHVIFARGDKHDTMYADSIISNKRTKEEGGCFNYMKFMARRFVDMYTMEREESKTYVYMQSFHWAYPVKQCCQGHKMASWPYQCKGHVEPSCCFQIYAKPPQHFEWMSWFLKITVFFQALILKWGYLDSTNCWRWRHLYVADVTGIWGYPGTPCWCDHNDGGSQSYSDQEKLIHSWDYLCLHVEKCTFVNYALHTCMCMHFGNSQHIVIRQHQGCHYFVAYGFNPMLQMGWDGKPGDTGVYNMISDLMIDDQDIYWLHNPRKAYTHYAEALLTNVTFLTGAGMHVFTARSQYWKWPMSCSPMEMFMNTAPSELEQCMSQNQQDSGDNQKSRSHDACHLEHEGSAHGCAYDTAIHDPRKMPQWYVVYRNCCSLNRPGAMRTCIWTPQWDINHSTYLPSQ"[39m
[36my[39m: [32mString[39m = [32m"KNKEHHTAMNNIASWQT

In [60]:
println(res66_2._1)
println(res66_2._2)

cmd60.sc:1: not found: value res66_2
val res60_0 = println(res66_2._1)
                      ^cmd60.sc:2: not found: value res66_2
val res60_1 = println(res66_2._2)
                      ^

: 

## Translation and transcription

In [61]:
interp.load.ivy("org.biojava" % "biojava-core" % "5.0.0-alpha8")

In [62]:
import org.biojava.nbio.core.sequence.{ DNASequence, RNASequence, ProteinSequence }

[32mimport [39m[36morg.biojava.nbio.core.sequence.{ DNASequence, RNASequence, ProteinSequence }[39m

In [63]:
def translate(x:String):String = (new RNASequence(x)).getProteinSequence.toString

defined [32mfunction[39m [36mtranslate[39m

In [64]:
def transcribe(x:String):String = (new DNASequence(x)).getRNASequence.toString

defined [32mfunction[39m [36mtranscribe[39m

## Peptide encoding

In [65]:
def peptideEncodersN(dna:String, peptide:String):List[String] = {
    val dna1 = dna.drop(1)
    val dna2 = dna.drop(2)
    List(dna, dna1, dna2).map{ d =>
        d.sliding(peptide.length*3).
            toList.filter{ x => x.length == (peptide.length*3) && translate(transcribe(x)) == peptide }.
            toList
    }
}.flatten

def peptideEncodersRev(d:String, peptide:String):List[String] = {
    val dna = reverseCompliments(d)
    val dna1 = dna.drop(1)
    val dna2 = dna.drop(2)
    List(dna, dna1, dna2).map{ d =>
        d.sliding(peptide.length*3).
            toList.filter{ x => x.length == (peptide.length*3) && translate(transcribe(x)) == peptide }.
            toList.map{ reverseCompliments(_) }
    }
}.flatten

def peptideEncoders(dna:String, peptide:String):List[String] =
    (peptideEncodersN(dna, peptide) ++ peptideEncodersRev(dna, peptide)).distinct

defined [32mfunction[39m [36mpeptideEncodersN[39m
defined [32mfunction[39m [36mpeptideEncodersRev[39m
defined [32mfunction[39m [36mpeptideEncoders[39m

## Cyclopeptides

In [66]:
val peptideMass = Map(
    "A" -> Big(71),
    "R" -> Big(156),
    "N" -> Big(114),
    "D" -> Big(115),
    "C" -> Big(103),
    "E" -> Big(129),
    "Q" -> Big(128),
    "G" -> Big(57),
    "H" -> Big(137),
    "I" -> Big(113),
    "L" -> Big(113),
    "K" -> Big(128),
    "M" -> Big(131),
    "F" -> Big(147),
    "P" -> Big(97),
    "S" -> Big(87),
    "T" -> Big(101),
    "W" -> Big(186),
    "Y" -> Big(163),
    "V" -> Big(99)
)
val peptides = peptideMass.keys.toList

def massOf(x:String):Big = x.map{ c => peptideMass(c.toString) }.sum

[36mpeptideMass[39m: [32mMap[39m[[32mString[39m, [32mBigDecimal[39m] = [33mMap[39m(
  [32m"E"[39m -> 129,
  [32m"N"[39m -> 114,
  [32m"T"[39m -> 101,
  [32m"Y"[39m -> 163,
  [32m"F"[39m -> 147,
  [32m"A"[39m -> 71,
  [32m"M"[39m -> 131,
  [32m"I"[39m -> 113,
  [32m"G"[39m -> 57,
  [32m"V"[39m -> 99,
  [32m"Q"[39m -> 128,
[33m...[39m
[36mpeptides[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"E"[39m,
  [32m"N"[39m,
  [32m"T"[39m,
  [32m"Y"[39m,
  [32m"F"[39m,
  [32m"A"[39m,
  [32m"M"[39m,
  [32m"I"[39m,
  [32m"G"[39m,
  [32m"V"[39m,
  [32m"Q"[39m,
[33m...[39m
defined [32mfunction[39m [36mmassOf[39m

In [67]:
implicit class CyclicSliding[A](x:List[A]) {
    def cyclicSliding(a:Int, b:Int=1):Iterator[List[A]] = (x ++ x.take(a-b)).sliding(a, b)
}

defined [32mclass[39m [36mCyclicSliding[39m

In [68]:
"NQEL".toList.cyclicSliding(1).toList

[36mres67[39m: [32mList[39m[[32mList[39m[[32mChar[39m]] = [33mList[39m([33mList[39m([32m'N'[39m), [33mList[39m([32m'Q'[39m), [33mList[39m([32m'E'[39m), [33mList[39m([32m'L'[39m))

In [69]:
def spectrumOf(x:String):List[(String, Big)] = {
    (1 to x.length-1).map{ c =>
        val y = x.toList.cyclicSliding(c).map{ _.mkString("") }.toList
        y.map{ yy => (yy, massOf(yy)) }
    }.toList.flatten :+ (x, massOf(x))
}.sortBy{ _._2 }

defined [32mfunction[39m [36mspectrumOf[39m

In [70]:
translate("CCUCGUACAGAAAUCAAC")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


[36mres69[39m: [32mString[39m = [32m"PRTEIN"[39m

In [71]:
spectrumOf("ETC").map{_._2}.mkString(" ") == "71 99 101 103 128 129 199 200 204 227 230 231 298 303 328 330 332 333"

[36mres70[39m: [32mBoolean[39m = [32mfalse[39m

In [72]:
massOf("W")

[36mres71[39m: [32mBigDecimal[39m = 186

## Cyclopeptide scoring

In [99]:
def linearSpectrumOf(x:String):List[Big] = {
    val prefixMass = ArrayBuffer.fill(x.length+1)(Big(0))
    var linearSpectrum = List(Big(0))

    x.toList.zipWithIndex.foreach{ case(p, i) =>
        val pep = peptides.filter{ _ == p.toString }.head
        prefixMass(i+1) = prefixMass(i) + massOf(pep)
    }

    x.toList.zipWithIndex.foreach{ case(p, i) => ((i) to x.length).map{ j =>
        linearSpectrum :+= (prefixMass(j) - prefixMass(i))
    } }

    Big(0) +: linearSpectrum.sorted.dropWhile{ _ == 0 }
}

defined [32mfunction[39m [36mlinearSpectrumOf[39m

In [74]:
def cyclicSpectrumOf(x:String):List[Big] = {
    val prefixMass = ArrayBuffer.fill(x.length+1)(Big(0))
    var cycSpectrum = List(Big(0))
    
    x.toList.zipWithIndex.foreach{ case(p, i) =>
        val pep = peptides.filter{ _ == p.toString }.head
        prefixMass(i+1) = prefixMass(i) + massOf(pep)
    }

    var peptideMass = prefixMass(x.length)
    x.toList.zipWithIndex.foreach{ case(p, i) => ((i) to x.length).map{ j =>
        cycSpectrum :+= (prefixMass(j) - prefixMass(i))
        if (i > 0 && j < x.length)
            cycSpectrum :+= (peptideMass - (prefixMass(j) - prefixMass(i)))
    } }

    List(Big(0)) ++ cycSpectrum.sorted.dropWhile(_ == Big(0))
}

defined [32mfunction[39m [36mcyclicSpectrumOf[39m

In [107]:
// def cycScore(pep:String, spectrum:String):Big = {
//     Big((spectrumOf(pep).map{ _._2.toString } :+ "0").intersect(spectrum.split(" ")).size)
// }

def cycScore(pep:String, spectrum:String):Big = {
    Big((linearSpectrumOf(pep)).intersect(spectrum.split(" ").map{ x => Big(x) }).size)
}

defined [32mfunction[39m [36mcycScore[39m

## Cyclopeptide sequencing

In [117]:
def beamCyclopeptideSequencing(spectrum:String, N:Int=10, best:String="", candidates:List[String]=List("")):String = {
  val parentMass = Big(spectrum.split(" ").last.toInt)

  if (candidates.isEmpty) best 
  else {
    val expanded = peptides.map{ peptide => 
      candidates.par.map{ leader =>
        (leader+peptide, cycScore(leader+peptide, spectrum))
      }.toList
    }.flatten
      .filter{ case(leader, score) => massOf(leader) <= parentMass }
      .sortBy{ -_._2 }
    println(expanded.length)

    val possible = expanded.filter{ case(leader, score) => massOf(leader) == parentMass }.sortBy{ -_._2 }.headOption.getOrElse( ("", Big(0)) )._1
    val newCandidateMinScore = expanded.take(N).takeRight(1).map{ _._2 }.sum
    val newCandidate = expanded.takeWhile{ case(leader, score) => score >= newCandidateMinScore }.map{ _._1 }

    beamCyclopeptideSequencing(spectrum, N, possible, (if (possible.isEmpty) newCandidate else List() ))
  }
}

defined [32mfunction[39m [36mbeamCyclopeptideSequencing[39m