## Nucleotide Skew

In [1]:
/** Running G-C skew of `genome`.
  *
  * Element `i` of the result is (#G - #C) over the first `i` characters, so the
  * list starts with 0 and always has `genome.length + 1` entries. Characters
  * other than 'G' and 'C' carry the previous value forward, which keeps list
  * indices aligned with genome positions.
  *
  * @param genome sequence to scan
  * @return the skew before the first character and after each character
  */
def skew(genome:String):List[Int] = {
    // scanLeft emits one running total per character in a single pass.
    genome.toList.scanLeft(0){ (acc, base) =>
        base match {
            case 'G' => acc + 1
            case 'C' => acc - 1
            case _   => acc
        }
    }
}

defined [32mfunction[39m [36mskew[39m

## Hamming distance

In [2]:
/** Number of positions at which `s1` and `s2` hold different characters.
  *
  * Only the overlapping prefix is compared; positions past the end of the
  * shorter string are ignored.
  */
def hamming(s1: String, s2: String): Int = {
    val overlap = math.min(s1.length, s2.length)
    (0 until overlap).count{ i => s1.charAt(i) != s2.charAt(i) }
}

defined [32mfunction[39m [36mhamming[39m

## Approx pattern matching

In [3]:
/** Start indices in `input` where `pattern` occurs with at most `k` mismatches.
  *
  * @param pattern the string searched for
  * @param input   the text scanned with a window of `pattern.length`
  * @param k       maximum Hamming distance accepted (default 3)
  * @return matching start positions in ascending order
  */
def approxMatch(pattern:String, input:String, k:Int=3):List[Int] = {
    input.sliding(pattern.length).zipWithIndex.collect{
        // sliding yields one truncated window when input is shorter than the
        // pattern; hamming would compare only its prefix, so skip it.
        case(window, start) if window.length == pattern.length && hamming(window, pattern) <= k => start
    }.toList
}

defined [32mfunction[39m [36mapproxMatch[39m

## Frequent k-mers with mismatches

In [4]:
/** All strings obtained from `x` by replacing exactly one position with a
  * different base from "ACTG". `x` itself is never part of the result.
  *
  * Results are ordered by position, then by base in "ACTG" order.
  */
def neighbours(x:String):List[String] = {
    val bases = "ACTG".toList
    val variants = for {
        i <- x.indices.toList
        b <- bases if b != x(i)
    } yield x.substring(0, i) + b + x.substring(i + 1)
    variants.distinct
}

/** Strings reachable from `x` through `d` successive single-base substitutions.
  *
  * For `d == 1` this is `neighbours(x)`, which excludes `x`. For `d >= 2` a
  * later substitution may revert an earlier one, so the result can also
  * contain `x` itself and strings fewer than `d` mismatches away. For
  * `d <= 0` no substitution is applied and the result is `List(x)`; the
  * recursion previously never terminated for such values.
  */
def neighbours(x:String, d:Int):List[String] =
    if (d <= 0) List(x)
    else if (d == 1) neighbours(x)
    else neighbours(x).flatMap{ neighbours(_, d-1) }.distinct

defined [32mfunction[39m [36mneighbours[39m
defined [32mfunction[39m [36mneighbours[39m

In [5]:
/** Counts, for every candidate pattern, how many length-`k` windows of `x`
  * have that pattern in their mismatch neighbourhood (as produced by
  * `neighbours(window, d)`, plus the window itself).
  *
  * @param x text to scan
  * @param k window length
  * @param d mismatch budget passed to `neighbours`
  * @return (pattern, count) pairs in no particular order
  */
def approxKmers(x:String, k:Int, d:Int):List[(String, Int)] = {
    val windows = x.toList.sliding(k).map{ _.mkString("") }

    val candidates = windows.flatMap{ kmer =>
        // For d >= 2 neighbours(kmer, d) already contains kmer; distinct keeps
        // each window from voting twice for itself.
        (neighbours(kmer, d) :+ kmer).distinct
    }.toList

    candidates.groupBy{ identity }.map{ case(pattern, hits) => (pattern, hits.length) }.toList
}

defined [32mfunction[39m [36mapproxKmers[39m

In [6]:
/** Reverse complement of a DNA string: every base is swapped with its pair
  * (A<->T, G<->C) and the result is reversed. Any other character raises a
  * MatchError.
  */
def reverseCompliments(x:String):String = {
    val complement: Char => Char = {
        case 'A' => 'T'
        case 'T' => 'A'
        case 'G' => 'C'
        case 'C' => 'G'
    }
    x.map(complement).reverse
}

defined [32mfunction[39m [36mreverseCompliments[39m

## Frequent k-mers with mismatches and reverse complements

In [7]:
/** Like `approxKmers`, but every window also votes for the reverse complement
  * of each pattern in its neighbourhood.
  *
  * @param x text to scan
  * @param k window length
  * @param d mismatch budget passed to `neighbours`
  * @return (pattern, count) pairs in no particular order
  */
def approxKmersC(x:String, k:Int, d:Int):List[(String, Int)] = {
    val windows = x.toList.sliding(k).map{ _.mkString("") }

    val candidates = windows.flatMap{ kmer =>
        // Neighbourhood including the window itself, computed once. distinct
        // avoids the double vote when neighbours(kmer, d) already holds kmer.
        val hood = (neighbours(kmer, d) :+ kmer).distinct
        // Mirror the whole neighbourhood so the reverse complement of the
        // window itself is counted as well.
        hood ++ hood.map{ reverseCompliments(_) }
    }

    candidates.foldLeft(Map[String, Int]()){ (counts, pattern) =>
        counts.updated(pattern, counts.getOrElse(pattern, 0) + 1)
    }.toList
}

defined [32mfunction[39m [36mapproxKmersC[39m

## Motif Enumeration in a bunch of DNA strings

In [8]:
/** Patterns that lie in the mismatch neighbourhood of at least one length-`k`
  * window of every string in `dnas`.
  *
  * @param dnas strings to search
  * @param k    window length
  * @param d    mismatch budget passed to `neighbours`
  * @return the patterns shared by all strings; empty when `dnas` is empty
  */
def motifEnumeration(dnas:List[String], k:Int, d:Int):Set[String] = {
    val candidatesPerDna = dnas.map{ dna =>
        dna.sliding(k).flatMap{ pattern =>
            // neighbours(pattern, 1) leaves the pattern itself out, so add it
            // explicitly: an exact occurrence must count too.
            neighbours(pattern, d).toSet + pattern
        }.toSet
    }
    // reduceOption: no strings means no shared motifs rather than an exception.
    candidatesPerDna.reduceOption{ _ intersect _ }.getOrElse(Set.empty[String])
}

defined [32mfunction[39m [36mmotifEnumeration[39m

## Median string problem

In [9]:
/** Every string made of `str` followed by `k` bases drawn from "ACTG".
  *
  * Results are ordered with the leftmost appended base varying slowest, each
  * position cycling through A, C, T, G.
  *
  * @param k   number of bases to append; must be non-negative
  * @param str prefix shared by all results (default empty)
  * @return the 4^k extensions of `str`; `List(str)` when `k == 0`
  */
def kmerGenerator(k:Int, str:String = ""):List[String] = {
    // Without this guard k <= 0 never reached the base case and recursed forever.
    require(k >= 0, s"k must be non-negative, got $k")
    if (k == 0) List(str)
    else "ACTG".toList.flatMap{ base => kmerGenerator(k - 1, str + base) }
}

defined [32mfunction[39m [36mkmerGenerator[39m

In [10]:
/** Ranks every k-mer from `kmerGenerator(k)` by its total distance to `dnas`.
  *
  * The distance to one string is the smallest Hamming distance between the
  * pattern and any length-`k` window of that string; the total is the sum
  * over all strings. Pairs are returned in ascending order of total.
  */
def medianString(dnas:List[String], k:Int) = {
    val scored = for (pattern <- kmerGenerator(k)) yield {
        val perDna = dnas.map{ dna =>
            dna.sliding(k).map{ hamming(pattern, _) }.min
        }
        (pattern, perDna.sum)
    }
    scored.sortBy{ case(_, total) => total }
}

defined [32mfunction[39m [36mmedianString[39m

## Profile most probable kmer

In [11]:
import scala.math.{ BigDecimal => Big, log10 }
// Profile matrix of probabilities. The functions in this file index it as
// profile(row)(column): rows follow the base order A, C, G, T and each column
// is one position of a k-mer.
type Matrix = List[List[Big]]

[32mimport [39m[36mscala.math.{ BigDecimal => Big, log10 }
[39m
defined [32mtype[39m [36mMatrix[39m

In [12]:
/** Parses a textual profile: one row per line, numbers separated by whitespace.
  *
  * Blank lines, surrounding whitespace, repeated separators and CRLF line
  * endings are tolerated; previously any of these produced an empty token and
  * a NumberFormatException.
  *
  * @param x the text to parse
  * @return one inner list per non-blank line
  */
def profileCreator(x:String):Matrix = {
    x.split("\r?\n").toList.
        map{ _.trim }.
        filter{ _.nonEmpty }.
        map{ row => row.split("\\s+").map{ Big(_) }.toList }
}

defined [32mfunction[39m [36mprofileCreator[39m

In [13]:
/** Consensus string of the first `n` columns of `profile`, together with the
  * product of the probabilities of the chosen bases.
  *
  * Rows of `profile` are paired with the bases A, C, G, T in that order. In
  * each column the base with the largest probability wins; on ties the later
  * base in that order is kept.
  *
  * @param profile probability rows, one per base
  * @param n       number of leading columns to use (default 1)
  * @return (consensus, probability of the consensus)
  */
def profileKmer(profile:Matrix, n:Int=1):(String, Big) = {
    val rows = "ACGT".zip(profile)
    val perColumn = (0 until n).map{ c =>
        rows.map{ case(base, probs) => (base, probs(c)) }.
            foldLeft(('x', Big(0.0d))){ (best, cur) => if (best._2 > cur._2) best else cur }
    }
    // Seed the product with 1: starting from 0 made the probability always 0.
    perColumn.foldLeft(("", Big(1))){ case((kmer, p), (base, q)) => (kmer + base, p * q) }
}

defined [32mfunction[39m [36mprofileKmer[39m

In [14]:
/** Probability of `kmer` under `profile`: the product, position by position,
  * of the profile entry for the base found there (row 0 = A, 1 = C, 2 = G,
  * 3 = T). Any other character raises a MatchError.
  */
def kmerProbability(kmer:String, profile:Matrix):Big = {
    val columns = profile.transpose
    val perPosition = columns.zip(kmer).map{ case(column, base) =>
        base match {
            case 'A' => column(0)
            case 'C' => column(1)
            case 'G' => column(2)
            case 'T' => column(3)
        }
    }
    perPosition.reduce{ (acc, p) => acc * p }
}

defined [32mfunction[39m [36mkmerProbability[39m

In [15]:
/** Variant taking the profile as text: it is parsed with `profileCreator` and
  * handed to the Matrix overload.
  */
def profileMostProbableKmer(dna:String, prof:String, k:Int):List[(String, Big)] =
    profileMostProbableKmer(dna, profileCreator(prof), k)

/** Every length-`k` window of `dna` paired with its probability under
  * `profile`, most probable first. The sort is stable, so windows with equal
  * probability stay in order of appearance.
  */
def profileMostProbableKmer(dna:String, profile:Matrix, k:Int):List[(String, Big)] = {
    val scored = dna.sliding(k).toList.map{ kmer => (kmer, kmerProbability(kmer, profile)) }
    scored.sortBy{ case(_, p) => -p }
}

defined [32mfunction[39m [36mprofileMostProbableKmer[39m
defined [32mfunction[39m [36mprofileMostProbableKmer[39m

## Greedy motif search

In [16]:
/** Builds a profile from equally long motifs: one row per base (A, C, G, T),
  * one column per motif position, each entry being that base's count in the
  * column divided by 4.0.
  *
  * NOTE(review): the divisor is the constant 4.0, not the number of motifs.
  * Every column is scaled by the same factor, so which k-mer ranks highest is
  * unaffected, but entries are true frequencies only for four motifs -
  * confirm this is intended.
  */
def formProfile(x:List[String]):Matrix = {
    val perPosition = x.transpose.map{ column =>
        "ACGT".toList.map{ base => Big(column.count(_ == base).toDouble / 4.0d) }
    }
    perPosition.transpose
}

/** Adds one occurrence of each base of `x` to the matching column of `prof`.
  *
  * Entries are multiplied by 4 to get back to counts (the scaling used by
  * `formProfile`), the row of the base seen at each position is incremented,
  * and the result is divided by 4 again.
  *
  * @param x    string whose bases are added, one per profile column
  * @param prof profile with rows in A, C, G, T order
  * @return the updated profile, same shape as `prof`
  */
def updateProfile(x:String, prof:Matrix):Matrix = {
    val countColumns = prof.map{ _.map{ _ * 4 } }.transpose
    val bumped = countColumns.zip(x.toList).map{ case(column, base) =>
        val row = base match {
            case 'A' => 0
            case 'C' => 1
            case 'G' => 2
            case 'T' => 3
        }
        // One code path for all four bases: the hand-written 'G' branch read
        // row 1 and produced a 6-element column.
        column.updated(row, column(row) + 1)
    }
    bumped.transpose.map{ _.map{ _ / 4 } }
}

defined [32mfunction[39m [36mformProfile[39m
defined [32mfunction[39m [36mupdateProfile[39m

In [17]:
/** Total Hamming distance between each motif in `x` and the consensus string
  * obtained from `profileKmer(formProfile(x), ...)` over the full motif width.
  */
def score(x:List[String]):Big = {
    val width = x.head.length
    val (consensus, _) = profileKmer(formProfile(x), width)
    val distances = x.map{ motif => Big(hamming(consensus, motif)) }
    distances.sum
}

defined [32mfunction[39m [36mscore[39m

In [18]:
/** Greedy motif search.
  *
  * Starts from the leading k-mer of every string. Each k-mer of the first
  * string then seeds a candidate set, extended one string at a time with the
  * top-ranked k-mer under the profile of the motifs chosen so far. A
  * candidate replaces the current best only when its `score` is strictly lower.
  */
def greedyMotifSearch(dnas:List[String], k:Int) = {
    val initial = dnas.map{ dna => dna.take(k) }
    val seeds = dnas.head.sliding(k)

    seeds.foldLeft(initial){ (best, seed) =>
        val candidate = dnas.tail.foldLeft(List(seed)){ (chosen, dna) =>
            val ranked = profileMostProbableKmer(dna, formProfile(chosen), k)
            chosen :+ ranked.head._1
        }
        if (score(candidate) < score(best)) candidate else best
    }
}

defined [32mfunction[39m [36mgreedyMotifSearch[39m

## Greedy motif search with Laplacian smoothing

In [19]:
/** Profile with add-one smoothing: as `formProfile`, but each base count is
  * incremented by 1 before being divided by 4.0, so no entry is zero.
  *
  * NOTE(review): the divisor is the constant 4.0 rather than (motif count + 4);
  * every column is scaled alike, so rankings are unaffected - confirm intent.
  */
def formProfileLaplacian(x:List[String]):Matrix = {
    val smoothed = x.transpose.map{ column =>
        "ACGT".toList.map{ base => Big((column.count(_ == base).toDouble + 1) / 4.0d) }
    }
    smoothed.transpose
}

defined [32mfunction[39m [36mformProfileLaplacian[39m

In [20]:
/** Greedy motif search using the smoothed profile from `formProfileLaplacian`.
  *
  * Starts from the leading k-mer of every string. Each k-mer of the first
  * string seeds a candidate set that grows by the top-ranked k-mer of each
  * further string; a candidate is kept only if its `score` is strictly lower
  * than the best so far.
  */
def greedyMotifSearchLaplacian(dnas:List[String], k:Int):List[String] = {
    val initial = dnas.map{ dna => dna.take(k) }
    val seeds = dnas.head.sliding(k)

    seeds.foldLeft(initial){ (best, seed) =>
        val candidate = dnas.tail.foldLeft(List(seed)){ (chosen, dna) =>
            val ranked = profileMostProbableKmer(dna, formProfileLaplacian(chosen), k)
            chosen :+ ranked.head._1
        }
        if (score(candidate) < score(best)) candidate else best
    }
}

defined [32mfunction[39m [36mgreedyMotifSearchLaplacian[39m

## Randomized motif search

In [21]:
/** For each string in `dnas`, the top-ranked length-`k` window under `profile`. */
def genMotifs(dnas:List[String], profile:Matrix, k:Int) =
    for (dna <- dnas) yield {
        val ranked = profileMostProbableKmer(dna, profile, k)
        ranked.head._1
    }

defined [32mfunction[39m [36mgenMotifs[39m

In [22]:
/** Motif score by counting: for every column of the equally long strings in
  * `x`, the number of A/C/G/T letters minus the count of the most frequent
  * one, summed over all columns.
  */
def scoreCounts(x:List[String]) = {
    val columns = x.map{ motif => motif.toList }.transpose
    val mismatches = columns.map{ column =>
        val tallies = "ACGT".map{ base => Big(column.count(_ == base)) }
        tallies.sum - tallies.max
    }
    mismatches.sum
}

defined [32mfunction[39m [36mscoreCounts[39m

In [41]:
/** Repeatedly rebuilds the motifs from the smoothed profile of the current
  * set until the `scoreCounts` value stops improving.
  *
  * Each round prints the freshly generated motifs. The first round
  * (`itr == 0`) always moves on to the new motifs; later rounds return
  * `best` as soon as the new motifs do not score strictly lower.
  */
def recursiveMotifSearch(best:List[String], dnas:List[String], k:Int, itr:Int=0):List[String] = {
    val candidate = genMotifs(dnas, formProfileLaplacian(best), k)
    println(candidate)

    val improved = scoreCounts(candidate) < scoreCounts(best)
    if (improved || itr <= 0) recursiveMotifSearch(candidate, dnas, k, itr + 1)
    else best
}

defined [32mfunction[39m [36mrecursiveMotifSearch[39m

In [42]:
/** Picks one uniformly random length-`k` window from every string in `dnas`.
  *
  * Valid start positions are 0 to `dna.length - k` inclusive, hence the
  * `+ 1`; without it the last window was never chosen and a string of exactly
  * `k` characters made `nextInt(0)` throw.
  *
  * @param dnas strings to sample from, each at least `k` long
  * @param k    window length
  * @return one window per input string, in input order
  */
def randomInit(dnas:List[String], k:Int):List[String] = dnas.map{ dna =>
    val start = scala.util.Random.nextInt(dna.length - k + 1)
    dna.drop(start).take(k)
}

defined [32mfunction[39m [36mrandomInit[39m

In [43]:
/** One run of randomized motif search: draws a random k-mer from every string
  * and refines the set with `recursiveMotifSearch`.
  *
  * The start motifs were previously a hard-coded list of four 3-mers left
  * over from debugging, which ignored `k` and the size of `dnas` and made
  * every run identical.
  */
def randomizedMotifSearch(dnas:List[String], k:Int):List[String] = {
    val bestMotifs = randomInit(dnas, k)
    recursiveMotifSearch(bestMotifs, dnas, k)
}

defined [32mfunction[39m [36mrandomizedMotifSearch[39m

In [26]:
/** Restarts `randomizedMotifSearch` until 100 consecutive runs fail to beat
  * the best `scoreCounts` value seen so far.
  *
  * Prints the current best score before each decision.
  *
  * @param dnas       strings to search
  * @param k          motif length
  * @param best       best score so far (defaults to Int.MaxValue)
  * @param bestMotifs motifs that achieved `best`
  * @param itr        runs since the last improvement
  * @return the best-scoring motifs found
  */
def repeatedrandomizedMotifSearch(dnas:List[String], k:Int, best:Big = Big(Int.MaxValue), bestMotifs:List[String] = List[String](), itr:Int=0):List[String] = {
    val motifs = randomizedMotifSearch(dnas, k)
    val s = scoreCounts(motifs)

    print(best.toString + ", ")
    // An improvement resets the patience counter.
    if (s < best) repeatedrandomizedMotifSearch(dnas, k, s, motifs, 0)
    // A tie counts as "no improvement" too; with a strict > a run that kept
    // matching the best score never terminated.
    else if (itr >= 100) bestMotifs
    else repeatedrandomizedMotifSearch(dnas, k, best, bestMotifs, itr+1)
}

defined [32mfunction[39m [36mrepeatedrandomizedMotifSearch[39m

## Gibbs sampling

In [27]:
/** Iterative single-motif replacement.
  *
  * Each step picks one string at random, builds the smoothed profile from the
  * motifs of all other strings, and replaces the picked string's motif with
  * the top-ranked k-mer under that profile. The new set is adopted only when
  * its `scoreCounts` value is strictly lower. The search stops once `N` has
  * run out and the latest step brought no improvement.
  *
  * When `bestM` is empty the start motifs come from `randomInit`.
  */
def gibbs(dnas:List[String], k:Int, bestM:List[String]=List[String](), N:Int):List[String] = {
    val current:List[String] = if (bestM.isEmpty) randomInit(dnas, k) else bestM

    val skipped = scala.util.Random.nextInt(dnas.length)
    val indexed = current.zipWithIndex
    val others = indexed.collect{ case(motif, i) if i != skipped => motif }
    val replacement = genMotifs(List(dnas(skipped)), formProfileLaplacian(others), k).head
    val proposal = indexed.map{ case(motif, i) => if (i == skipped) replacement else motif }

    val currentScore = scoreCounts(current)
    val proposalScore = scoreCounts(proposal)

    if (proposalScore < currentScore) gibbs(dnas, k, proposal, N-1)
    else if (N <= 0) current
    else gibbs(dnas, k, current, N-1)
}

defined [32mfunction[39m [36mgibbs[39m

In [39]:
/** Runs `gibbs` (always with N=100 and a fresh random start) over and over,
  * keeping the motif set with the lowest `scoreCounts` value.
  *
  * Prints the incumbent score on every round. Stops once `N` has run out and
  * the latest run did not beat the incumbent. When `bestM` is empty the
  * incumbent starts as a `randomInit` draw.
  */
def repeatedGibbs(dnas:List[String], k:Int, bestM:List[String] = List[String](), N:Int=100):List[String] = {
    val incumbent:List[String] = if (bestM.isEmpty) randomInit(dnas, k) else bestM
    val fresh = gibbs(dnas, k, N=100)
    val incumbentScore = scoreCounts(incumbent)
    val freshScore = scoreCounts(fresh)

    print(incumbentScore.toString + ", ")
    if (freshScore < incumbentScore) repeatedGibbs(dnas, k, fresh, N=N-1)
    else if (N <= 0) incumbent
    else repeatedGibbs(dnas, k, incumbent, N=N-1)
}

defined [32mfunction[39m [36mrepeatedGibbs[39m