diff --git a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st index b4980901..58caffdb 100644 --- a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st +++ b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st @@ -37,7 +37,8 @@ BaselineOfBioSmalltalk >> baseline: spec [ projectSpecForPolyMath: spec; projectSpecForDataFrame: spec; projectSpecForINIParser: spec; - projectSpecForPetitParser: spec. + projectSpecForPetitParser: spec; + projectSpecForNeedlemanWunsch: spec. "projectSpecForRoassal2: spec." self baselineOSDeps: spec. @@ -109,7 +110,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [ package: 'BioSoftwareCatalog' with: [ spec requires: #('BioTools' ). ]; package: 'BioSupport' with: [ spec requires: #('BioTools' 'DataFrame'). ]; package: 'BioTools' with: [ spec - requires: #('CommonUtils' 'StringExtensions' 'PetitParser2Core' 'PolyMath' 'XMLParser'); + requires: #('CommonUtils' 'StringExtensions' 'PetitParser2Core' 'PolyMath' 'XMLParser' 'NeedlemanWunsch'); includes: #('BioPharoCommon') ]; package: 'BioToolsSamples' with: [ spec requires: #('BioTools' 'BioEntrez' 'BioParsers' ). ]; package: 'BioTools-Tests' with: [ spec requires: #('BioTools' ). ]; @@ -333,6 +334,14 @@ BaselineOfBioSmalltalk >> projectSpecForINIParser: spec [ ] +{ #category : 'specs' } +BaselineOfBioSmalltalk >> projectSpecForNeedlemanWunsch: spec [ + + spec + baseline: 'NeedlemanWunsch' + with: [ spec repository: 'github://hernanmd/needleman-wunsch/src' ] +] + { #category : 'specs' } BaselineOfBioSmalltalk >> projectSpecForOsSubprocess: spec [ diff --git a/repository/BioTools-Tests/BioAlignmentTest.class.st b/repository/BioTools-Tests/BioAlignmentTest.class.st index e40b4358..9e91ed07 100644 --- a/repository/BioTools-Tests/BioAlignmentTest.class.st +++ b/repository/BioTools-Tests/BioAlignmentTest.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'align' ], - #category : 'BioTools-Tests-Core', + #category : 'BioTools-Tests-Align', #package : 'BioTools-Tests', - #tag : 'Core' + #tag : 'Align' } { #category : 'accessing' } diff --git a/repository/BioTools-Tests/BioAlphabetTest.class.st b/repository/BioTools-Tests/BioAlphabetTest.class.st index 44af8d90..850e94be 100644 --- a/repository/BioTools-Tests/BioAlphabetTest.class.st +++ b/repository/BioTools-Tests/BioAlphabetTest.class.st @@ -1,12 +1,12 @@ Class { #name : 'BioAlphabetTest', #superclass : 'BioAbstractTest', + #category : 'BioTools-Tests-Alphabets', + #package : 'BioTools-Tests', + #tag : 'Alphabets' #instVars : [ 'alphabet' ], - #category : 'BioTools-Tests-Biological', - #package : 'BioTools-Tests', - #tag : 'Biological' } { #category : 'testing' } diff --git a/repository/BioTools-Tests/BioCodonTableTest.class.st b/repository/BioTools-Tests/BioCodonTableTest.class.st index 2c7d0bad..aec0b474 100644 --- a/repository/BioTools-Tests/BioCodonTableTest.class.st +++ b/repository/BioTools-Tests/BioCodonTableTest.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'table' ], - #category : 'BioTools-Tests-Biological', + #category : 'BioTools-Tests-Alphabets', #package : 'BioTools-Tests', - #tag : 'Biological' + #tag : 'Alphabets' } { #category : 'running' } diff --git a/repository/BioTools-Tests/BioHirschbergTest.class.st b/repository/BioTools-Tests/BioHirschbergTest.class.st new file mode 100644 index 00000000..5525fdc5 --- /dev/null +++ b/repository/BioTools-Tests/BioHirschbergTest.class.st @@ -0,0 +1,117 @@ +Class { + #name : 'BioHirschbergTest', + #superclass : 'TestCase', + #category : 'BioTools-Tests-Align', + #package : 'BioTools-Tests', + #tag : 'Align' +} + +{ #category : 'tests' } +BioHirschbergTest >> testAlignEmptyWithEmpty [ + | result | + result := BioHirschberg new align: '' with: ''. + self assert: result firstSequence equals: ''. + self assert: result secondSequence equals: ''. +] + +{ #category : 'tests' } +BioHirschbergTest >> testAlignEmptyWithNonEmpty [ + | result | + result := BioHirschberg new align: '' with: 'GATTACA'. + self assert: result firstSequence equals: '-------'. + self assert: result secondSequence equals: 'GATTACA'. +] + +{ #category : 'tests' } +BioHirschbergTest >> testAlignIdenticalSequences [ + | result | + result := BioHirschberg new align: 'ACGTACGT' with: 'ACGTACGT'. + self assert: result firstSequence equals: 'ACGTACGT'. + self assert: result secondSequence equals: 'ACGTACGT'. +] + +{ #category : 'tests' } +BioHirschbergTest >> testAlignNonEmptyWithEmpty [ + | result | + result := BioHirschberg new align: 'GATTACA' with: ''. + self assert: result firstSequence equals: 'GATTACA'. + self assert: result secondSequence equals: '-------'. +] + +{ #category : 'tests' } +BioHirschbergTest >> testAlignSingleCharacterMismatch [ + + | result | + result := BioHirschberg new align: 'A' with: 'T'. + self assert: result firstSequence size equals: 1. + self assert: result secondSequence size equals: 1. + self assert: result firstSequence equals: 'A'. + self assert: result secondSequence equals: 'T' +] + +{ #category : 'tests' } +BioHirschbergTest >> testBaseCaseManyAgainstOne [ + + | result | + result := BioHirschberg new align: 'GATTACA' with: 'A'. + self + assert: result firstSequence size + equals: result secondSequence size. + self assert: (result secondSequence occurrencesOf: $A) equals: 1 +] + +{ #category : 'tests' } +BioHirschbergTest >> testBaseCaseOneAgainstMany [ + + | result | + result := BioHirschberg new align: 'A' with: 'GATTACA'. + self + assert: result firstSequence size + equals: result secondSequence size. + self assert: (result firstSequence occurrencesOf: $A) equals: 1 +] + +{ #category : 'tests' } +BioHirschbergTest >> testOnlyGapCharacterInsertedForPadding [ + + | result allowed | + result := BioHirschberg new align: 'GAT' with: 'GATTACA'. + allowed := (Set withAll: 'GATTACA') , (Set with: $-). + self assert: + (result firstSequence allSatisfy: [ :ch | allowed includes: ch ]). + self assert: + (result secondSequence allSatisfy: [ :ch | allowed includes: ch ]) +] + +{ #category : 'tests' } +BioHirschbergTest >> testResultSequencesHaveSameLength [ + + | result | + result := BioHirschberg new align: 'GATTACA' with: 'GCATGCU'. + self + assert: result firstSequence size + equals: result secondSequence size +] + +{ #category : 'tests' } +BioHirschbergTest >> testSequencesWithGaps [ + + | result | + result := BioHirschberg new align: 'GATTACA' with: 'GAATACA'. + self assert: (result firstSequence includesSubstring: '-'). + self assert: (result secondSequence includesSubstring: '-') +] + +{ #category : 'tests' } +BioHirschbergTest >> testSequencesWithRepeatedGaps [ + + | result | + result := BioHirschberg new align: 'G--A' with: 'GA--'. + + "Alignments should be of equal length" + self + assert: result firstSequence size + equals: result secondSequence size. + "Alignment contains gaps as expected" + self assert: (result firstSequence includesSubstring: '-') +] diff --git a/repository/BioTools-Tests/BioUnknownSeqTest.class.st b/repository/BioTools-Tests/BioUnknownSeqTest.class.st index 9bcee89b..4ab0c58a 100644 --- a/repository/BioTools-Tests/BioUnknownSeqTest.class.st +++ b/repository/BioTools-Tests/BioUnknownSeqTest.class.st @@ -22,11 +22,13 @@ BioUnknownSeqTest >> setUp [ { #category : 'testing' } BioUnknownSeqTest >> testAlphabet [ - self assert: ( self sequence alphabet isKindOf: BioAlphabet ). - self assert: ( self sequence unknownLetter = BioAlphabet new defaultUnknownElement ). + self assert: (self sequence alphabet isKindOf: BioAlphabet). + self + assert: self sequence unknownLetter + equals: BioAlphabet new defaultUnknownElement. self sequence unknownLetter: $?. - self assert: ( self sequence alphabet isKindOf: BioAlphabet ). - self assert: self sequence unknownLetter = $? + self assert: (self sequence alphabet isKindOf: BioAlphabet). + self assert: self sequence unknownLetter equals: $? ] { #category : 'testing' } diff --git a/repository/BioTools/BioAlignment.class.st b/repository/BioTools/BioAlignment.class.st index dcffbb6a..227f6395 100644 --- a/repository/BioTools/BioAlignment.class.st +++ b/repository/BioTools/BioAlignment.class.st @@ -13,9 +13,9 @@ Class { 'consensusStrategy', 'isReference' ], - #category : 'BioTools-Core', + #category : 'BioTools-Align', #package : 'BioTools', - #tag : 'Core' + #tag : 'Align' } { #category : 'accessing' } diff --git a/repository/BioTools/BioHirschberg.class.st b/repository/BioTools/BioHirschberg.class.st new file mode 100644 index 00000000..f14016c7 --- /dev/null +++ b/repository/BioTools/BioHirschberg.class.st @@ -0,0 +1,146 @@ +" +Hirschberg's algorithm is a more space-efficient modification of the Needleman-Wunsch algorithm, designed for optimal sequence alignment. It uses a divide-and-conquer approach to achieve linear space complexity while maintaining the same time complexity as Needleman-Wunsch. + +This implementation reduces space complexity from O(nm) to O(min(n,m)) while maintaining O(nm) time complexity. +" +Class { + #name : 'BioHirschberg', + #superclass : 'ALNeedlemanWunsch', + #category : 'BioTools-Align', + #package : 'BioTools', + #tag : 'Align' +} + +{ #category : 'initialization' } +BioHirschberg >> align: sequence1 with: sequence2 [ + + | aligned | + self + firstSequence: sequence1; + secondSequence: sequence2. + + (sequence1 isEmpty and: [ sequence2 isEmpty ]) ifTrue: [ ^ self ]. + + aligned := self recursiveAlignment: sequence1 with: sequence2. + self finalize: aligned first reversed with: aligned second reversed. + ^ self +] + +{ #category : 'accessing' } +BioHirschberg >> alignmentScoreOf: alignedA with: alignedB using: aligner [ + + | score | + score := 0. + 1 to: alignedA size do: [ :i | + score := score + + + (aligner + matchScore: (alignedA at: i) + with: (alignedB at: i)) ]. + ^ score +] + +{ #category : 'transforming' } +BioHirschberg >> backwardPass: aSeq with: bSeq [ + "R[j] = score of aligning reverse(aSeq) with reverse(bSeq[1..j]) mapped back" + + ^ (self forwardPass: aSeq reversed with: bSeq reversed) reversed +] + +{ #category : 'transforming' } +BioHirschberg >> forwardPass: aSeq with: bSeq [ + + | current previous diagonal up left | + current := Array new: bSeq size + 1. + 0 to: bSeq size do: [ :j | + current at: j + 1 put: j * self gapPenalty ]. + + aSeq doWithIndex: [ :charA :i | + previous := current copy. + current at: 1 put: i * self gapPenalty. + 1 to: bSeq size do: [ :j | + diagonal := (previous at: j) + + (self matchScore: charA with: (bSeq at: j)). + up := (previous at: j + 1) + self gapPenalty. + left := (current at: j) + self gapPenalty. + current at: j + 1 put: ((diagonal max: up) max: left) ] ]. + ^ current +] + +{ #category : 'initialization' } +BioHirschberg >> initialize [ + + super initialize. + gapCharacter := $- +] + +{ #category : 'transforming' } +BioHirschberg >> needlemanWunschForSmallCases: aSeq with: bSeq [ + + | nw | + nw := self class superclass new. + nw + matchAward: self matchAward; + mismatchPenalty: self mismatchPenalty; + gapPenalty: self gapPenalty; + gapCharacter: self gapCharacter; + align: aSeq with: bSeq. + ^ { + nw firstSequence. + nw secondSequence } +] + +{ #category : 'transforming' } +BioHirschberg >> recursiveAlignment: aSeq with: bSeq [ + + | n m mid leftA rightA scoreL scoreR split leftPart rightPart | + n := aSeq size. + m := bSeq size. + + n = 0 ifTrue: [ + ^ { + (String new: m withAll: self gapCharacter). + bSeq } ]. + + m = 0 ifTrue: [ + ^ { + aSeq. + (String new: n withAll: self gapCharacter) } ]. + + (n = 1 or: [ m = 1 ]) ifTrue: [ + ^ self needlemanWunschForSmallCases: aSeq with: bSeq ]. + + mid := n // 2. + leftA := aSeq copyFrom: 1 to: mid. + rightA := aSeq copyFrom: mid + 1 to: n. + + scoreL := self forwardPass: leftA with: bSeq. + scoreR := self backwardPass: rightA with: bSeq. + split := self splitIndexFromForward: scoreL backward: scoreR. + + leftPart := self + recursiveAlignment: leftA + with: (bSeq copyFrom: 1 to: split). + rightPart := self + recursiveAlignment: rightA + with: (bSeq copyFrom: split + 1 to: m). + + ^ { + (leftPart first , rightPart first). + (leftPart second , rightPart second) } +] + +{ #category : 'transforming' } +BioHirschberg >> splitIndexFromForward: scoreL backward: scoreR [ + + | m bestIndex bestScore candidate | + m := scoreL size - 1. + bestIndex := 0. + bestScore := SmallInteger minVal. + 0 to: m do: [ :j | + candidate := (scoreL at: j + 1) + (scoreR at: m - j + 1). + candidate > bestScore ifTrue: [ + bestScore := candidate. + bestIndex := j ] ]. + ^ bestIndex +]