For background on Python string functions, see this [tutorial](https://www.tutorialspoint.com/python/python_strings.htm) on TutorialsPoint and another [tutorial](https://developers.google.com/edu/python/strings) from Google Education. Also see the Python documentation on the 'text sequence type', also known as [string](https://docs.python.org/3/library/stdtypes.html#text-sequence-type-str).

In [1]:
mystring = 'a word'

In [2]:
mystring.upper()

'A WORD'

In [3]:
'a word'.upper()

'A WORD'

In [None]:
# to get a list of what we can do with a string: help(str)

In [5]:
'This is a sentence'.lower()

'this is a sentence'

In [6]:
python_sequence = 'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC'

In [14]:
python_string = """GAGAGCCCTAGATACTCTGACTCTTTGTGGTAGTTGTCTAATAACATGCCTACTGACAT
TATTTTTCCATATGCAAGTGCTATATGTAGTACAGCAAATGTATTATAATTTGTTACTGT
AAAAATGGCCTTGATCATGGTTATGCCTATTAGATTAATATTGTTGGTTCTTTTAAAAAA
GATTATTTTCCCTTTGTCACTAGCAAATTTATTATGAGTTTTTAAATACTAACTGACGGC
ATTTGTTCAatagcataCATAGGGTTCTTCTAGTAGCTCATATTGATCCCCTCTGGCAAACA
TTTTTTAAAAAAAATCTTTTGTATATAACCCAAGATAAAGATATAGAAATTGGTCACT
TCATGGATAGGTGTAGAGTTTGCTTGAAAAATCAAACTGAACATGATTCTCTCTAGGATA"""

In [8]:
'ATAGCATA'.lower()

'atagcata'

In [18]:
# Count how often each nucleotide occurs in python_string, ignoring case.
# Characters that are not one of the four bases (such as the newlines
# inside the multi-line string) are skipped.
bases = {'A', 'C', 'G', 'T'}
base_counts = dict()
for base in python_string:
    # upper-case each character so lower-case bases (e.g. 'a') are counted too
    base = base.upper()
    if base in bases:
        base_counts[base] = base_counts.get(base, 0) + 1
# sorted() makes the report order stable (a set has no defined order), and
# .get(base, 0) prints 0 instead of raising KeyError for a base that never
# occurs in the sequence.
for base in sorted(bases):
    print(base + ':', base_counts.get(base, 0))

G: 68
C: 64
T: 155
A: 132


In [17]:
# Count how often each nucleotide occurs in python_string, case-sensitively:
# only the upper-case letters A, C, G and T are counted, so lower-case bases
# and any other characters (such as newlines) are ignored.
bases = {'A', 'C', 'G', 'T'}
base_counts = dict()
for base in python_string:
    if base in bases:
        base_counts[base] = base_counts.get(base, 0) + 1
# sorted() makes the report order stable (a set has no defined order), and
# .get(base, 0) prints 0 instead of raising KeyError for a base that never
# occurs in the sequence.
for base in sorted(bases):
    print(base + ':', base_counts.get(base, 0))

G: 67
C: 63
T: 153
A: 128


In [19]:
greeting = "Hello\nHow are you?"
print(greeting)

Hello
How are you?


In [20]:
windows_path = "c:\\Users\\javan"
print(windows_path)

c:\Users\javan


In [21]:
# "\t" is the escape sequence for a tab character; printing these strings
# shows the two columns separated by a tab.
header, row1, row2 = "Name\tInstitution", "Peter\tUWC", "Moussa\tUCT"
for table_line in (header, row1, row2):
    print(table_line)

Name	Institution
Peter	UWC
Moussa	UCT


In [22]:
# Windows line endings are carriage return followed by line feed: "\r\n".
# (The reverse order, "\n\r", is read as two separate line breaks.)
windows_text = "Windows style\r\nLine splitting"
print(windows_text)

Windows style
Line splitting


In [23]:
print("Please find me a 330Ω resistor") 

Please find me a 330Ω resistor


In [24]:
ord('a')

97

In [25]:
ord('Ω')

8486

In [26]:
# the Ohm symbol is Unicode code point U+2126: hexadecimal 2126 is decimal 8486, the value ord() returned

In [27]:
chr(97)

'a'

In [28]:
ord('A')

65

In [29]:
'AAAAAA' in python_string

True

In [30]:
'fun' in 'fundemental'

True

In [31]:
'fun' in 'Monday'

False

In [32]:
python_string.find('AAAAA')

121

In [33]:
position = python_string.find('AAAAAA')

In [34]:
substring = python_string[position:position+10]
print(substring)

AAAAAA
GAT


In [35]:
substring

'AAAAAA\nGAT'

In [36]:
oneline = 'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC'
print(len(oneline))

60


In [37]:
twolines = '''AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC
ACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'''

In [38]:
twolines

'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC\nACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'

In [39]:
print(len(twolines))

121


In [40]:
twolines.find('CTT')

2

In [41]:
help(twolines.find)

Help on built-in function find:

find(...) method of builtins.str instance
    S.find(sub[, start[, end]]) -> int
    
    Return the lowest index in S where substring sub is found,
    such that sub is contained within S[start:end].  Optional
    arguments start and end are interpreted as in slice notation.
    
    Return -1 on failure.



In [42]:
twolines.find('CTT', 3)

24

In [43]:
twolines.find('CTT', 3, 20)

-1

In [44]:
twolines.find('ostrich')

-1

In [45]:
# Report the index of every occurrence of 'CTT' in twolines.
# Each search resumes one character after the previous hit; find() returns
# -1 once there are no more matches, which ends the loop.
search_from = 0
while True:
    position = twolines.find('CTT', search_from)
    if position == -1:
        break
    print('match at:', position)
    search_from = position + 1

match at: 2
match at: 24
match at: 103


In [46]:
# Same search as a literal 'CTT' lookup, but with the pattern held in a
# variable so it only has to be changed in one place.
motif = 'CTT'
position = -1  # so the first search starts at index 0
while True:
    position = twolines.find(motif, position + 1)
    if position == -1:
        # find() signals "no further match" with -1
        break
    print('match at:', position)

match at: 2
match at: 24
match at: 103


In [50]:
# Illustration of an infinite loop: `position` is assigned once, before the
# loop, and is never updated inside it, so the condition `position != -1`
# stays true forever whenever the motif occurs in `twolines`.
# WARNING: in that case this cell never finishes on its own; stop it by
# interrupting the kernel (which raises KeyboardInterrupt).
import time
motif = 'CTT'
position = twolines.find(motif)
while position != -1:
    print('match at:', position)
    # a correct loop would advance the search here, e.g.
    # position = twolines.find(motif, position+1)
    time.sleep(1)  # pause for 1 second

match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2
match at: 2


KeyboardInterrupt: 

In [51]:
twolines

'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC\nACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'

In [52]:
twolines.find('CAGACCCACG')

-1

In [63]:
new_sequence = twolines.replace('\n', '')

In [64]:
new_sequence

'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCCACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'

In [65]:
len(new_sequence)

120

In [56]:
from Bio import Seq

ImportError: No module named 'Bio'

In [66]:
# how to reverse a string
new_sequence = new_sequence[::-1]

In [67]:
new_sequence

'ATCGTTATTTTTACATTCTTGTGGTAATATACGGGAGAGAAATTACAAACCAATGTGGCACCCAGACTTACCATTGAGAATTTTTTATACAGGTTCTTTTATTAAATATATTAACTTCGA'

In [68]:
new_sequence = new_sequence[::-1]

In [70]:
new_sequence.find('CAGACCCACG')

53

In [69]:
new_sequence

'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCCACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'

In [71]:
silly_sequence = new_sequence.replace('CAGACCCACG', 'baboon')

In [72]:
print(silly_sequence)

AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTbaboonGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA


In [73]:
help(silly_sequence.replace)

Help on built-in function replace:

replace(...) method of builtins.str instance
    S.replace(old, new[, count]) -> str
    
    Return a copy of S with all occurrences of substring
    old replaced by new.  If the optional argument count is
    given, only the first count occurrences are replaced.



In [75]:
mystring = 'egg egg egg egg'
mystring.replace('egg', 'chicken', 2)

'chicken chicken egg egg'

In [76]:
first_part = mystring[:4]
print(first_part)

egg 


In [77]:
mystring[4:].replace('egg', 'chicken', 2)

'chicken chicken egg'

In [78]:
first_part + mystring[4:].replace('egg', 'chicken', 2)

'egg chicken chicken egg'

In [79]:
line = 'Chromosome	ena	exon	106734	107603	.	+	.	Parent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1'

In [80]:
line

'Chromosome\tena\texon\t106734\t107603\t.\t+\t.\tParent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1'

In [81]:
print(line)

Chromosome	ena	exon	106734	107603	.	+	.	Parent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1


In [82]:
help(print)

Help on built-in function print in module builtins:

print(...)
    print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)
    
    Prints the values to a stream, or to sys.stdout by default.
    Optional keyword arguments:
    file:  a file-like object (stream); defaults to the current sys.stdout.
    sep:   string inserted between values, default a space.
    end:   string appended after the last value, default a newline.
    flush: whether to forcibly flush the stream.



In [83]:
print('Hello', end='TT')

HelloTT

In [84]:
line.split('\t')

['Chromosome',
 'ena',
 'exon',
 '106734',
 '107603',
 '.',
 '+',
 '.',
 'Parent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1']

In [85]:
fields = line.split('\t')

In [86]:
# Columns 3 and 4 of the split line hold the start and end coordinates as
# text, so convert them to integers; column 2 holds the feature type.
start_position, end_position = int(fields[3]), int(fields[4])
feature_type = fields[2]
print(feature_type, "at", start_position, "to", end_position)

exon at 106734 to 107603


In [87]:
attributes = fields[8]
print(attributes)

Parent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1


In [88]:
attributes = fields[-1]
print(attributes)

Parent=transcript:CCP42822;Name=CCP42822-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42822-1;rank=1;version=1


In [89]:
keyvals = attributes.split(';')

In [90]:
keyvals

['Parent=transcript:CCP42822',
 'Name=CCP42822-1',
 'constitutive=1',
 'ensembl_end_phase=0',
 'ensembl_phase=0',
 'exon_id=CCP42822-1',
 'rank=1',
 'version=1']

In [91]:
# Print every key/value pair from the attributes column.
for keyval in keyvals:
    # maxsplit=1: split only at the first '=' so that a value which itself
    # contains '=' is kept whole instead of being silently cut off.
    # A pair with no '=' at all raises ValueError.
    key, value = keyval.split('=', 1)
    print(key, value)

Parent transcript:CCP42822
Name CCP42822-1
constitutive 1
ensembl_end_phase 0
ensembl_phase 0
exon_id CCP42822-1
rank 1
version 1


In [92]:
# Turn the list of 'key=value' strings into a dictionary.
attributes_dict = dict()
for keyval in keyvals:
    # maxsplit=1: split only at the first '=' so that a value which itself
    # contains '=' is kept whole instead of being silently cut off.
    # A pair with no '=' at all raises ValueError.
    key, value = keyval.split('=', 1)
    attributes_dict[key] = value
print(attributes_dict)

{'ensembl_phase': '0', 'constitutive': '1', 'Name': 'CCP42822-1', 'version': '1', 'exon_id': 'CCP42822-1', 'ensembl_end_phase': '0', 'rank': '1', 'Parent': 'transcript:CCP42822'}


In [93]:
mystring = 'field1^^^field2'
mystring.split('^^^')

['field1', 'field2']

In [94]:
sentence = "This is a fine and rainy day.  I hope the dam levels improve."
words = sentence.split()
print(words)

['This', 'is', 'a', 'fine', 'and', 'rainy', 'day.', 'I', 'hope', 'the', 'dam', 'levels', 'improve.']


In [95]:
threelines = """AATGCTTTAGTAATGCTTCTTGTTAACTTAAACTGTATTATAGCCTCAGTGGTTGACAAC
AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC
ACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA"""

In [96]:
threelines

'AATGCTTTAGTAATGCTTCTTGTTAACTTAAACTGTATTATAGCCTCAGTGGTTGACAAC\nAGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC\nACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA'

In [97]:
threelines.split()

['AATGCTTTAGTAATGCTTCTTGTTAACTTAAACTGTATTATAGCCTCAGTGGTTGACAAC',
 'AGCTTCAATTATATAAATTATTTTCTTGGACATATTTTTTAAGAGTTACCATTCAGACCC',
 'ACGGTGTAACCAAACATTAAAGAGAGGGCATATAATGGTGTTCTTACATTTTTATTGCTA']

In [102]:
prose = """It was a dark and stormy night.
An author sat at their typewriter, struggling to write.
\t"Oh no!", they exclaimed."""

In [103]:
prose

'It was a dark and stormy night.\nAn author sat at their typewriter, struggling to write.\n\t"Oh no!", they exclaimed.'

In [104]:
print(prose)

It was a dark and stormy night.
An author sat at their typewriter, struggling to write.
	"Oh no!", they exclaimed.


In [105]:
words = prose.split()

In [106]:
print(words)

['It', 'was', 'a', 'dark', 'and', 'stormy', 'night.', 'An', 'author', 'sat', 'at', 'their', 'typewriter,', 'struggling', 'to', 'write.', '"Oh', 'no!",', 'they', 'exclaimed.']


In [107]:
onefinalsplit = "One  Two Three"
onefinalsplit.split(' ')

['One', '', 'Two', 'Three']

In [108]:
"^^^".join(['Word1', 'Word2'])

'Word1^^^Word2'

In [109]:
line1 = '>ENA|KX778838|KX778838.1 Python regius Lmbr1 gene, intron. '

In [110]:
line1.startswith('>')

True

In [111]:
line2 = 'CAGCATCAAAATGGTGGGTGCTTCCATCATTTTAATAGTGTTCTCCCCCCTTCTCTCCCT'

In [112]:
line2.startswith('>')

False

In [113]:
'fishpaste'.startswith('fish')

True

In [114]:
'starfish'.startswith('fish')

False

In [115]:
'fishpaste'.endswith('fish')

False

In [116]:
'starfish'.endswith('fish')

True

In [117]:
line1 = '  258 1ped.fasta\n'

In [118]:
line1

'  258 1ped.fasta\n'

In [119]:
data = line1.strip()

In [120]:
data

'258 1ped.fasta'

In [121]:
line1.lstrip()

'258 1ped.fasta\n'

In [122]:
line1.rstrip()

'  258 1ped.fasta'

In [125]:
# first strip off the whitespace (the end of line - \n) at the end 
# of the line
# and then split the resulting line into fields
line1.rstrip().split()

['258', '1ped.fasta']