# Handling text with Python

## Get started

In [28]:
text1 = 'In this world nothing can be said to be certain except death and taxes'

In [29]:
len(text1)

70

In [30]:
text2 = text1.split(' ')

In [31]:
len(text2)

14

In [32]:
text2

['In',
 'this',
 'world',
 'nothing',
 'can',
 'be',
 'said',
 'to',
 'be',
 'certain',
 'except',
 'death',
 'and',
 'taxes']

## Finding Specific Words

### Words that are more than 5 letters long

In [33]:
[w for w in text2 if len(w) > 5]

['nothing', 'certain', 'except']

### Words that end with 'd'

In [35]:
[w for w in text2 if w.endswith('d')]

['world', 'said', 'and']

### Words that start with 'd'

In [36]:
[w for w in text2 if w.startswith('d')]

['death']

In [37]:
[w for w in text2 if w.startswith(('d','n'))]

['nothing', 'death']

## Finding Unique Words

### Using set()

In [1]:
text1 = 'to be or not to be'

In [2]:
text2 = text1.split(' ')

In [4]:
text2

['to', 'be', 'or', 'not', 'to', 'be']

In [5]:
text3 = set(text2)

In [6]:
len(text3)

4

In [7]:
text3

{'be', 'not', 'or', 'to'}

## More String Operations

### t in s - To check whether the substring t exists in the string s

In [16]:
'Python' in 'Python is good'

True

### string.isupper() - To check whether all the cased characters of the string are uppercase

In [17]:
'PYTHON IS GOOD'.isupper()

True

In [18]:
'Python is good'.isupper()

False

### string.islower() - To check whether all the cased characters of the string are lowercase

In [23]:
'python is good'.islower()

True

In [24]:
'Python is good'.islower()

False

### string.istitle() - To check whether each word in the string starts with an uppercase letter and continues in lowercase

In [25]:
'Python Is Good'.istitle()

True

In [26]:
'Python is good'.istitle()

False

### string.isdigit() - To check whether the string contains digits only

In [43]:
'000002'.isdigit()

True

In [44]:
'SZ000002'.isdigit()

False

### string.isalpha() - To check whether the string contains alphabetic characters only

In [29]:
'Textmining'.isalpha()

True

In [30]:
'Text mining1'.isalpha()

False

### string.isalnum() - To check whether the string contains alphanumeric characters only

In [31]:
'SZ000002'.isalnum()

True

In [32]:
'SZ_000002#'.isalnum()

False

### Conversion between uppercase and lowercase

In [40]:
s1 = 'python is good'

### string.upper() - Returns a string in which all characters are uppercased

In [41]:
s2 = s1.upper()

In [42]:
s2

'PYTHON IS GOOD'

### string.lower() - Returns a string in which all characters are lowercased

In [43]:
s2.lower()

'python is good'

### string.capitalize() - Returns a string with its first character capitalized and the rest lowercased

In [44]:
s1.capitalize()

'Python is good'

### string.title() - Returns a string in which first characters of all the words are capitalized

In [45]:
s1.title()

'Python Is Good'

### string.split(sep) - Returns a list of the substrings obtained by splitting the string at each occurrence of sep

In [47]:
s1 = 'cattcatt'

In [48]:
s2 = s1.split('a')

In [49]:
s2

['c', 'ttc', 'tt']

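### string.join(iterable) - Returns a string made by concatenating the items of the iterable, with the string as the separator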

In [50]:
'a'.join(s2)

'cattcatt'

### Get all the characters of s1 

In [51]:
list(s1)

['c', 'a', 't', 't', 'c', 'a', 't', 't']

In [52]:
[c for c in s1]

['c', 'a', 't', 't', 'c', 'a', 't', 't']

### String formatting

In [53]:
'{} {}'.format('hello', 'world')

'hello world'

In [54]:
'{} {}'.format(24, 'seconds')

'24 seconds'

In [55]:
24 + 'seconds' 

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [56]:
name = 'Eric'

In [57]:
age = 74

In [58]:
f'Hello, {name}. You are {age}.'

'Hello, Eric. You are 74.'

### string.strip() - Removes whitespace at the beginning and at the end of the string

In [59]:
s1 = '  a quick brown fox jumped over the lazy dog  '

In [60]:
s2 = s1.strip()

In [61]:
s2

'a quick brown fox jumped over the lazy dog'

### string.replace(old, new) - Returns a string where all occurrences of the old substring are replaced with the new substring; an optional third argument limits the number of replacements

In [62]:
s2.replace('o', 'O')

'a quick brOwn fOx jumped Over the lazy dOg'

In [63]:
s2.replace('o', 'O', 2)

'a quick brOwn fOx jumped over the lazy dog'

### string.translate(table) - The translate() method returns a string where each character is mapped to its corresponding character in the translation table. The translation table is created by the static method str.maketrans(), whose optional third argument lists characters to delete.

In [64]:
intab = 'aeiou'

In [65]:
outtab = '12345'

In [66]:
table = str.maketrans(intab, outtab)

In [67]:
s1 = 'aeiou-xmppp'

In [68]:
s1.translate(table)

'12345-xmppp'

In [69]:
table_1 = str.maketrans(intab, outtab, 'xm')

In [70]:
s1.translate(table_1)

'12345-ppp'

## Index and Slice String

### Accessing characters by positive index number

In [71]:
s = 'Hello World!'

In [72]:
s[4]

'o'

### Accessing characters by negative index number

In [73]:
s[-3]

'l'

### Slicing strings

In [48]:
s[1:5]

'ello'

In [49]:
s[:5]

'Hello'

In [50]:
s[-4:-1]

'rld'

In [51]:
s[-2:]

'd!'

### Specifying stride while slicing strings

In [74]:
s[:5]

'Hello'

In [75]:
s[:5:1]

'Hello'

In [76]:
s[:5:2]

'Hlo'

In [88]:
s[::-1]

'!dlroW olleH'

In [89]:
s[-1:-7:-2]

'!lo'

### string.find() - Returns the index of the first occurrence of the substring, or -1 if it is not found

In [58]:
s.find('o')

4

In [57]:
s.find('or')

7

## Writing to and Reading from CSV File

In [62]:
import csv

### Writing to a csv file

In [67]:
# Create (or overwrite) test.csv and store one two-field record in it.
# newline='' leaves line-ending handling to the csv module.
with open('test.csv', mode='w', encoding='utf8', newline='') as csv_file:
    record = ('张三', '北京')
    csv.writer(csv_file).writerow(record)

### Reading from a csv file

In [66]:
# Read test.csv back and print the two fields of every row.
# newline='' is what the csv module expects for files handed to csv.reader:
# without it, newlines embedded inside quoted fields are not interpreted
# correctly. It also mirrors how the file was opened for writing.
with open('test.csv', 'r', encoding='utf8', newline='') as rf:
    r = csv.reader(rf)
    for row in r:
        # Each row is a list of strings: row[0] is the name, row[1] the address.
        print(f'姓名:{row[0]}, 住址:{row[1]}')

姓名:张三, 住址:北京
