# Handling text with Python

## Get started

In [28]:
text1 = 'In this world nothing can be said to be certain except death and taxes'

In [29]:
len(text1)

70

In [30]:
# Break the sentence into words by splitting on a single space character.
# With an explicit ' ' separator, runs of consecutive spaces would yield
# empty strings; text1 only contains single spaces, so every item is a word.
text2 = text1.split(' ')

In [31]:
len(text2)

14

In [32]:
text2

['In',
 'this',
 'world',
 'nothing',
 'can',
 'be',
 'said',
 'to',
 'be',
 'certain',
 'except',
 'death',
 'and',
 'taxes']

## Finding Specific Words

### Words that are more than 5 letters long

In [33]:
[w for w in text2 if len(w) > 5]

['nothing', 'certain', 'except']

### Capitalized words

In [34]:
# str.istitle() is True for title-cased strings: an uppercase first letter
# followed by lowercase letters. Only 'In' in the sentence qualifies.
[w for w in text2 if w.istitle()]

['In']

### Words that end with 'd'

In [35]:
[w for w in text2 if w.endswith('d')]

['world', 'said', 'and']

### Words that start with 'd'

In [36]:
[w for w in text2 if w.startswith('d')]

['death']

In [37]:
# startswith() also accepts a tuple of prefixes and matches if the word
# starts with any one of them (here: 'd' or 'n').
[w for w in text2 if w.startswith(('d','n'))]

['nothing', 'death']

### t in s - To check whether a substring exists in a given string

In [38]:
'Text' in 'Text mining'

True

### s.isupper() - To check whether all the characters of the string are uppercase

In [39]:
'TEXT'.isupper()

True

In [40]:
'Text'.isupper()

False

### s.islower() - To check whether all the characters of the string are lowercase

In [41]:
'text'.islower()

True

In [42]:
'Text'.islower()

False

### s.isdigit() - To check whether the string contains digits only

In [43]:
'000002'.isdigit()

True

In [44]:
'SZ000002'.isdigit()

False

### s.isalpha() - To check whether the string contains alphabetic characters only

In [45]:
'Textmining'.isalpha()

True

In [46]:
'Text mining'.isalpha()

False

### s.isalnum() - To check whether the string contains alphanumeric characters only

In [47]:
'SZ000002'.isalnum()

True

In [48]:
'SZ.000002'.isalnum()

False

## Finding Unique Words

### Using set()

In [1]:
text1 = 'to be or not to be'

In [2]:
text2 = text1.split(' ')

In [3]:
len(text2)

6

In [4]:
text2

['to', 'be', 'or', 'not', 'to', 'be']

In [5]:
# A set keeps a single copy of each distinct word, so duplicates disappear.
# Sets are unordered: the displayed order need not match the list order.
text3 = set(text2)

In [6]:
len(text3)

4

In [7]:
text3

{'be', 'not', 'or', 'to'}

## String Operations

### Case conversions

In [14]:
s1 = 'text analysis'

In [15]:
s2 = s1.upper()

In [16]:
s2

'TEXT ANALYSIS'

In [17]:
s2.lower()

'text analysis'

In [18]:
# capitalize() upper-cases only the first character of the whole string
# and lower-cases the rest.
s1.capitalize()

'Text analysis'

In [19]:
# title() upper-cases the first letter of every word in the string.
s1.title()

'Text Analysis'

### From words to characters

In [20]:
s1 = 'cattcatt'

In [21]:
# Split at every 'a'. The separator itself is dropped; what remains is the
# list of pieces that were between the separators.
s2 = s1.split('a')

In [22]:
s2

['c', 'ttc', 'tt']

In [23]:
# join() is the inverse of split(): it glues the pieces back together with
# 'a' inserted between neighbours, recovering the original string.
'a'.join(s2)

'cattcatt'

In [24]:
list(s1)

['c', 'a', 't', 't', 'c', 'a', 't', 't']

In [25]:
# Iterating over a string yields its characters one at a time, so this
# comprehension produces the same list as list(s1).
[c for c in s1]

['c', 'a', 't', 't', 'c', 'a', 't', 't']

### String formatting

In [26]:
'{} {}'.format('hello', 'world')

'hello world'

In [27]:
# format() converts each argument to text before inserting it, so an int
# and a str can be combined without an explicit str() call.
'{} {}'.format(24, 'seconds')

'24 seconds'

In [28]:
# Unlike format(), the + operator does not convert types: adding an int to a
# str raises TypeError. The error is caught and printed so that it is still
# shown, but "Restart & Run All" no longer stops at this cell.
try:
    24 + 'seconds'
except TypeError as err:
    error_message = f'{type(err).__name__}: {err}'
    print(error_message)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [29]:
name = 'Eric'

In [30]:
age = 74

In [31]:
# f-strings (Python 3.6+) evaluate the expressions inside {} and insert
# their text form directly into the string.
f'Hello, {name}. You are {age}.'

'Hello, Eric. You are 74.'

### Strip

In [32]:
s1 = '  a quick brown fox jumped over the lazy dog  '

In [33]:
# strip() removes whitespace from both ends only; the spaces between the
# words are left untouched.
s2 = s1.strip()

In [34]:
s2

'a quick brown fox jumped over the lazy dog'

### Replace

In [37]:
# replace() substitutes every occurrence of 'o'. Strings are immutable, so
# a new string is returned and s2 itself is not modified.
s2.replace('o', 'O')

'a quick brOwn fOx jumped Over the lazy dOg'

### Translate

In [38]:
intab = 'aeiou'

In [39]:
outtab = '12345'

In [40]:
# Build a character-mapping table: the i-th character of intab is mapped to
# the i-th character of outtab (a->1, e->2, ...). Both strings must have
# the same length.
table = str.maketrans(intab, outtab)

In [41]:
s1 = 'aeiou-xmppp'

In [42]:
s1.translate(table)

'12345-xmppp'

In [43]:
# The optional third argument lists characters to delete: in addition to
# the vowel mapping, every 'x' and 'm' is removed by translate().
table_1 = str.maketrans(intab, outtab, 'xm')

In [44]:
s1.translate(table_1)

'12345-ppp'

## Index and Slice String

### Accessing characters by positive index number

In [59]:
s = 'Hello World!'

In [60]:
s[4]

'o'

### Accessing characters by negative index number

In [61]:
s[-3]

'l'

### Slicing strings

In [48]:
s[1:5]

'ello'

In [49]:
s[:5]

'Hello'

In [50]:
# Negative bounds count from the end. The stop index is exclusive, so the
# slice starts at the 4th-from-last character and ends just before the
# final '!'.
s[-4:-1]

'rld'

In [51]:
s[-2:]

'd!'

### Specifying stride while slicing strings

In [52]:
s[:5]

'Hello'

In [53]:
s[:5:1]

'Hello'

In [54]:
s[:5:2]

'Hlo'

In [55]:
# A stride of -1 walks the string backwards; with both bounds omitted this
# returns the whole string reversed.
s[::-1]

'!dlroW olleH'

### Using find

In [58]:
# find() returns the lowest index at which the substring occurs (the first
# 'o'), or -1 if it does not occur at all.
s.find('o')

4

In [57]:
# For a multi-character substring, find() returns the index where the
# match starts: the 'o' of 'World', not the earlier 'o' of 'Hello'.
s.find('or')

7

In [62]:
import csv

## Writing to and Reading from CSV File

### Writing to a csv file

In [67]:
# Create (or overwrite) test.csv with a single record of two fields.
# newline='' leaves line-ending handling to the csv module.
with open('test.csv', 'w', encoding='utf8', newline='') as csv_file:
    record = ('张三','北京')
    csv.writer(csv_file).writerow(record)

### Reading from a csv file

In [66]:
# Read test.csv back and print each record.
# The csv module expects the file object to be opened with newline='' so
# that it can interpret line endings (including newlines embedded inside
# quoted fields) itself; this also mirrors how the file was written.
with open('test.csv', 'r', encoding='utf8', newline='') as rf:
    r = csv.reader(rf)
    for row in r:
        # Each row is a list of strings: row[0] is the name, row[1] the address.
        print(f'姓名:{row[0]}, 住址:{row[1]}')

姓名:张三, 住址:北京
