In [1]:
# [a-zA-Z]{3}[ ]?[0-9]{4} -> 3 letters of any case, followed by 1 space (optional), followed by 4 nums!
# ?      means 0 or 1 occurrence
# +      means 1 or more occurrences
# *      means 0 or more occurrences
# ^      means start of the string (anchors the match at the beginning)
# $      means end of the string (anchors the match at the end)

In [2]:
# Sample sentence with prices written both with and without a space after the pound sign.
document = 'My eggs cost £ 3, bread cost £2, vodka cost £ 35'

# Price regex: a pound sign, an optional space, one digit, then an optional second digit.
pattern = '£ ?[0-9][0-9]?'

In [3]:
import re

In [4]:
# Return every non-overlapping match of the price pattern found in the sample sentence.
re.compile(pattern).findall(document)

['£ 3', '£2', '£ 35']

In [5]:
import pandas as pd

In [6]:
# Titanic passenger table hosted on GitHub; it is downloaded on every run of this cell.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/a-forty-two/diamler_17_jan/main/titanic.csv'
ti = pd.read_csv(TITANIC_CSV_URL)

# Peek at two randomly chosen rows.
ti.sample(n=2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
90,0,3,male,29.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
805,0,3,male,31.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


In [7]:
# Build a free-text "ticket" string per passenger from class, fare and embarkation town.
# The pieces are concatenated element-wise, so a missing value in any piece yields a missing ticket.
class_part = "Ticket: " + ti['class']
price_part = "; Price: $ " + ti['fare'].astype(str)
port_part = "; Port: " + ti['embark_town'] + ";"
ti['ticket'] = class_part + price_part + port_part

In [8]:
# Show the three source columns next to the generated ticket text for the first three rows.
ti.head(3)[['class', 'fare', 'embark_town', 'ticket']]

Unnamed: 0,class,fare,embark_town,ticket
0,Third,7.25,Southampton,Ticket: Third; Price: $ 7.25; Port: Southampton;
1,First,71.2833,Cherbourg,Ticket: First; Price: $ 71.2833; Port: Cherbourg;
2,Third,7.925,Southampton,Ticket: Third; Price: $ 7.925; Port: Southampton;


In [9]:
# Alternation: either "First" or "Second".
pattern = '(First|Second)'

# Boolean Series: True where the class value matches the pattern from its first character.
ti['class'].str.match(pat=pattern)

0      False
1       True
2      False
3       True
4      False
       ...  
886     True
887     True
888    False
889     True
890    False
Name: class, Length: 891, dtype: bool

In [10]:
# Mean of `survived` over the rows whose class matches the current `pattern`.
class_mask = ti['class'].str.match(pat=pattern)
ti.loc[class_mask, 'survived'].mean()

0.5575

In [11]:
# Same calculation as for first/second class, now restricted to third class.
pattern = '(Third)'
third_class_mask = ti['class'].str.match(pat=pattern)
ti.loc[third_class_mask, 'survived'].mean()

0.24236252545824846

In [12]:
# Reminder of what the ticket text looks like before extracting pieces from it.
ti.loc[:, ['class', 'fare', 'embark_town', 'ticket']].head(n=3)

Unnamed: 0,class,fare,embark_town,ticket
0,Third,7.25,Southampton,Ticket: Third; Price: $ 7.25; Port: Southampton;
1,First,71.2833,Cherbourg,Ticket: First; Price: $ 71.2833; Port: Cherbourg;
2,Third,7.925,Southampton,Ticket: Third; Price: $ 7.925; Port: Southampton;


In [13]:
# Demo only: capture the first run of digits and/or dots in each ticket.
pattern = '([0-9.]+)'

ti['ticket'].str.extract(pat=pattern).sample(n=4)

Unnamed: 0,0
362,14.4542
385,73.5
796,25.9292
664,7.925


In [14]:
# Swap the currency symbol. As a regular expression "$" is the end-of-string anchor,
# and the default value of `regex` differs between pandas versions, so request a
# literal (non-regex) replacement explicitly.
ti['ticket'].str.replace('$', '€', regex=False).sample(1)

776    Ticket: Third; Price: € 7.75; Port: Queenstown;
Name: ticket, dtype: object

In [15]:
# Two capture groups: column 0 holds "Ticket: <class>", column 1 holds just the class name.
# Tickets that are neither First nor Second produce missing values in both columns.
ti['ticket'].str.extract(pat='(Ticket: (First|Second))')

Unnamed: 0,0,1
0,,
1,Ticket: First,First
2,,
3,Ticket: First,First
4,,
...,...,...
886,Ticket: Second,Second
887,Ticket: First,First
888,,
889,Ticket: First,First


In [16]:
# Capture a space followed by exactly two digits (the captured text includes the space).
ti['ticket'].str.extract(pat='( [0-9][0-9])')

Unnamed: 0,0
0,
1,71
2,
3,53
4,
...,...
886,13
887,30
888,23
889,30


In [17]:
# Look at one randomly chosen ticket string.
ti['ticket'].sample(n=1)

227    Ticket: Third; Price: $ 7.25; Port: Southampton;
Name: ticket, dtype: object

In [18]:
# Capture "Ticket: " plus the single uppercase letter that follows it.
ti['ticket'].str.extract(pat='(Ticket: [A-Z])').sample(n=2)

Unnamed: 0,0
614,Ticket: T
811,Ticket: T


In [19]:
# Each "." matches any single character: capture "T" and the eight characters after it.
ti['ticket'].str.extract(pat='(T........)').sample(n=3)

Unnamed: 0,0
12,Ticket: T
583,Ticket: F
283,Ticket: T


In [20]:
# "[^0-9A-Za-z]" is a negated class: one character that is neither a digit nor a letter.
# After it come a space and any two characters.
ti['ticket'].str.extract(pat='(Price: [^0-9A-Za-z] ..)').sample(n=3)

Unnamed: 0,0
443,Price: $ 13
7,Price: $ 21
313,Price: $ 7.


In [21]:
# Column 0 holds "Port: <town>", column 1 just the town; only Cherbourg and Southampton are listed,
# so tickets with any other port produce missing values.
ti['ticket'].str.extract(pat='(Port: (Cherbourg|Southampton))').sample(n=3)

Unnamed: 0,0,1
597,Port: Southampton,Southampton
164,Port: Southampton,Southampton
525,,


In [22]:
# Capture the decimal number in each ticket: one or more digits, a literal dot, then any digits.
# The dot is escaped because a bare "." matches any character, and "[0-9]+" is used for the
# integer part so that numbers with three or more leading digits (e.g. 263.0) are captured
# whole instead of being cut off before the decimal point.
ti['ticket'].str.extract(r'([0-9]+\.[0-9]*)').sample(3)

Unnamed: 0,0
161,15.75
313,7.8958
340,26.0


In [23]:
# "+" repeats the letter class, so the whole class word after "Ticket: " is captured.
ti['ticket'].str.extract(pat='(Ticket: [a-zA-Z]+)').sample(n=3)

Unnamed: 0,0
471,Ticket: Third
75,Ticket: Third
542,Ticket: Third


In [24]:
row = 0
match = 1  # match numbers start at 0, so this is the second match

# extractall yields one row per match, indexed by (original row label, match number).
label_value_pairs = ti['ticket'].str.extractall(pat='([a-zA-Z]+: [a-zA-Z]+)')

# Select a single match by its (row, match) index key.
label_value_pairs.loc[(row, match)]

0    Port: Southampton
Name: (0, 1), dtype: object

In [25]:
# "w?" makes the "w" optional, so this captures words ending in "ton" or "town".
ti['ticket'].str.extract(pat='([a-zA-Z]+tow?n)')

Unnamed: 0,0
0,Southampton
1,
2,Southampton
3,Southampton
4,Southampton
...,...
886,Southampton
887,Southampton
888,Southampton
889,


In [26]:
# Capture the dollar price: a literal "$", a space, digits, a literal ".", digits.
# The pattern is a raw string so the backslashes reach the regex engine unchanged;
# in a normal string literal "\$" and "\." are invalid escape sequences and trigger a warning.
ti['ticket'].str.extract(r'(\$ [0-9]+\.[0-9]+)').sample(2)

Unnamed: 0,0
825,$ 6.95
215,$ 113.275


In [27]:
# "$" anchors the match at the end of the string: only a "label: value;" pair that ends the ticket is captured.
ti['ticket'].str.extractall(pat='([a-zA-Z]+: [a-zA-Z]+;$)').sample(n=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
344,0,Port: Southampton;


In [28]:
# "^" anchors the match at the start of the string: only a "label: value;" pair that begins the ticket is captured.
ti['ticket'].str.extractall(pat='(^[a-zA-Z]+: [a-zA-Z]+;)').sample(n=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
718,0,Ticket: Third;


Exercise (30 min)
find all the words in the tickets

HINT: a word is one or more letters followed by a space or a colon
HINT: [ :] means a space or a colon
find all the USD prices

HINT: \$ and repeated numbers
find all the high-price tickets

HINT: consider \$, a triple-digit number, and \.

In [38]:
# Exercise 1: a "word" here is a run of letters immediately followed by a space or a colon.
# findall returns, per ticket, a list of all such matches; 10 random tickets are shown.
word_pattern = '([a-zA-Z]+[ :])'
tickets = ti['ticket'].str.findall(word_pattern).sample(n=10)
print(tickets)

735    [Ticket:, Price:, Port:]
827    [Ticket:, Price:, Port:]
90     [Ticket:, Price:, Port:]
259    [Ticket:, Price:, Port:]
664    [Ticket:, Price:, Port:]
47     [Ticket:, Price:, Port:]
373    [Ticket:, Price:, Port:]
154    [Ticket:, Price:, Port:]
809    [Ticket:, Price:, Port:]
296    [Ticket:, Price:, Port:]
Name: ticket, dtype: object


In [39]:
# Exercise 2: capture the USD price: a literal "$", a space, digits, a literal ".", digits.
# The pattern is a raw string so the backslashes reach the regex engine unchanged;
# in a normal string literal "\$" and "\." are invalid escape sequences and trigger a warning.
ti['ticket'].str.extract(r'(\$ [0-9]+\.[0-9]+)').sample(2)

Unnamed: 0,0
687,$ 10.1708
225,$ 9.35


In [40]:
# NOTE(review): `row` and `match` are not used by the expression below; they are kept
# in case a cell outside this view relies on them.
row = 0
match = 1 # second match

# Exercise 3: high-price tickets are those with at least three digits before the decimal point.
# The pattern is a raw string so the backslashes reach the regex engine unchanged;
# in a normal string literal "\$" and "\." are invalid escape sequences and trigger a warning.
ti['ticket'].str.extractall(r'(\$ [0-9][0-9][0-9]+\.[0-9]+)')

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
27,0,$ 263.0
31,0,$ 146.5208
88,0,$ 263.0
118,0,$ 247.5208
195,0,$ 146.5208
215,0,$ 113.275
258,0,$ 512.3292
268,0,$ 153.4625
269,0,$ 135.6333
297,0,$ 151.55
