# RegEx in Python

In [1]:
import numpy as np
import pandas as pd
import re

* A Regular Expression (RegEx) is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern.

**Raw String**

* yazılan string değerin ham halini almak için (raw) string başına r ekleriz

In [1]:
my_string = "Hello \nWorld"
print(my_string)

Hello 
World


In [2]:
my_string=r"Hello \nWorld"
print(my_string)

Hello \nWorld


In [4]:
print(r"Backslash: \\")
print(r"New line char: \\n")

Backslash: \\
New line char: \\n


In [5]:
print("Backslash: \\")
print("New line char: \\n")

Backslash: \
New line char: \n


**Invalid Raw String**

In [6]:
#print("\") # gives an error
print("\") # gives an error

SyntaxError: EOL while scanning string literal (<ipython-input-6-6bd1a21bbb40>, line 2)

In [7]:
#print(r"\") # gives an error
print(r"\") # gives an error

SyntaxError: EOL while scanning string literal (<ipython-input-7-b380e1fd9583>, line 2)

In [8]:
#print(r"abc\") # gives an error
print(r"abc\") # gives an error

SyntaxError: EOL while scanning string literal (<ipython-input-8-c620f62d6c72>, line 2)

In [12]:
#print(r"abc\\\)" # gives an error
print(r"abc\\\)" # gives an error

SyntaxError: unexpected EOF while parsing (<ipython-input-12-3da217326b24>, line 2)

In [15]:
print(r"abc\\\ ")

abc\\\ 


# Common Python RegEx Functions

* **re.search():** Scan through string looking for a match to the pattern.
* **re.match():** Try to apply the pattern at the start of the string.
* **re.fullmatch():** Try to apply the pattern to all of the string.
* **re.findall():** Return a list of all non-overlapping matches in the string.
* **re.sub():** Return the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in string by the replacement repl.
* **re.split():** Split the source string by the occurrences of the pattern, returning a list containing the resulting substrings.

* shows all functions/methods in re module

In [16]:
dir(re)


['A',
 'ASCII',
 'DEBUG',
 'DOTALL',
 'I',
 'IGNORECASE',
 'L',
 'LOCALE',
 'M',
 'MULTILINE',
 'Match',
 'Pattern',
 'RegexFlag',
 'S',
 'Scanner',
 'T',
 'TEMPLATE',
 'U',
 'UNICODE',
 'VERBOSE',
 'X',
 '_MAXCACHE',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '__version__',
 '_cache',
 '_compile',
 '_compile_repl',
 '_expand',
 '_locale',
 '_pickle',
 '_special_chars_map',
 '_subx',
 'compile',
 'copyreg',
 'enum',
 'error',
 'escape',
 'findall',
 'finditer',
 'fullmatch',
 'functools',
 'match',
 'purge',
 'search',
 'split',
 'sre_compile',
 'sre_parse',
 'sub',
 'subn',
 'template']

In [20]:
method = re.match
help(method)

Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a Match object, or None if no match was found.



**re.search(pattern, string, flags=0)**

Scan through string looking for a match to the pattern, returning a Match object, or None if no match was found

In [21]:
text = "A78L41K"

* **find numeric digits with search function**

In [25]:
re.search("78",text)

<re.Match object; span=(1, 3), match='78'>

In [27]:
re.search("A",text)

<re.Match object; span=(0, 1), match='A'>

* **with regular expression**

In [28]:
re.search("\d\d",text)

#text in içini arıyor gördüğü ilk digit ifadeyi getiriyor. text içerisinde 41 var ancak getirmedi

<re.Match object; span=(1, 3), match='78'>

* **with compile method**

compile yapmak işimizi hızlandırır. Düzenlenmiş ifade arar ve bu bize hız kazandırır. 
Daha çok developerların işine yarar.

In [29]:
comp= re.compile("\d\d")

In [30]:
comp.search(text)

<re.Match object; span=(1, 3), match='78'>

In [31]:
num = comp.search(text)
num

<re.Match object; span=(1, 3), match='78'>

In [32]:
num.start()

1

In [33]:
num.end()

3

In [34]:
num.span()

(1, 3)

In [36]:
num.group()

#group hangi ifade ile eşleştiyse onu getirir

'78'

* **Find non decimal digits with search function**

In [37]:
text = "8PM19MIN"

In [38]:
re.search("\D",text)

<re.Match object; span=(1, 2), match='P'>

In [39]:
re.search("\D\D",text)

<re.Match object; span=(1, 3), match='PM'>

In [40]:
re.search("\D\D\D",text)

<re.Match object; span=(5, 8), match='MIN'>

In [41]:
re.search("\D+",text)

<re.Match object; span=(1, 3), match='PM'>

In [42]:
re.search("\D*",text)

#hiçbir şey yakalayamadı. 

<re.Match object; span=(0, 0), match=''>

* **Find phone number pattern**

In [44]:
text = 'My phone number is 1234567890'

In [46]:
re.search("\d\d\d\d\d\d\d\d\d\d",text)

<re.Match object; span=(19, 29), match='1234567890'>

In [47]:
re.search("\d+",text)

<re.Match object; span=(19, 29), match='1234567890'>

In [48]:
re.search("\d*",text)

<re.Match object; span=(0, 0), match=''>

In [55]:
text = 'My phone number is 123 456 7890'

In [56]:
re.search("\d\d\d \d\d\d \d\d\d\d",text)

<re.Match object; span=(19, 31), match='123 456 7890'>

In [57]:
text = 'My phone number is 123-456-7890'

In [58]:
re.search("\d\d\d-\d\d\d-\d\d\d\d",text)

<re.Match object; span=(19, 31), match='123-456-7890'>

In [59]:
re.search('\d'*3 + '-' + '\d'*3 + '-' + '\d'*4, text)

<re.Match object; span=(19, 31), match='123-456-7890'>

* **Find phone number pattern by grouping**

In [60]:
text

'My phone number is 123-456-7890'

In [61]:
re.search("(\d\d\d)-(\d\d\d)-(\d\d\d\d)",text)

<re.Match object; span=(19, 31), match='123-456-7890'>

In [62]:
telno=re.search("(\d\d\d)-(\d\d\d)-(\d\d\d\d)",text)
telno

<re.Match object; span=(19, 31), match='123-456-7890'>

In [63]:
telno.group()

'123-456-7890'

In [64]:
telno.group(1)

'123'

In [65]:
telno.group(3)

'7890'

In [66]:
telno.group(4)

IndexError: no such group

In [67]:
telno = re.search("(\d*)-(\d*)-(\d*)",text)

In [68]:
telno

<re.Match object; span=(19, 31), match='123-456-7890'>

In [69]:
telno.group()

'123-456-7890'

In [70]:
telno = re.search("(\d+)-(\d+)-(\d+)",text)
telno

<re.Match object; span=(19, 31), match='123-456-7890'>

In [71]:
telno.group()

'123-456-7890'

**Escaping parentheses and create 2 group -> first group:(415) second group:555-1212 print**

In [72]:
text = 'My phone number is (415) 555-1212'

In [73]:
re.search("(\d\d\d)-(\d\d\d)-(\d\d\d\d)",text)

#bu şekilde bir şey yakalayamaz

In [78]:
re.search("(\(\d\d\d\)) (\d\d\d-\d\d\d\d)", text)

<re.Match object; span=(19, 33), match='(415) 555-1212'>

In [79]:
telno =re.search("(\(\d\d\d\)) (\d\d\d-\d\d\d\d)", text)

print(telno.group(1))
print(telno.group(2))

(415)
555-1212


**re.match(pattern, string, flags=0)**

* Try to apply the pattern at the start of the string, returning a Match object, or None if no match was found.

* If you want to locate a match anywhere in string, use search() instead of match()

In [84]:
text = "A78L41K"

In [None]:
# num = re.match("\d\d", text) # gives an error, because match function just look at the begining of string 
# num.group()

In [85]:
re.match("\d\d",text)

In [83]:
alp = re.match("\D\d\d", text)
alp.group()

'A78'

In [86]:
re.match("\w\d\d",text)

#match her durumda en baştakini aldığı için baştan itibaren getirir

<re.Match object; span=(0, 3), match='A78'>

**re.fullmatch(pattern, string, flags=0)**

In [None]:
#ilgili satırın tamamını kapsaması gerek, birebir karşılalı text ile yazdığımız pattern

In [87]:
text = "A78L41K"

In [88]:
alpnum = re.fullmatch("\D\d+\D\d+\D", text)
alpnum

<re.Match object; span=(0, 7), match='A78L41K'>

In [89]:
alpnum = re.fullmatch("\D\d+\D\d+", text)
alpnum

#sonuçta bir şey döndürmez çünkü text sonundaki harfi alamıyoruz

In [90]:
alpnum = re.fullmatch("\w\d+\w\d+\w", text)
alpnum.group()

'A78L41K'

**re.findall(pattern, string, flags=0)**

In [91]:
text = "O 1, t 10, o 100. 100000"


In [95]:
re.findall("\d", text)

['1', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0']

In [92]:
re.findall("\d{1}", text)

['1', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0']

In [93]:

re.findall("\d{2}", text)

['10', '10', '10', '00', '00']

In [94]:
re.findall("\d{3}", text)

['100', '100', '000']

In [96]:
re.findall("\d{4}", text)

['1000']

In [97]:
re.findall("\d{1,6}", text)

['1', '10', '100', '100000']

In [98]:
re.findall("\d+", text)

['1', '10', '100', '100000']

**Extract words begining with "f"**

In [100]:
text = 'which foot or hand fell fastest'

In [101]:
re.findall("f[a-z]*", text)

['foot', 'fell', 'fastest']

In [102]:
re.findall("h[a-z]*", text)

['hich', 'hand']

In [103]:
re.findall("w[a-z]*", text)

['which']

**Extract equations made up of words and numbers**

In [104]:

text = 'set width=20 and height=10'

In [105]:
re.findall('\w+=\d+', text)

['width=20', 'height=10']

In [106]:

re.findall('(\w+)=(\d+)', text)

[('width', '20'), ('height', '10')]

**Check if the string starts with 'hello'**

In [107]:
text = "hello world"

In [108]:
re.findall("^hello", text)

['hello']

In [109]:
re.match("^hello", text).group()

'hello'

In [110]:
re.search("^hello", text).group()

'hello'

**Check if the string ends with 'world'**

In [111]:

re.findall("world$",text)

['world']

**re.sub(pattern, repl, string, count=0, flags=0)**

Return the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in string by the replacement repl. repl can be either a string or a callable; if a string, backslash escapes in it are processed. If it is a callable, it's passed the Match object and must return a replacement string to be used.

* Remove anything other than digits

In [112]:

text = "2004-959-559 # This is Phone Number"

In [113]:
re.sub("\D", "", text)

'2004959559'

* Remove digits and replace with "."

In [114]:
re.sub("\d", ".", text)

'....-...-... # This is Phone Number'

In [115]:
re.sub("\d", ".", text, count=4)

'....-959-559 # This is Phone Number'

In [116]:
pd.Series(text).str.replace("\d", ".", regex=True)

0    ....-...-... # This is Phone Number
dtype: object

* Remove anything orther than digit

In [2]:
text = "2004-959-559 # this is phone number" 

In [3]:
re.sub("\D","",text)

'2004959559'

* remove digits and replace with "."

In [4]:
re.sub("\d",".",text)

'....-...-... # this is phone number'

In [5]:
re.sub("\d",".",text,count=4)

#count parametresi baştan kaç karakteri değiştirmseini istiyorsak yazarız

'....-959-559 # this is phone number'

In [6]:
pd.Series(text).str.replace("\d",".",regex=True)

0    ....-...-... # this is phone number
dtype: object

**re.split(pattern, string, maxsplit=0, flags=0)**

In [None]:
#sadece sayıları alıp liste içerisinde göstermeye çalışalım. 

In [7]:
text = "ab56cd78_de fg3hıi49"

In [8]:
re.split("\D", text)

['', '', '56', '', '78', '', '', '', '', '', '3', '', '', '49']

In [9]:
re.split("\D+", text)

['', '56', '78', '3', '49']

In [10]:
re.findall("\d+", text)

#findall kullanarak yapmak daha mantıklı olur

['56', '78', '3', '49']

# RegEx için kullanacağımız Pandas Fonksiyonları

* **count()**: Count occurrences of pattern in each string of the Series/Index
* **replace():** Replace the search string or pattern with the given value
* **contains():** Test if pattern or regex is contained within a string of a Series or Index. Calls re.search() and returns a boolean
* **findall():** Find all occurrences of pattern or regular expression in the Series/Index. Equivalent to applying re.findall() on all elements
* **match():** Determine if each string matches a regular expression. Calls re.match() and returns a boolean
* **split():** Split strings around given separator/delimiter and accepts string or regular expression to split on
* **extract()**: Extract capture groups in the regex pat as columns in a DataFrame and returns the captured groups

In [29]:
data = [['Evert van Dijk', 'Carmine-pink, salmon-pink streaks, stripes, flecks. #94569# Warm pink, clear carmine pink, rose pink shaded salmon.  Mild fragrance.  Large, very double, in small clusters, high-centered bloom form.  Blooms in flushes throughout the season.'],
        ['Every Good Gift', 'Red.  Flowers velvety red.  #079463895689# Moderate fragrance.  Average diameter 4".  Medium-large, full (26-40 petals), borne mostly solitary bloom form.  Blooms in flushes throughout the season.'], 
        ['Evghenya', 'Orange-pink.  75 petals.  Large, very double #68345_686# bloom form.  Blooms in flushes throughout the season.'], 
        ['Evita', 'White or white blend.  None to mild fragrance.  35 petals #9897#.  Large, full (26-40 petals), high-centered bloom form.  Blooms in flushes throughout the season.'],
        ['Evrathin', 'Light pink. [Deep pink.]  Outer petals white. Expand rarely #679754YH89#.  Mild fragrance.  35 to 40 petals.  Average diameter 2.5".  Medium, double (17-25 petals), full (26-40 petals), cluster-flowered, in small clusters bloom form.  Prolific, once-blooming spring or summer.  Glandular sepals, leafy sepals, long sepals buds.'],
        ['Evita 2', 'White, blush shading.  Mild, wild rose fragrance #AGHJS876IOP#.  20 to 25 petals.  Average diameter 1.25".  Small, very double, cluster-flowered bloom form.  Blooms in flushes throughout the season.']]
  
df = pd.DataFrame(data, columns = ['name', 'bloom']) 
df 

Unnamed: 0,name,bloom
0,Evert van Dijk,"Carmine-pink, salmon-pink streaks, stripes, fl..."
1,Every Good Gift,Red. Flowers velvety red. #079463895689# Mod...
2,Evghenya,"Orange-pink. 75 petals. Large, very double #..."
3,Evita,White or white blend. None to mild fragrance....
4,Evrathin,Light pink. [Deep pink.] Outer petals white. ...
5,Evita 2,"White, blush shading. Mild, wild rose fragran..."


**pandas.Series.str.count(pat, flags=0)**

In [30]:
df.bloom[0]

'Carmine-pink, salmon-pink streaks, stripes, flecks. #94569# Warm pink, clear carmine pink, rose pink shaded salmon.  Mild fragrance.  Large, very double, in small clusters, high-centered bloom form.  Blooms in flushes throughout the season.'

In [31]:
df.bloom.str.count("\d+")

#digit ifadeleri saydı

0     1
1     4
2     3
3     4
4    10
5     5
Name: bloom, dtype: int64

In [32]:
#kaç tane karakter oldupunu bulmak için 

df.bloom.apply(len)

0    240
1    196
2    110
3    162
4    327
5    198
Name: bloom, dtype: int64

In [33]:
df.bloom.str.count(".")

#kaç karakter olduğunu bulduk

0    240
1    196
2    110
3    162
4    327
5    198
Name: bloom, dtype: int64

In [34]:
#paragraf kaç cümleden oluşuyor. Cümlelerin noktayla bittiğini düşünüp "." gibi bir parametreyle saydırıyoruz
#örneğin yeni bir satırda yeni bir cümle başlamış ise farklı bir parametre vermemiz gerekirdi
df.bloom.str.count("\.")

0     5
1     6
2     4
3     5
4    11
5     7
Name: bloom, dtype: int64

**pandas.Series.str.replace(pat, repl, n=- 1, case=None, flags=0, regex=None)**

In [35]:
#verilen data içerisinde # karakterleri var ve bu karakterlerin içerisindeki datayı bulmak istiyoruz ve çıkarmak istiyoruz

df["bloom"] = df.bloom.str.replace("#\S+#","", regex=True)

#iki # karakteri arasında ne varsa onu kaldırdı ve # karakterlerini de kaldırdı

In [36]:
df.bloom[0]

'Carmine-pink, salmon-pink streaks, stripes, flecks.  Warm pink, clear carmine pink, rose pink shaded salmon.  Mild fragrance.  Large, very double, in small clusters, high-centered bloom form.  Blooms in flushes throughout the season.'

In [37]:
df.bloom.str.count("#\S+#")

0    0
1    0
2    0
3    0
4    0
5    0
Name: bloom, dtype: int64

In [21]:
df.bloom.str.replace("C","K")

0    Karmine-pink, salmon-pink streaks, stripes, fl...
1    Red.  Flowers velvety red.   Moderate fragranc...
2    Orange-pink.  75 petals.  Large, very double  ...
3    White or white blend.  None to mild fragrance....
4    Light pink. [Deep pink.]  Outer petals white. ...
5    White, blush shading.  Mild, wild rose fragran...
Name: bloom, dtype: object

In [39]:
df.bloom[1]

'Red.  Flowers velvety red.   Moderate fragrance.  Average diameter 4".  Medium-large, full (26-40 petals), borne mostly solitary bloom form.  Blooms in flushes throughout the season.'

In [41]:
df.bloom.str.replace("\d","3333",regex=True)

0    Carmine-pink, salmon-pink streaks, stripes, fl...
1    Red.  Flowers velvety red.   Moderate fragranc...
2    Orange-pink.  33333333 petals.  Large, very do...
3    White or white blend.  None to mild fragrance....
4    Light pink. [Deep pink.]  Outer petals white. ...
5    White, blush shading.  Mild, wild rose fragran...
Name: bloom, dtype: object

In [43]:
a = df.bloom.str.replace("\D","5555",regex=True)

In [47]:
a[1]

'55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555554555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555265555405555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555'

In [48]:
df.bloom.str.replace("\w","ektra",regex=True)

0    ektraektraektraektraektraektraektra-ektraektra...
1    ektraektraektra.  ektraektraektraektraektraekt...
2    ektraektraektraektraektraektra-ektraektraektra...
3    ektraektraektraektraektra ektraektra ektraektr...
4    ektraektraektraektraektra ektraektraektraektra...
5    ektraektraektraektraektra, ektraektraektraektr...
Name: bloom, dtype: object

**pandas.Series.str.contains(pat, case=True, flags=0, na=None, regex=True)**

In [68]:
df.bloom[1]

'Red.  Flowers velvety red.   Moderate fragrance.  Average diameter 4".  Medium-large, full (26-40 petals), borne mostly solitary bloom form.  Blooms in flushes throughout the season.'

In [67]:
df.bloom.str.contains("diameter")

0    False
1     True
2    False
3    False
4     True
5     True
Name: bloom, dtype: bool

In [69]:
df.bloom.str.contains('\d+"')

#diameter olarak belirtilmese de bu bilgi içeriği öenmli olabilir bu yüzden digit olan ifaadelerin bulundupu ve " ile biten kısımları
#getirdik

0    False
1     True
2    False
3    False
4     True
5     True
Name: bloom, dtype: bool

In [71]:
df.loc[df.bloom.str.contains('\d+"')]

Unnamed: 0,name,bloom
1,Every Good Gift,Red. Flowers velvety red. Moderate fragranc...
4,Evrathin,Light pink. [Deep pink.] Outer petals white. ...
5,Evita 2,"White, blush shading. Mild, wild rose fragran..."


In [70]:
df[df.bloom.str.contains('\d+"')]

Unnamed: 0,name,bloom
1,Every Good Gift,Red. Flowers velvety red. Moderate fragranc...
4,Evrathin,Light pink. [Deep pink.] Outer petals white. ...
5,Evita 2,"White, blush shading. Mild, wild rose fragran..."


**pandas.Series.str.findall(pat, flags=0)**

In [72]:
df.bloom.str.findall("\d+")

#her satırda geçen sayısal değerleri gördük

0                                []
1                       [4, 26, 40]
2                              [75]
3                      [35, 26, 40]
4    [35, 40, 2, 5, 17, 25, 26, 40]
5                   [20, 25, 1, 25]
Name: bloom, dtype: object

In [73]:
df.bloom.str.findall('\d+\.\d+"|\d+"')

#sol tarafında bir veya birden fazla sayı olabilir . araya nokta koyduk. sağ tarafta da bir veya daha fazla sayı olabilir. 
#bu duruma uygun olan digit ifadeleri getirir
#sonrasında  | paramateresi de veya olarak kullanıldı . Veya dan sonra gelen ifade de sadece tam sayı olanları getirsin 

0         []
1       [4"]
2         []
3         []
4     [2.5"]
5    [1.25"]
Name: bloom, dtype: object

In [75]:
df.bloom.str.findall('\d+"|\d+\.\d+"')

0         []
1       [4"]
2         []
3         []
4     [2.5"]
5    [1.25"]
Name: bloom, dtype: object

**pandas.Series.str.match(pat, case=True, flags=0, na=None)**

In [76]:
df.bloom
#Aşağıda genel olarak renk ile başladğını görüyoruz.

0    Carmine-pink, salmon-pink streaks, stripes, fl...
1    Red.  Flowers velvety red.   Moderate fragranc...
2    Orange-pink.  75 petals.  Large, very double  ...
3    White or white blend.  None to mild fragrance....
4    Light pink. [Deep pink.]  Outer petals white. ...
5    White, blush shading.  Mild, wild rose fragran...
Name: bloom, dtype: object

In [80]:
#hangisi pembe renkli ?
#Pembe renk yazanları getirmek istiyoruz ancak pembe renk sadece pink şeklinde yazılmamış. ÇEşitleri de yazılmış.

df.bloom.str.match('pink|\w+-pink|\w+ pink')



0     True
1    False
2     True
3    False
4     True
5    False
Name: bloom, dtype: bool

In [82]:
df.bloom.str.match("pink|\w+[- ]?pink")

#[] içerisindeki karakterler olabilir anlamına geliyor. ? işaretini de kullandık çünkü olsa da olmasa da olur. Hata vermez

0     True
1    False
2     True
3    False
4     True
5    False
Name: bloom, dtype: bool

In [83]:
df.bloom.str.match(".+pink")

0     True
1    False
2     True
3    False
4     True
5    False
Name: bloom, dtype: bool

In [81]:
df.bloom.str.contains("pink")

#contains fonksiyonu direkt pink aratır ve getirir. 

0     True
1    False
2     True
3    False
4     True
5    False
Name: bloom, dtype: bool

**pandas.Series.str.split(pat=None, n=- 1, expand=False, *, regex=None)**

In [84]:
df.bloom.str.split("\. ")

0    [Carmine-pink, salmon-pink streaks, stripes, f...
1    [Red,  Flowers velvety red,   Moderate fragran...
2    [Orange-pink,  75 petals,  Large, very double ...
3    [White or white blend,  None to mild fragrance...
4    [Light pink, [Deep pink.]  Outer petals white,...
5    [White, blush shading,  Mild, wild rose fragra...
Name: bloom, dtype: object

In [85]:
df.bloom.str.split("\. ",expand = True)

#expand split ile bölünmüş değerleri ayrı sütunlara atamayı sağlar

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,"Carmine-pink, salmon-pink streaks, stripes, fl...","Warm pink, clear carmine pink, rose pink shad...",Mild fragrance,"Large, very double, in small clusters, high-c...",Blooms in flushes throughout the season.,,,,
1,Red,Flowers velvety red,Moderate fragrance,"Average diameter 4""","Medium-large, full (26-40 petals), borne most...",Blooms in flushes throughout the season.,,,
2,Orange-pink,75 petals,"Large, very double bloom form",Blooms in flushes throughout the season.,,,,,
3,White or white blend,None to mild fragrance,35 petals,"Large, full (26-40 petals), high-centered blo...",Blooms in flushes throughout the season.,,,,
4,Light pink,[Deep pink.] Outer petals white,Expand rarely,Mild fragrance,35 to 40 petals,"Average diameter 2.5""","Medium, double (17-25 petals), full (26-40 pe...","Prolific, once-blooming spring or summer","Glandular sepals, leafy sepals, long sepals b..."
5,"White, blush shading","Mild, wild rose fragrance",20 to 25 petals,"Average diameter 1.25""","Small, very double, cluster-flowered bloom form",Blooms in flushes throughout the season.,,,


In [86]:
info = ["id:345, age:25, salary:1200", "id:346, age:32, salary:1500", "id:347, age:28, salary:1400"]
s = pd.Series(info)
s

0    id:345, age:25, salary:1200
1    id:346, age:32, salary:1500
2    id:347, age:28, salary:1400
dtype: object

In [87]:
s.str.split("\D+", expand = True)

Unnamed: 0,0,1,2,3
0,,345,25,1200
1,,346,32,1500
2,,347,28,1400


In [88]:
df= s.str.split("\D+", expand = True).iloc[:,1:]

In [89]:
df

Unnamed: 0,1,2,3
0,345,25,1200
1,346,32,1500
2,347,28,1400


**pandas.Series.str.extract(pat, flags=0, expand=True)**

In [90]:
s = pd.Series(['a3aa', 'b4aa', 'c5aa'])
s

0    a3aa
1    b4aa
2    c5aa
dtype: object

extract kullanırken grup kullanmamız lazım. Gruplarla çalışmamız lazım. Bu yüzden "(\d)" ifadesini kullandık
extract kullanırken gruplandırma yaptığımız için yaptığımız her grup  bize bir sütun olarak döner. Yani bunu da parantez gibi düşünebiliriz.

In [91]:
s.str.extract("(\d)")

Unnamed: 0,0
0,3
1,4
2,5


In [92]:
s.str.extract("(\D)\d(\D+)")

#(\D) ile grubu temsil ediyor. nümerik olmayan değeri alıyor. 
#\d parantez içinde değil bu yüzden ayrı bir sütun olarak almadık onu
#(\D+) ile de nümerik olmayan kalan harfleri gruplandırmış olduk

Unnamed: 0,0,1
0,a,aa
1,b,aa
2,c,aa


In [93]:
s.str.extract("(\D)\d(\D)(\D)")

Unnamed: 0,0,1,2
0,a,a,a
1,b,a,a
2,c,a,a


In [94]:
s.str.extract("(\w)\d(\w)(\w)")

Unnamed: 0,0,1,2
0,a,a,a
1,b,a,a
2,c,a,a


In [95]:
info = ["id:345, age:25, salary:1200", "id:346, age:32, salary:1500", "id:347, age:28, salary:1400"]
s = pd.Series(info)
s

0    id:345, age:25, salary:1200
1    id:346, age:32, salary:1500
2    id:347, age:28, salary:1400
dtype: object

In [96]:
df = s.str.extract("(\d+)\D+(\d+)\D+(\d+)")
df

Unnamed: 0,0,1,2
0,345,25,1200
1,346,32,1500
2,347,28,1400


In [97]:
df.columns = ["id", "age", "salary"]
df

Unnamed: 0,id,age,salary
0,345,25,1200
1,346,32,1500
2,347,28,1400


In [98]:
s.str.findall("(\d+)")

0    [345, 25, 1200]
1    [346, 32, 1500]
2    [347, 28, 1400]
dtype: object

In [99]:
s= pd.Series(['40 l/100 km (comb)', 
        '38 l/100 km (comb)', '6.4 l/100 km (comb)',
       '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)',
       '5.4 l/100 km (comb)', '6.7 l/100 km (comb)',
       '6.2 l/100 km (comb)', '7.3 l/100 km (comb)',
       '6.3 l/100 km (comb)', '5.7 l/100 km (comb)',
       '6.1 l/100 km (comb)', '6.8 l/100 km (comb)',
       '7.5 l/100 km (comb)', '7.4 l/100 km (comb)',
       '3.6 kg/100 km (comb)', '0 l/100 km (comb)', 
       '7.8 l/100 km (comb)'])
s

0       40 l/100 km (comb)
1       38 l/100 km (comb)
2      6.4 l/100 km (comb)
3     8.3 kg/100 km (comb)
4     5.1 kg/100 km (comb)
5      5.4 l/100 km (comb)
6      6.7 l/100 km (comb)
7      6.2 l/100 km (comb)
8      7.3 l/100 km (comb)
9      6.3 l/100 km (comb)
10     5.7 l/100 km (comb)
11     6.1 l/100 km (comb)
12     6.8 l/100 km (comb)
13     7.5 l/100 km (comb)
14     7.4 l/100 km (comb)
15    3.6 kg/100 km (comb)
16       0 l/100 km (comb)
17     7.8 l/100 km (comb)
dtype: object

In [100]:
#s.str.extract("(\d+\.\d+|\d+)")
#s.str.extract("(\d*\.?\d*)")
s.str.extract("(\S+)")
#\S whitespace hariç her şeyi getirdiği için ilk sıradaki sayıları kolayca alabiliriz. Çünkü white space i bu sayılar bittikten sonra
#görmeye başlayacak

Unnamed: 0,0
0,40.0
1,38.0
2,6.4
3,8.3
4,5.1
5,5.4
6,6.7
7,6.2
8,7.3
9,6.3


In [101]:
s.str.extract('(\d*.\d*).+/(\d*)')

Unnamed: 0,0,1
0,40.0,100
1,38.0,100
2,6.4,100
3,8.3,100
4,5.1,100
5,5.4,100
6,6.7,100
7,6.2,100
8,7.3,100
9,6.3,100


In [102]:
s = pd.Series(['06/2020\n\n4.9 l/100 km (comb)',
'11/2020\n\n166 g CO2/km (comb)',                                 
'10/2019\n\n5.3 l/100 km (comb)',
'05/2022\n\n6.3 l/100 km (comb)',
'07/2019\n\n128 g CO2/km (comb)',
'06/2022\n\n112 g CO2/km (comb)',                                                 
'01/2022\n\n5.8 l/100 km (comb)',
'11/2020\n\n106 g CO2/km (comb)',
'04/2019\n\n105 g CO2/km (comb)',
'08/2020\n\n133 g CO2/km (comb)',
'04/2022\n\n133 g CO2/km (comb)'])
s

0     06/2020\n\n4.9 l/100 km (comb)
1     11/2020\n\n166 g CO2/km (comb)
2     10/2019\n\n5.3 l/100 km (comb)
3     05/2022\n\n6.3 l/100 km (comb)
4     07/2019\n\n128 g CO2/km (comb)
5     06/2022\n\n112 g CO2/km (comb)
6     01/2022\n\n5.8 l/100 km (comb)
7     11/2020\n\n106 g CO2/km (comb)
8     04/2019\n\n105 g CO2/km (comb)
9     08/2020\n\n133 g CO2/km (comb)
10    04/2022\n\n133 g CO2/km (comb)
dtype: object

In [103]:
s.str.extract("(\d\d).(\d\d\d\d)")
#s.str.extract("(\d{2}).(\d{4})")
#s.str.extract("(\d*).(\d*)")
#s.str.extract("(\d\d)/(\d*)")
#s.str.extract("(\d\d)/(\d\d\d\d)")
#s.str.extract("(\d+).(\d+)")
#s.str.extract("(\S+)/(\S+)")

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022
7,11,2020
8,4,2019
9,8,2020


In [112]:
s.str.extract("(\d{2}).(\d{4})")

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022


In [113]:
s.str.extract("(\d*).(\d*)")

Unnamed: 0,0,1
0,4,9
1,166,11
2,5,3
3,6,3
4,128,7
5,112,6
6,5,8


In [104]:
s.str.extract('(\d+/\d+)\s+(\d+.\d+|\d+)')

Unnamed: 0,0,1
0,06/2020,4.9
1,11/2020,166.0
2,10/2019,5.3
3,05/2022,6.3
4,07/2019,128.0
5,06/2022,112.0
6,01/2022,5.8
7,11/2020,106.0
8,04/2019,105.0
9,08/2020,133.0


In [105]:
s = pd.Series(['\n\n4.9 06/2020 l/100 km (comb)',
'\n\n166 11/2020 g CO2/km (comb)',                                 
'\n\n5.3 10/2019 l/100 km (comb)',
'\n\n6.3 05/2022 l/100 km (comb)',
'\n\n128 07/2019 g CO2/km (comb)',
'\n\n112 06/2022 g CO2/km (comb)',                                                 
'\n\n5.8 01/2022 l/100 km (comb)'])
s

0    \n\n4.9 06/2020 l/100 km (comb)
1    \n\n166 11/2020 g CO2/km (comb)
2    \n\n5.3 10/2019 l/100 km (comb)
3    \n\n6.3 05/2022 l/100 km (comb)
4    \n\n128 07/2019 g CO2/km (comb)
5    \n\n112 06/2022 g CO2/km (comb)
6    \n\n5.8 01/2022 l/100 km (comb)
dtype: object

In [106]:
s.str.extract("\S+\s(\d+)/(\d+)")

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022


In [107]:
s.str.extract("(\d+)/(\d+)")

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022


In [108]:
text = "my email adress is example@gmail.com"

In [109]:
reg = re.search("([a-zA-Z0-9_.+-]+)@([a-zA-Z0-9_.+-]+)\.([a-zA-Z0-9_.+-]+)", text)

print(reg.group(0))
print(reg.group(1))
print(reg.group(2))
print(reg.group(3))

example@gmail.com
example
gmail
com


In [110]:
text = "/er._%+-@42f.-.Ab/"

In [111]:
reg = re.search("/[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,4}/", text)
print(reg.group(0))

/er._%+-@42f.-.Ab/
