In [16]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: three short documents used for the bag-of-words demo.
text_data = np.array(
    [
        "I love Brazil. Brazil!",
        "Sweden is best",
        "Germany beats both",
    ]
)

# N-grams: unigram, bigram, trigram

In [19]:
# ngram_range=(1, 2) extracts unigrams and bigrams.
# Note: (2, 3) would give bigrams AND trigrams; use (3, 3) for trigrams only.
count = CountVectorizer(ngram_range=(1,2))
# Fit on the corpus and build the document-term count matrix (sparse).
bag_of_words = count.fit_transform(text_data)

In [20]:
bag_of_words

<3x14 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [21]:
print(bag_of_words)

  (0, 10)	1
  (0, 4)	2
  (0, 11)	1
  (0, 5)	1
  (1, 12)	1
  (1, 8)	1
  (1, 2)	1
  (1, 13)	1
  (1, 9)	1
  (2, 6)	1
  (2, 0)	1
  (2, 3)	1
  (2, 7)	1
  (2, 1)	1


In [22]:
columnss = count.get_feature_names_out()

In [23]:
columnss

array(['beats', 'beats both', 'best', 'both', 'brazil', 'brazil brazil',
       'germany', 'germany beats', 'is', 'is best', 'love', 'love brazil',
       'sweden', 'sweden is'], dtype=object)

In [24]:
bag_of_words.toarray()

array([[0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [25]:
pd.DataFrame(bag_of_words.toarray(),columns=columnss)

Unnamed: 0,beats,beats both,best,both,brazil,brazil brazil,germany,germany beats,is,is best,love,love brazil,sweden,sweden is
0,0,0,0,0,2,1,0,0,0,0,1,1,0,0
1,0,0,1,0,0,0,0,0,1,1,0,0,1,1
2,1,1,0,1,0,0,1,1,0,0,0,0,0,0


# TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Same three-document toy corpus, re-declared for the TF-IDF section.
text_data = np.array(
    [
        "I love Brazil. Brazil!",
        "Sweden is best",
        "Germany beats both",
    ]
)

In [28]:
# TfidfVectorizer with default settings.
tfidf = TfidfVectorizer()
# Fit on the corpus and build the TF-IDF weighted document-term matrix (sparse).
feature_matrix = tfidf.fit_transform(text_data)

In [29]:
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [30]:
print(feature_matrix)

  (0, 3)	0.8944271909999159
  (0, 6)	0.4472135954999579
  (1, 1)	0.5773502691896257
  (1, 5)	0.5773502691896257
  (1, 7)	0.5773502691896257
  (2, 2)	0.5773502691896257
  (2, 0)	0.5773502691896257
  (2, 4)	0.5773502691896257


In [31]:
col = tfidf.get_feature_names_out()

In [32]:
col

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [33]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [35]:
pd.DataFrame(feature_matrix.toarray(),columns=col)

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0.0,0.0,0.0,0.894427,0.0,0.0,0.447214,0.0
1,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735
2,0.57735,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0


In [36]:
# vocabulary_ maps each term to its column index in the feature matrix,
# i.e. which word each feature (column) corresponds to.
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

# Unigrams + bigrams (ngram_range=(1, 2))

In [37]:
# ngram_range=(1,2): TF-IDF features built from both unigrams and bigrams.
# NOTE: this rebinds `tfidf` and `feature_matrix`, replacing any earlier values.
tfidf = TfidfVectorizer(ngram_range=(1,2))
feature_matrix = tfidf.fit_transform(text_data)

In [38]:
print(feature_matrix)

  (0, 5)	0.3779644730092272
  (0, 11)	0.3779644730092272
  (0, 4)	0.7559289460184544
  (0, 10)	0.3779644730092272
  (1, 9)	0.4472135954999579
  (1, 13)	0.4472135954999579
  (1, 2)	0.4472135954999579
  (1, 8)	0.4472135954999579
  (1, 12)	0.4472135954999579
  (2, 1)	0.4472135954999579
  (2, 7)	0.4472135954999579
  (2, 3)	0.4472135954999579
  (2, 0)	0.4472135954999579
  (2, 6)	0.4472135954999579


In [39]:
col = tfidf.get_feature_names_out()

In [40]:
col

array(['beats', 'beats both', 'best', 'both', 'brazil', 'brazil brazil',
       'germany', 'germany beats', 'is', 'is best', 'love', 'love brazil',
       'sweden', 'sweden is'], dtype=object)

In [41]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.75592895,
        0.37796447, 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.37796447, 0.        , 0.        ],
       [0.        , 0.        , 0.4472136 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4472136 , 0.4472136 ,
        0.        , 0.        , 0.4472136 , 0.4472136 ],
       [0.4472136 , 0.4472136 , 0.        , 0.4472136 , 0.        ,
        0.        , 0.4472136 , 0.4472136 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [42]:
pd.DataFrame(feature_matrix.toarray(),columns=col)

Unnamed: 0,beats,beats both,best,both,brazil,brazil brazil,germany,germany beats,is,is best,love,love brazil,sweden,sweden is
0,0.0,0.0,0.0,0.0,0.755929,0.377964,0.0,0.0,0.0,0.0,0.377964,0.377964,0.0,0.0
1,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.447214
2,0.447214,0.447214,0.0,0.447214,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0


# RE Special Sequences

In [43]:
import re

# \A

In [46]:
# \A anchors the match at the very beginning of the string.
text, text1 = (
    "The categorical cat hello mycat",
    "categorical cat hello mycat The",
)
print(re.findall(r"\AThe", text))   # string starts with "The" -> one match
print(re.findall(r"\AThe", text1))  # "The" only at the end -> no match

['The']
[]


# \b

In [64]:
# \b is a word boundary. Placed before "cat" it keeps only the matches that
# begin a word ("categorical", "cat") and skips the one inside "mycat".
text = "categorical cat hello mycat"
re.findall(pattern=r"\bcat", string=text)

['cat', 'cat']

In [65]:
# "cat" followed by a word boundary: only occurrences that END a word match.
re.findall(r"cat\b",text)

['cat', 'cat']

In [66]:
# Boundaries on both sides: only the standalone word "cat" matches.
re.findall(r"\bcat\b",text)

['cat']

Code to find the index of the specified pattern "cat"

In [61]:
# Locate where each word-initial "cat" begins in the text.
text = "categorical cat hello mycat"

# finditer yields match objects lazily; start() is the offset of each match.
matches = re.finditer(r"\bcat", text)
indices = list(map(lambda found: found.start(), matches))

print(indices)

[0, 12]


# \B 
\B(st) - returns the pattern only if it is not at the beginning of a word

(st)\B - returns the pattern only if it is not at the end of a word

In [68]:
# \B before "zi" requires "zi" to sit inside a word rather than start one:
# the "zi" in "amazing" qualifies, the leading "zi" of "zizazo"/"zizxzd" does not.
text1 = "This is an amazing classroom, every one speaks ENGLISH zizazo"
text2 = "This is a good classroom, every one speaks ENGLISH zizxzd"
print(
    re.findall(r"\Bzi", text1),
    re.findall(r"\Bzi", text2),
    sep="\n",
)

['zi']
[]


In [72]:
# \Bclass matches "class" only when it does not start a word, so "classroom"
# is skipped and the occurrence inside "Highclass" is found.
text = "This is an amazing classroom, every one speaks ENGLISH. This is Highclass!!"
print(re.findall(r"\Bclass", text))

# Character offsets at which those matches begin.
matches = re.finditer(r"\Bclass", text)
indices = list(map(lambda found: found.start(), matches))
print(indices)

['class']
[68]


# \d 
- returns the digits 

In [76]:
# \d matches one decimal digit at a time.
# The pattern is a raw string: "\d" in a plain literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
text = "This is an 9mazing 5classroom, every one speaks ENGLISH 123.    CLASSY !!"
re.findall(r"\d", text)

['9', '5', '1', '2', '3']

In [77]:
# \d+ matches runs of consecutive digits, so "123" comes back as one item.
# Raw string avoids the invalid "\d" escape warning of a plain literal.
text = "This is an 9mazing 5classroom, every one speaks ENGLISH 123.    CLASSY !!"
re.findall(r"\d+", text)

['9', '5', '123']

# \D
- returns the non digits

In [86]:
# \D matches every single character that is NOT a digit.
# Raw string avoids the invalid "\D" escape warning of a plain literal.
text = "This is an 2 amazing classroom, 4 every one speaks ENGLISH 123. classy !!"
print(re.findall(r"\D", text))

['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', 'n', ' ', ' ', 'a', 'm', 'a', 'z', 'i', 'n', 'g', ' ', 'c', 'l', 'a', 's', 's', 'r', 'o', 'o', 'm', ',', ' ', ' ', 'e', 'v', 'e', 'r', 'y', ' ', 'o', 'n', 'e', ' ', 's', 'p', 'e', 'a', 'k', 's', ' ', 'E', 'N', 'G', 'L', 'I', 'S', 'H', ' ', '.', ' ', 'c', 'l', 'a', 's', 's', 'y', ' ', '!', '!']


In [96]:
# \D+ groups consecutive non-digit characters into runs.
# Raw strings avoid the invalid "\D" escape warning of a plain literal.
print(re.findall(r"\D+", text))

# Joining the runs rebuilds the text with every digit removed.
a = re.findall(r"\D+", text)
"".join(a)

['This is an amazing classroom, every one speaks ENGLISH ', ', classy !!']


'This is an amazing classroom, every one speaks ENGLISH , classy !!'

# \s
- returns all the whitespace characters (spaces, tabs, newlines, ...)

In [80]:
# \s matches a single whitespace character.
# Raw string avoids the invalid "\s" escape warning of a plain literal.
text = "This is an amazing classroom, every one speaks ENGLISH 123, classy !!"
print(re.findall(r"\s", text))

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [81]:
# Number of whitespace characters in the text (raw string: "\s" in a plain
# literal is an invalid escape sequence).
print(len(re.findall(r"\s", text)))

11


In [83]:
# Counts literal spaces only. The total equals the \s count here because this
# text's only whitespace is spaces; \s would also match tabs and newlines.
print(len(re.findall(" ",text)))

11


# \S
- returns all the characters that are not whitespace

In [85]:
# \S matches every single character that is NOT whitespace.
# Raw strings avoid the invalid "\S" escape warning of a plain literal.
text = "This is an amazing classroom, every one speaks ENGLISH 123, classy !!"
print(re.findall(r"\S", text))
print(len(re.findall(r"\S", text)))

['T', 'h', 'i', 's', 'i', 's', 'a', 'n', 'a', 'm', 'a', 'z', 'i', 'n', 'g', 'c', 'l', 'a', 's', 's', 'r', 'o', 'o', 'm', ',', 'e', 'v', 'e', 'r', 'y', 'o', 'n', 'e', 's', 'p', 'e', 'a', 'k', 's', 'E', 'N', 'G', 'L', 'I', 'S', 'H', '1', '2', '3', ',', 'c', 'l', 'a', 's', 's', 'y', '!', '!']
58


# \w
- returns word characters: letters, digits and the underscore
- "." "," "!" and white spaces are not included

In [97]:
# \w matches a single word character (letter, digit or underscore).
# Raw strings avoid the invalid "\w" / "\W" escape warnings of plain literals.
text = "This is an amazing classroom, every one speaks ENGLISH 123, classy !!"
a = re.findall(r"\w", text)
print(a)
# Joined, the matches give the text stripped of spaces and punctuation.
print("".join(a))
# NOTE: uppercase \W here - this counts the characters that \w did NOT match.
print(len(re.findall(r"\W", text)))

['T', 'h', 'i', 's', 'i', 's', 'a', 'n', 'a', 'm', 'a', 'z', 'i', 'n', 'g', 'c', 'l', 'a', 's', 's', 'r', 'o', 'o', 'm', 'e', 'v', 'e', 'r', 'y', 'o', 'n', 'e', 's', 'p', 'e', 'a', 'k', 's', 'E', 'N', 'G', 'L', 'I', 'S', 'H', '1', '2', '3', 'c', 'l', 'a', 's', 's', 'y']
ThisisanamazingclassroomeveryonespeaksENGLISH123classy
15


# \W 
- returns the non-word characters, e.g. "." "," "!" and white spaces

In [98]:
# \W matches every single character that is NOT a word character
# (here: spaces, commas and exclamation marks).
# Raw strings avoid the invalid "\W" escape warning of a plain literal.
text = "This is an amazing classroom, every one speaks ENGLISH 123, classy !!"
print(re.findall(r"\W", text))
print(len(re.findall(r"\W", text)))

[' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', ' ', ',', ' ', ' ', '!', '!']
15


# \Z
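- returns the pattern (the word classy in this example) only if it is at the very end of the string
- sees the whole text as one string (matches at the end of the string, not at the end of each line)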

In [102]:
# \Z anchors the match at the very end of the string.
# Raw strings avoid the invalid "\Z" escape warning of a plain literal.
text = "This is an amazing class_room, every one speaks ENGLISH 123, classy"
print(re.findall(r"classy\Z", text))
print(len(re.findall(r"classy\Z", text)))

['classy']
1


# [m-q]
- returns every lowercase letter in the range m to q (m, n, o, p, q)

In [114]:
# [m-q] matches one lowercase letter between m and q per hit.
text = "This is an amazing class_room, every one speaks ENGLISH 123 q. classy"
print(
    re.findall("[m-q]", text),
    len(re.findall("[m-q]", text)),
    sep="\n",
)

['n', 'm', 'n', 'o', 'o', 'm', 'o', 'n', 'p', 'q']
10


In [115]:
# [m-q]+ groups adjacent letters from the m-q range into a single match.
text = "This is an amazing class_room, every one speaks ENGLISH 123 q. classy"
print(
    re.findall("[m-q]+", text),
    len(re.findall("[m-q]+", text)),
    sep="\n",
)

['n', 'm', 'n', 'oom', 'on', 'p', 'q']
7


In [112]:
# Character ranges are case-sensitive: [M-Q] only sees the uppercase letters.
text = "This is an amazing class_room, every one speaks ENGLISH 123 q. classy"
print(
    re.findall("[M-Q]", text),
    len(re.findall("[M-Q]", text)),
    sep="\n",
)

['N']
1


# . (dot)
- . matches any single character (except a newline)
- c...s --> class, cavbs, ceees


In [117]:
# "c...s": a "c", then any three characters, then an "s".
text = "This is an amazing class_room, every one speaks ENGLISH 123. classy c  1s"
print(
    re.findall("c...s", text),
    len(re.findall("c...s", text)),
    sep="\n",
)

# Character offsets at which each match begins.
matches = re.finditer("c...s", text)
indices = list(map(lambda found: found.start(), matches))
print(indices)

['class', 'class', 'c  1s']
3
[19, 61, 68]


In [110]:
# Same wildcard idea, but the match must end in "sy": "c", any three characters, "sy".
print(re.findall("c...sy",text))

['classy']


# ^

In [119]:
# ^ anchors the pattern at the start of the string; this text begins with "Th".
text = "This is an amazing class_room, every one speaks ENGLISH 123. classy The c  1s"
print(
    re.findall("^Th", text),
    len(re.findall("^Th", text)),
    sep="\n",
)

['Th']
1


In [120]:
# Here the string starts with "Ta", so ^Th finds nothing even though "The"
# appears later in the text.
text = "Tat is an amazing class_room, every one speaks ENGLISH 123. classy The c  1s"
print(
    re.findall("^Th", text),
    len(re.findall("^Th", text)),
    sep="\n",
)

[]
0


In [121]:
# Alternation of two anchored patterns: the string must start with "A" or "T".
text = "This is an amazing class_room, every one speaks ENGLISH 123. classy The c  1s"
print(
    re.findall("^A|^T", text),
    len(re.findall("^A|^T", text)),
    sep="\n",
)

['T']
1


 # [+]

In [124]:
# Inside a character class "+" is a literal plus sign, not a quantifier.
text = "This is an amazing++ class_room, @every #one *speaks !ENGLISH |123. classy The c +/ 1s"
print(
    re.findall("[+]", text),
    len(re.findall("[+]", text)),
    sep="\n",
)

['+', '+', '+']
3


In [125]:
# "[+]+" groups consecutive plus signs into one match.
text = "This is an amazing++ class_room, @every #one *speaks !ENGLISH |123. classy The c +/ 1s"
print(
    re.findall("[+]+", text),
    # NOTE: the count uses "[+]" (individual plus signs), not "[+]+" (runs).
    len(re.findall("[+]", text)),
    sep="\n",
)

['++', '+']
3


In [135]:
# Raw string: the text starts with a literal backslash, and "\y" in a plain
# literal is an invalid escape sequence (DeprecationWarning, SyntaxWarning on
# Python 3.12+). The string value is unchanged.
text = r'\yes 1:30 no 12:30 maybe 1:5 test me 12:5'
# \d?\d accepts one or two digits; \d\d requires exactly two.
print(1, re.findall(r'(\d?\d):(\d?\d)', text))  # 1-2 digits on both sides
print(2, re.findall(r'(\d\d):(\d\d)', text))    # exactly 2 digits on both sides
print(3, re.findall(r'(\d?\d):(\d\d)', text))   # 1-2 digits : exactly 2 digits
print(4, re.findall(r'(\d\d):(\d?\d)', text))   # exactly 2 digits : 1-2 digits

1 [('1', '30'), ('12', '30'), ('1', '5'), ('12', '5')]
2 [('12', '30')]
3 [('1', '30'), ('12', '30')]
4 [('12', '30'), ('12', '5')]


In [136]:
# Extract every run of digits from the text.
text = "hello 12 hi 89. Howdy 34"

# Raw string: "\d" in a plain literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+).
pattern = r"\d+"
result = re.findall(pattern, text)
print(result)

['12', '89', '34']


In [140]:
# Remove all whitespace by replacing every whitespace run with "".
text = "abc 12 de 23        f45 6"
# Raw string: "\s" in a plain literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+).
pattern = r"\s+"
replace = ""
new_text = re.sub(pattern, replace, text)
print(new_text)

abc12de23f456
