<a href="https://colab.research.google.com/github/jaison-1920/nlp/blob/main/text_preprocessing_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic text preprocessing
## -> Tokenization
## -> Lemmatization
## -> Stemming

In [4]:
# Tokenization using NLTK
import nltk
nltk.download('punkt')  # fetch the 'punkt' tokenizer package (unzipped under tokenizers/)
from nltk.tokenize import word_tokenize,sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# Sample text: three short sentences with punctuation and a currency amount
text = 'The car is very sporty!. It is built by Ford. It costs $1555.'
word_tokenize(text)  # word-level tokens; punctuation and '$' become separate tokens

['The',
 'car',
 'is',
 'very',
 'sporty',
 '!',
 '.',
 'It',
 'is',
 'built',
 'by',
 'Ford',
 '.',
 'It',
 'costs',
 '$',
 '1555',
 '.']

In [8]:
sent_tokenize(text)  # sentence-level split of the same sample text

['The car is very sporty!.', 'It is built by Ford.', 'It costs $1555.']

In [16]:
# Three trickier inputs: abbreviations containing periods, a contraction plus
# an email address, and a number fused with its unit.
text_1 = 'I have a Ph.D in A.I.'
text_2 = 'We\'re here to help!. Contact us at jkj@lkll.com'
text_3 = 'A 5km ride  will cost $5.'

In [17]:
word_tokenize(text_1)  # NLTK keeps 'Ph.D' whole; 'A.I.' becomes 'A.I' plus a separate '.'

['I', 'have', 'a', 'Ph.D', 'in', 'A.I', '.']

In [18]:
word_tokenize(text_2)
# NLTK does not keep the email address as one token: it is split into 'jkj', '@' and 'lkll.com'

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 '.',
 'Contact',
 'us',
 'at',
 'jkj',
 '@',
 'lkll.com']

In [19]:
word_tokenize(text_3)
# NLTK treats '5km' as a single token; ideally it would be split into '5' and 'km'

['A', '5km', 'ride', 'will', 'cost', '$', '5', '.']

In [12]:
# Tokenization using spacy
import spacy

In [22]:
# Load spaCy's small English package and tokenize text_1, one token per line
nlp = spacy.load('en_core_web_sm')
doc1 = nlp(text_1)
print(*doc1, sep='\n')
# spaCy splits 'Ph.D' into 'Ph', '.', 'D' but keeps 'A.I.' as a single token

I
have
a
Ph
.
D
in
A.I.


In [23]:
# Tokenize text_2 with spaCy, one token per line
doc2 = nlp(text_2)
print(*doc2, sep='\n')
# spaCy keeps the email address 'jkj@lkll.com' as a single token

We
're
here
to
help
!
.
Contact
us
at
jkj@lkll.com


In [25]:
# Tokenize text_3 with spaCy, one token per line
doc3 = nlp(text_3)
print(*doc3, sep='\n')
# spaCy separates '5' and 'km'; the extra space in 'ride  will' shows up as its own token

A
5
km
ride
 
will
cost
$
5
.


# Diving deep into spacy

In [31]:
# 'en_core_web_sm' is spaCy's pretrained English package.
# From here on a blank English pipeline is used instead.

sent_1 = 'India is the largest Democracy in world. It has a GDP of 8%. India invests $5400 every year for defence.'
nlp1 = spacy.blank('en')
doc_1 = nlp1(sent_1)
# Show the tokens, one per line
print(*doc_1, sep='\n')

India
is
the
largest
Democracy
in
world
.
It
has
a
GDP
of
8
%
.
India
invests
$
5400
every
year
for
defence
.


In [34]:
# Quotes, a contraction and an abbreviation: the quote marks are split off,
# "Let's" becomes 'Let' + "'s", and 'N.Y.' stays whole.
sent_2 = '''"Let's go to N.Y." '''
doc_2 = nlp1(sent_2)
print(*doc_2, sep='\n')

"
Let
's
go
to
N.Y.
"


In [36]:
# Each element of a Doc is a Token object that exposes many attributes
token0 = doc_2[0]  # the opening quote mark of sent_2
type(token0)

spacy.tokens.token.Token

In [37]:
dir(token0)  # list every attribute and method available on a Token

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [39]:
# For example, take the second token of doc_2
token1=doc_2[1]
token1

Let

In [40]:
token1.is_alpha
# token1 is 'Let', which is alphabetic, so this is True

True

In [42]:
# Last token of doc_1: the final '.'
token2 = doc_1[-1]
token2

.

In [43]:
token2.is_punct  # True for the final '.'

True

In [46]:
# Seventh token from the end of doc_1: the '$' in front of 5400
token3 = doc_1[-7]
token3

$

In [47]:
token3.is_currency  # True for the '$' symbol

True

In [48]:
# Sixth token from the end of doc_1: the amount '5400'
token4 = doc_1[-6]
token4

5400

In [49]:
token4.is_digit  # True for '5400'


True

In [51]:
# Token attributes make it easy to pull email addresses out of a sentence.
# NOTE(review): `doc3` is rebound here; it previously held nlp(text_3).
string1 = 'We\'re here to help!. Contact us at jkj@lkll.com'
doc3 = nlp1(string1)
# Keep only the tokens that like_email flags as email addresses
email = [token for token in doc3 if token.like_email]

In [52]:
email  # display the collected email tokens

[jkj@lkll.com]

In [54]:
# Collect every URL-like token from a longer passage.
# NOTE(review): this rebinds `text`, which the earlier sent_tokenize(text) cell
# also reads; re-running that cell afterwards would process this passage instead.
text = '''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/,
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc4 = nlp1(text)
# Filter the tokens on the like_url flag. The matches are raw tokens: some keep
# a trailing '.', and the URL broken across two lines of the passage is only
# captured up to the line break ('http://www.science').
url = [token for token in doc4 if token.like_url]


In [55]:
url  # display the collected URL tokens

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [78]:
# Print each amount together with the currency symbol that directly follows it
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc5 = nlp1(transactions)

for token in doc5:
  next_i = token.i + 1
  # Guard the look-ahead: a number-like token at the very end of the text has
  # no successor, and indexing doc5 past its last token raises IndexError.
  if token.like_num and next_i < len(doc5) and doc5[next_i].is_currency:
    print(token, doc5[next_i])

two $
500 €


In [71]:
# doc5[2] is 'two': like_num is also True for a number written as a word
# NOTE(review): rebinds `token4`, which earlier held doc_1[-6] ('5400')
token4 = doc5[2]
token4.like_num

True