<a href="https://colab.research.google.com/github/jaison-1920/nlp/blob/main/Text_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Preprocessing:-
##  1) Text Cleaning
#### -> lowercasing,emoji handling,remove html and urls,
#### -> punctuation etc..

## 2) Basic preprocessing
#### -> Tokenization, Lemmatization, stemming

## 3) Advanced preprocessing
#### -> POS, NER, parsing etc..

==============================================================

Lowercasing

In [34]:
#text cleaning
# lowercasing

text = "The curious fox scampered through the Tallgrass Prairie, a gentle breeze rustling the golden blades."
# the text contains upper and lower case letters. So it need to lowercase completely

In [35]:
text = text.lower()
text

'the curious fox scampered through the tallgrass prairie, a gentle breeze rustling the golden blades.'

In [36]:
import pandas as pd

In [37]:
# but in a dataset there will be multiple rows
url = 'https://raw.githubusercontent.com/jaison-1920/nlp/main/sample_data.csv'
df = pd.read_csv(url)
df

Unnamed: 0.1,Unnamed: 0,text
0,0,ID123: Breaking news! A rare albino fox was sp...
1,1,ABC456: Feeling creative? Check out this insp...
2,2,789XYZ: Looking for a delicious and healthy w...
3,3,01A2B3: Did you know that honeybees are essent...
4,4,DEFGHI: Embark on a virtual adventure through ...


In [38]:
df['text'][3]

"01A2B3: Did you know that honeybees are essential for our ecosystem? Learn more about these fascinating creatures and how we can protect them. <p>This informative website provides resources for beekeeping enthusiasts and anyone interested in conservation.  Let's save the bees!  The Bee Conservancy: https://www.example.com <i>**#savethebees**</i></p>"

In [39]:
df['text'][1]

"ABC456:  Feeling creative? Check out this inspiring article on abstract expressionism!  <p>The author explores the techniques and motivations behind this influential art movement. Don't miss it! Abstract Expressionism: A Beginner's Guide: https://www.example.com/random-9763VFtpbst8gBD6mV7XPidaTA;vQw.GWGj?jKl,uqAwMMp </p>"

In [40]:
#to convert all the rows in 'text' column:
df['text'] = df['text'].str.lower()


In [41]:
df

Unnamed: 0.1,Unnamed: 0,text
0,0,id123: breaking news! a rare albino fox was sp...
1,1,abc456: feeling creative? check out this insp...
2,2,789xyz: looking for a delicious and healthy w...
3,3,01a2b3: did you know that honeybees are essent...
4,4,defghi: embark on a virtual adventure through ...


In [42]:
df['text'][4]

'defghi: embark on a virtual adventure through the great barrier reef! this website allows you to explore the wonders of this underwater paradise from the comfort of your home. dive in and discover the incredible marine life! explore the great barrier reef: https://www.example.com'

Removing html and urls

In [43]:
import re

In [44]:
# we can remove html and urls using regex
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub('',text)

def remove_url(text):
  pattern = re.compile('(https://\S+|www.\S+)')
  return pattern.sub('',text)

In [45]:
df['text'] = df['text'].apply(remove_html_tags)

In [46]:
df['text'][1]

"abc456:  feeling creative? check out this inspiring article on abstract expressionism!  the author explores the techniques and motivations behind this influential art movement. don't miss it! abstract expressionism: a beginner's guide: https://www.example.com/random-9763vftpbst8gbd6mv7xpidata;vqw.gwgj?jkl,uqawmmp "

In [47]:
df['text'] = df['text'].apply(remove_url)
df['text'][1]

"abc456:  feeling creative? check out this inspiring article on abstract expressionism!  the author explores the techniques and motivations behind this influential art movement. don't miss it! abstract expressionism: a beginner's guide:  "

# Spelling correction

In [48]:
# we can do the spelling correction through textblob
from textblob import TextBlob

In [49]:
incorrect_text = "The Certanity in teh ntebook is goood"
text_blob = TextBlob(incorrect_text)
text_blob.correct().string

'The Certainty in the notebook is good'

# Removing stopwords

In [50]:
# stopwords are the words which does not contrbute any meaning in the sentence.
# eg: a,the, are, is were etc...
# we can import stopwords in any language
import nltk
nltk.download('stopwords')
stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [51]:
stopwords.words('spanish')

['de',
 'la',
 'que',
 'el',
 'en',
 'y',
 'a',
 'los',
 'del',
 'se',
 'las',
 'por',
 'un',
 'para',
 'con',
 'no',
 'una',
 'su',
 'al',
 'lo',
 'como',
 'más',
 'pero',
 'sus',
 'le',
 'ya',
 'o',
 'este',
 'sí',
 'porque',
 'esta',
 'entre',
 'cuando',
 'muy',
 'sin',
 'sobre',
 'también',
 'me',
 'hasta',
 'hay',
 'donde',
 'quien',
 'desde',
 'todo',
 'nos',
 'durante',
 'todos',
 'uno',
 'les',
 'ni',
 'contra',
 'otros',
 'ese',
 'eso',
 'ante',
 'ellos',
 'e',
 'esto',
 'mí',
 'antes',
 'algunos',
 'qué',
 'unos',
 'yo',
 'otro',
 'otras',
 'otra',
 'él',
 'tanto',
 'esa',
 'estos',
 'mucho',
 'quienes',
 'nada',
 'muchos',
 'cual',
 'poco',
 'ella',
 'estar',
 'estas',
 'algunas',
 'algo',
 'nosotros',
 'mi',
 'mis',
 'tú',
 'te',
 'ti',
 'tu',
 'tus',
 'ellas',
 'nosotras',
 'vosotros',
 'vosotras',
 'os',
 'mío',
 'mía',
 'míos',
 'mías',
 'tuyo',
 'tuya',
 'tuyos',
 'tuyas',
 'suyo',
 'suya',
 'suyos',
 'suyas',
 'nuestro',
 'nuestra',
 'nuestros',
 'nuestras',
 'vuestro'

In [52]:
def remove_stop_words(text):
  new_text = []
  for words in text.split():
    if words in stopwords.words('english'):
      new_text.append(' ')
    else:
      new_text.append(words)
  return new_text

In [53]:
text = df['text'][1]
text

"abc456:  feeling creative? check out this inspiring article on abstract expressionism!  the author explores the techniques and motivations behind this influential art movement. don't miss it! abstract expressionism: a beginner's guide:  "

In [55]:
abc = remove_stop_words(text)
a = ''.join(abc)


In [56]:
a

"abc456:feelingcreative?check  inspiringarticle abstractexpressionism! authorexplores techniques motivationsbehind influentialartmovement. missit!abstractexpressionism: beginner'sguide:"

# Treating Emoji

In [58]:
# treating emoji with emoji.demojizer function
!pip install emoji

Collecting emoji
  Downloading emoji-2.10.1-py2.py3-none-any.whl (421 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.10.1


In [61]:
import emoji
text = "I'm feeling 😊 today because the weather is beautiful!"
print(emoji.demojize(text))

I'm feeling :smiling_face_with_smiling_eyes: today because the weather is beautiful!


In [62]:
print(emoji.demojize("Just finished a great workout session 💪. Feeling energized and ready to conquer the day!"))

Just finished a great workout session :flexed_biceps:. Feeling energized and ready to conquer the day!
