# Removing HTML Tags

In [6]:
# imports
from bs4 import BeautifulSoup

# function to remove HTML tags
def remove_html_tags(text):
    """Return the plain text of ``text`` with all HTML markup stripped.

    The markup is parsed with BeautifulSoup's built-in 'html.parser'
    backend and only the text content of the parsed tree is returned.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# call function
remove_html_tags("""<html><h1>Article Heading</h1> <p>First sentence of some important article. And another one. And then the last one</p></html>""")

'Article Heading First sentence of some important article. And another one. And then the last one'

# Removing Accented Characters

In [7]:
# imports
import unicodedata

# function to remove accented characters
def remove_accented_chars(text):
    """Return ``text`` with accented letters replaced by their ASCII base letters.

    Characters that still have no ASCII representation after decomposition
    are dropped from the result.
    """
    # NFKD splits an accented letter into its base letter plus combining marks
    decomposed = unicodedata.normalize('NFKD', text)
    # encoding to ASCII with 'ignore' silently discards the combining marks
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# call function
remove_accented_chars('Sómě Áccěntěd těxt. Some words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.')

'Some Accented text. Some words such as resume, cafe, protest, divorce, coordinate, expose, latte.'

# Expanding Contractions

In [None]:
# install library
!pip install contractions

In [12]:
# imports
import contractions

# expand the contractions in a sample sentence; the returned string is displayed as the cell output
contractions.fix("Y'all i'd contractions you're expanded don't think. I'll do it right away")

'you all I would contractions you are expanded do not think. I will do it right away'

# Removing Special Characters

In [14]:
# imports
import re

# function to remove special characters
def remove_special_characters(text):
    """Return ``text`` with every character outside a fixed whitelist removed.

    Kept characters: ASCII letters, digits, whitespace and the punctuation
    marks ``. , ! ? / : ; " '``. Everything else is deleted.
    """
    # Negated character class: matches the characters to REMOVE.
    # The letter range must be 'A-Z'; 'A-z' would also span [ \ ] ^ _ ` and
    # let those special characters through.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)
 
# call function
remove_special_characters("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

'007 Not sure if this  was fun! 558923 What do you think of it.? 500USD!'

# Removing Numbers

In [15]:
# imports
import re

# function to remove numbers
def remove_numbers(text):
    """Return ``text`` with digits removed.

    The removal is whitelist-based: only ASCII letters, whitespace and the
    punctuation marks ``. , ! ? / : ; " '`` are kept, so other special
    characters (e.g. ``@ # $ %``) are removed together with the digits.
    """
    # Negated character class: matches the characters to REMOVE.
    # The letter range must be 'A-Z'; 'A-z' would also span [ \ ] ^ _ ` and
    # let those special characters through.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)
 
# call function
remove_numbers("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

' Not sure if this  was fun!  What do you think of it.? USD!'

# Removing Punctuation

In [16]:
# imports
import string

# function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = []
    for ch in text:
        # skip ASCII punctuation, keep everything else untouched
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# call function
remove_punctuation('Article: @First sentence of some, {important} article having lot of ~ punctuations. And another one;!')

'Article First sentence of some important article having lot of  punctuations And another one'

# Stemming
Stemming is the process of reducing inflected or derived words to their word stem, base, or root form.
The stem need not be identical to the original word.
Stemmers mainly rely on chopping off suffixes such as ‘s’, ‘es’, ‘ed’, ‘ing’, and ‘ly’ from the ends of words.

In [21]:
# imports
import nltk

# function for stemming
def get_stem(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    Words are split on whitespace and re-joined with single spaces, so
    leading/trailing and repeated whitespace is collapsed in the result.
    """
    # Use the documented nltk.stem path; 'nltk.porter' is only reachable as a
    # side effect of a star-import inside the nltk package.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

# call function
get_stem("we are eating and swimming ; we have been eating and swimming ; he eats and swims ; he ate and swam ")

'we are eat and swim ; we have been eat and swim ; he eat and swim ; he ate and swam'

# Lemmatization
Both stemming and lemmatization generate the root form of inflected/derived words, but lemmatization is the more advanced technique: it maps each word to a valid dictionary form (its lemma) instead of merely chopping off suffixes.

In [19]:
# imports
import spacy

# init
# load the English pipeline once at module level; get_lem reuses this object on every call
# NOTE(review): the 'en' shortcut and the parse/tag/entity keyword arguments look like legacy spaCy usage — confirm they are supported by the installed version
nlp = spacy.load('en',parse=True,tag=True, entity=True)

# function for lemmatization
def get_lem(text):
    """Return ``text`` with every token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level ``nlp`` pipeline.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        # a lemma of '-PRON-' is a placeholder rather than a real word
        # (presumably the pipeline's pronoun marker), so keep the token's own text
        if token.lemma_ != '-PRON-':
            lemmas.append(token.lemma_)
        else:
            lemmas.append(token.text)
    return ' '.join(lemmas)
# call function
get_lem("we are eating and swimming ; we have been eating and swimming ; he eats and swims ; he ate and swam ; he eats a lot ")

'we be eat and swim ; we have be eat and swim ; he eat and swim ; he eat and swam ; he eat a lot'

# Removing Stopwords

In [30]:
# imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# init
# tokenizer models used by word_tokenize
nltk.download('punkt')
# stop-word corpus read by stopwords.words(); without it a fresh environment raises LookupError
nltk.download('stopwords')
# a set gives constant-time membership checks in remove_stopwords
stop_words = set(stopwords.words('english'))

# functions
def remove_stopwords(text):
    """Tokenize ``text`` and return the list of tokens that are not stop words.

    The comparison against ``stop_words`` is case-insensitive; the returned
    tokens keep their original casing.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# call function
text = remove_stopwords("i am myself you the stopwords list and this article is not should removed")
print(text)

text = remove_stopwords("This is a sample sentence, showing off the stop words filtration.")
print(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['stopwords', 'list', 'article', 'removed']
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# Removing extra whitespaces and tabs

In [31]:
# imports
import re

# function to remove extra whitespace and tabs
def remove_extra_whitespace_tabs(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Tabs and newlines count as whitespace. Leading and trailing whitespace
    is removed from the result.
    """
    # One pattern covers all runs; a separate '^\s*' alternative is not needed
    # because strip() already removes whatever remains at the edges.
    pattern = r'\s+'
    return re.sub(pattern, ' ', text).strip()

# call function
remove_extra_whitespace_tabs('  This web line  has \t some extra  \t   tabs and whitespaces  ')

'This web line has some extra tabs and whitespaces'

# Lowercase

In [32]:
# function to convert text to lowercase
def to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

# call function
to_lowercase('ConVert THIS string to LOWER cASe.')

'convert this string to lower case.'