**NeatText** is a simple NLP package for cleaning 🧹 textual data and text processing ✨.

In [21]:
#pip install neattext

In [49]:
import neattext as nt 
mytext = "Are you lost? You can visit our website https://perdu.com or contact us by email perdu@mail.com 😇."
docx = nt.TextFrame(text=mytext)

In [50]:
docx.describe()

Key      Value          
Length  : 98             
vowels  : 30             
consonants: 44             
stopwords: 8              
punctuations: 8              
special_char: 8              
tokens(whitespace): 16             
tokens(words): 20             


#### Basic NLP tasks

In [51]:
docx.word_tokens()

['Are',
 'you',
 'lost',
 'You',
 'can',
 'visit',
 'our',
 'website',
 'httpsperducom',
 'or',
 'contact',
 'us',
 'by',
 'email',
 'perdumailcom',
 '😇']

In [52]:
docx.sent_tokens()

[['Are you lost', ' You can visit our website https://perdu'],
 'com or contact us by email perdu@mail',
 'com 😇',
 '']

In [53]:
docx.term_freq()

{'Are': 1.0,
 'lost?': 1.0,
 'You': 1.0,
 'visit': 1.0,
 'website': 1.0,
 'https://perdu.com': 1.0,
 'contact': 1.0,
 'email': 1.0,
 'perdu@mail.com': 1.0,
 '😇.': 1.0}

In [54]:
docx.bow()

Counter({'Are': 1,
         'you': 1,
         'lost': 1,
         'You': 1,
         'can': 1,
         'visit': 1,
         'our': 1,
         'website': 1,
         'https': 1,
         'perdu': 2,
         'com': 2,
         'or': 1,
         'contact': 1,
         'us': 1,
         'by': 1,
         'email': 1,
         'mail': 1})

#### Basic Text Preprocessing

In [57]:
docx.normalize()

'are you lost? you can visit our website https://perdu.com or contact us by email perdu@mail.com 😇.'

You can add the "level" parameter inside of the normalize function which controls the level of normalization. It is set to shallow by default.

In [58]:
docx.normalize(level='deep') # shallow + removing puncts, emojis, bad commas etc

'are you lost you can visit our website httpsperducom or contact us by email perdumailcom '

You can also precise what you would exactly want to remove thanks to the functions below:

In [61]:
#docx.remove_puncts()
#docx.remove_stopwords()
#docx.remove_html_tags()
#docx.remove_special_characters()
#docx.remove_emojis()
#docx.fix_contractions()

#### Clean Text

In [77]:
from neattext.functions import clean_text
mytext = "Are you lost? You can visit our website https://perdu.com or contact us by email perdu@mail.com 😇."
clean_text(mytext)

'lost? visit website https://perdu.com contact email perdu@mail.com .'

By default, stopwords, emojis and multiple whitespaces will be removed. But you can choose what to or not to remove by setting parameters to True or False.

In [78]:
clean_text(mytext, puncts=True, emails=True, emojis=False)

'lost visit website https://perducom contact email 😇'

#### Replace emails or (phone) numbers

In [None]:
#docx.replace_emails()
#docx.replace_numbers()
#docx.replace_phone_numbers()

In [81]:
docx.replace_emails()

'lost visit website httpsperducom contact email perdumailcom '

#### Text extractor

In [87]:
from neattext import TextExtractor
docx = TextExtractor()
docx.text = "Are you lost? You can visit our website https://perdu.com or contact us by email perdu@mail.com 😇."

In [88]:
# example 1
docx.extract_emails()

['perdu@mail.com']

In [86]:
# example 2
docx.extract_emojis()

['😇']

#### TextMetrics

Helps finding words stats such as counts of vowels, consonants, stopwords etc.

In [89]:
from neattext import TextMetrics
docx = TextMetrics()
docx.text = "Are you lost? You can visit our website https://perdu.com or contact us by email perdu@mail.com 😇."

In [90]:
# example 1
docx.count_stopwords()

Counter({'are': 1, 'you': 2, 'can': 1, 'our': 1, 'or': 1, 'us': 1, 'by': 1})

In [94]:
# example 2
docx.count_vowels()

{'a': 5, 'e': 6, 'i': 5, 'o': 8, 'u': 6}

#### Explainer

In [99]:
from neattext.explainer import emojify
emojify('Smile')

'😄'

In [107]:
from neattext.explainer import emoji_explainer
emoji_explainer('😇')

'SMILING FACE WITH HALO'

In [112]:
from neattext.explainer import unicode_2_emoji
unicode_2_emoji('0x1f49b')

'💛'