In [27]:
import re
import numpy as np
import spacy
from spacy.lang.en.examples import sentences
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('gutenberg')
from nltk.corpus import gutenberg
from nltk.corpus import wordnet


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
text = "ABC def 123 ;:-_!😇"

In [3]:
re.search("ABC", text).span()

(0, 3)

In [5]:
re.findall('\d', text)

['1', '2', '3']

In [8]:
re.findall('\d+', 'Tel: 0431-234234')

['0431', '234234']

In [11]:
re.sub('\W', '', text) # find and replace all alpha-numerics by '' which is nothing hehe :D 

'ABCdef123_'

In [12]:
re.findall('abc', text)

[]

In [13]:
re.findall('\W', text) # find all alpha-numerics

[' ', ' ', ' ', ';', ':', '-', '!', '😇']

In [16]:
text2 = 'Email: jannikp@gmail.com, Mobil: 0151 123456'       # a text


In [17]:
re.search(r'\S+@\w+\.(com|de)', text2)      # search E-Mail addresses


<re.Match object; span=(7, 24), match='jannikp@gmail.com'>

In [18]:
def remove_special_characters(text):
    """Remove all characters that are not alphanumeric or space"""
    pattern = r'[\W\s]'
    text = re.sub(pattern, '', text)
    return text

In [19]:
remove_special_characters("ABCDEFG abcdefg;:-.'123#@!😇")


'ABCDEFGabcdefg123'

In [1]:
text = """The fool does think he is wise, but the wise man knows himself to be a fool."""


In [5]:
text2 = re.sub(",", "", text)
ls = nltk.stem.LancasterStemmer()
' '.join([ls.stem(word) for word in text2.split(" ")])

'the fool doe think he is wis but the wis man know himself to be a fool.'

In [6]:
ls = nltk.stem.PorterStemmer()
' '.join([ls.stem(word) for word in text.split(" ")])

'the fool doe think he is wise, but the wise man know himself to be a fool.'

### Lemmatizing

In [7]:
# for spacy you need to download packages. This is English. https://spacy.io/models/en/
from spacy.lang.en.examples import sentences


In [8]:
nlp = spacy.load('en_core_web_sm')     
text = nlp(text)
' '.join([word.lemma_ for word in text])

'the fool do think he be wise , but the wise man know himself to be a fool .'

### Nomalize document


In [10]:
wpt = nltk.WordPunctTokenizer()

In [13]:
stop_words = nltk.corpus.stopwords.words('english')

In [14]:
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-z0-9\s]', '', doc)    # replacing certain patterns or removing them
    #doc = re.sub('[^a-zA-z0-9\\s]', '', doc)    # without the r we need to use a double dash

    doc = doc.lower()                           # lower casing
    doc = doc.strip()                           # stripping is to remove white cases in the beginning or the end
    tokens = wpt.tokenize(doc)                  # using a Tokenizer I have defined earlier (WordPunctTokenizer)
    filtered_tokens = [token for token in tokens if token not in stop_words] # take the tokens which are not in stop_words
    doc = ' '.join(filtered_tokens)
    doc = ' '.join([word.lemma_ for word in nlp(doc)])
    return doc

In [15]:
texts = ['The fool does think he is wise.',
         'The wise man knows himself to be a fool.']

In [23]:
normalize_document(texts[0]) # it selects the first element index 0. But it can't run on an entire list, so it errors.

'fool think wise'

In [20]:
normalize_corpus = np.vectorize(normalize_document)

In [21]:
normalize_corpus(texts)

array(['fool think wise', 'wise man know fool'], dtype='<U18')

### Test

In [29]:
gutenberg.fileids()


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [30]:
moby = gutenberg.raw('melville-moby_dick.txt')   # Document

In [31]:
moby_corpus = nltk.sent_tokenize(moby)           # Tokenizing the entire book into sentences, to have a list of separate sentences

In [32]:
moby[0:100]

'[Moby Dick by Herman Melville 1851]\r\n\r\n\r\nETYMOLOGY.\r\n\r\n(Supplied by a Late Consumptive Usher to a Gr'

In [33]:
normalize_document(moby[0:100])

'[ moby dick herman melville 1851 ] etymology supply late consumptive usher gr'

In [34]:
normalize_corpus(moby_corpus[0:10])

array(['[ moby dick herman melville 1851 ] etymology',
       'supply late consumptive usher grammar school pale usherthreadbare coat heart body brain see',
       'ever dust old lexicon grammar queer handkerchief mockingly embellish gay flag know nation world',
       'love dust old grammar somehow mildly remind mortality',
       'take hand school other teach name whalefish call tongue leave ignorance letter h almost alone maketh signification word deliver true',
       'hackluyt whale', 'sw dan', 'hval',
       'animal name roundness roll dan', 'hvalt arch vault'],
      dtype='<U132')