# Data, Machines and the 🐍 
<img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/lessons/dmap/text/text-normal/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson 1•2•3
Each lesson will start with a similar template (given in the course schedule):  
1. **save** a copy to your Google Drive (Copy to Drive)<br/><img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/copy-to-drive.png"/>
2. **update** the NET_ID to be your netID (no need to include @illinois.edu)
3. **run** the next cell to install the IDE. <img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/play-button.png"/>

In [0]:
LESSON_ID = 'dmap:text:text-normal'   # keep this as is
NET_ID    = 'CHANGE_ME' # CHANGE_ME to your netID (keep the quotes)

def install_ide(net_id, lesson_id):
  """Install the codestories package when needed and build the lesson's CodeStory.

  Args:
      net_id: passed through as the first argument to CodeStory.
      lesson_id: passed through as the second argument to CodeStory.

  Returns:
      The object returned by ``CodeStory(net_id, lesson_id)``.
  """
  import sys
  # sys.modules only lists modules already imported in this session, so the
  # install step runs whenever codestories has not been imported yet.
  if 'codestories' not in sys.modules:
      print('installing modules')
      # Notebook shell escape (not plain Python); pip's output is redirected
      # to install.log instead of being shown in the cell.
      !pip install git+https://mehaberman@bitbucket.org/mehaberman/codestories.git --upgrade &> install.log
  
  from codestories.cs.CodeStories import CodeStory
  return CodeStory(net_id, lesson_id)

ide = install_ide(NET_ID, LESSON_ID)
print(ide.welcome())

# Lesson Text Normalization
(hit ▶ to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

In [0]:
import LessonUtil as Util

def tokenize(text, allow_numbers=False):
    """Split *text* into word tokens using a regular expression.

    A token is a run of letters and apostrophes, optionally joined by a
    single internal hyphen (e.g. ``well-known``, ``don't``).  The pattern
    needs at least one character on each side of the optional hyphen, so
    every token is at least two characters long: one-letter words such as
    ``a`` or ``I`` are not returned.

    Args:
        text: the string to tokenize.
        allow_numbers: when True, digits count as word characters as well.
            The default (False) excludes numbers.

    Returns:
        A list of the matched tokens, in order of appearance.
    """
    import re

    # The two variants differ only in whether digits are word characters.
    word_chars = "A-Za-z0-9" if allow_numbers else "A-Za-z"
    pattern = r"['{0}]+-?[{0}']+".format(word_chars)
    return re.findall(pattern, text)

def normalize(words):
    """Lower-case every word and trim apostrophes from both of its ends.

    Apostrophes inside a word (``don't``) are kept; only leading and
    trailing ones are removed.

    Args:
        words: an iterable of strings.

    Returns:
        A new list with one cleaned string per input word, in input order.
    """
    cleaned = []
    for word in words:
        lowered = word.lower()
        cleaned.append(lowered.strip("'"))
    return cleaned

def normalization_demo():
    """Print how the word counts of harryPotter.txt shrink with each step.

    Six numbers are printed, one per line:
      1. tokens from a plain whitespace split
      2. tokens from the regular-expression tokenizer
      3. distinct regex tokens
      4. distinct tokens after normalize()
      5. distinct normalized tokens longer than two characters
      6. distinct tokens from step 5 that are not NLTK English stopwords

    Downloads the NLTK 'stopwords' corpus as a side effect.
    """
    import nltk
    from nltk.corpus import stopwords

    path = Util.path_for_data('harryPotter.txt')

    # Read everything up front so the file is closed before processing starts.
    with open(path, 'r') as fd:
        text = fd.read()

    # the most basic way to tokenize
    raw = text.split()

    # use a regular expression to tokenize
    words = tokenize(text)
    normalized = normalize(words)

    uniq = set(words)
    uniq_norm = set(normalized)
    uniq_norm_min = {w for w in uniq_norm if len(w) > 2}

    nltk.download('stopwords')
    # A set gives constant-time membership tests for the filter below.
    stop = set(stopwords.words('english'))

    # The words are already lower-case here, so no further case folding is needed.
    uniq_no_stop = {w for w in uniq_norm_min if w not in stop}

    # some basic counts of the different techniques
    print(len(raw))    # 78706
    print(len(words))  # 75529 with numbers; 75277 w/out
    print(len(uniq))
    print(len(uniq_norm))
    print(len(uniq_norm_min))
    print(len(uniq_no_stop))
    # uncomment to inspect the normalized vocabulary
    #print(sorted(uniq_norm_min))

normalization_demo()

In [0]:
# run to read the next section
ide.reader.view_section(3)

In [0]:
import nltk

def porter_test(words):
    """Print each word next to the stem NLTK's PorterStemmer gives it.

    Args:
        words: an iterable of strings to stem.
    """
    from nltk.stem.porter import PorterStemmer

    row = "{:10s} --> {:s}"
    stemmer = PorterStemmer()
    for token in words:
        stem = stemmer.stem(token)
        print(row.format(token, stem))

words = 'run runner running ran runs easily fairly'.split()
porter_test(words)

In [0]:
# run to read the next section
ide.reader.view_section(5)

In [0]:
def snowball_test(words):
    """Print each word next to its Snowball (Porter2) stem.

    Relies on the module-level ``nltk`` import.

    Args:
        words: an iterable of strings to stem.
    """
    # Unlike the other stemmers here, Snowball must be told the language.
    english = nltk.stem.snowball.SnowballStemmer(language='english')
    row = "{:10s} --> {:s}"
    for token in words:
        print(row.format(token, english.stem(token)))
snowball_test(words)

In [0]:
# run to read the next section
ide.reader.view_section(7)

In [0]:
def lancaster_test(words):
    """Print each word next to its Lancaster stem.

    Relies on the module-level ``nltk`` import.

    Args:
        words: an iterable of strings to stem.
    """
    lancaster = nltk.stem.lancaster.LancasterStemmer()
    row = "{:10s} --> {:s}"
    for token in words:
        stem = lancaster.stem(token)
        print(row.format(token, stem))
lancaster_test(words)

In [0]:
# run to read the next section
ide.reader.view_section(9)

In [0]:
def demo_nltk_lemma(words):
    """Print each word next to its WordNet lemma, then one part-of-speech example.

    Downloads the NLTK 'wordnet' corpus as a side effect.

    Args:
        words: an iterable of strings to lemmatize.
    """
    import nltk
    nltk.download('wordnet')

    row = "{:10s} --> {:s}"
    lemmatizer = nltk.stem.WordNetLemmatizer()
    for token in words:
        print(row.format(token, lemmatizer.lemmatize(token)))

    # Ask for a specific usage: pos="a" is presumably WordNet's adjective tag.
    as_adjective = lemmatizer.lemmatize('better', pos="a")
    print(row.format('better', as_adjective))
demo_nltk_lemma(words)

In [0]:
# run to read the next section
ide.reader.view_section(11)

In [0]:
def spacy_lemma_demo1():
    """Print every token of a sample sentence next to its spaCy lemma."""
    import spacy

    # Load the model by its package name: the short 'en' alias is a shortcut
    # link that spaCy 3 no longer supports, while the package name works on
    # both major versions as long as the model is installed.
    nlp = spacy.load('en_core_web_sm')

    # tokens have a lemma_
    doc = nlp("Apples are better than ducks")
    for token in doc:
        print(token.text, '==>', token.lemma_)

In [0]:
# run to read the next section
ide.reader.view_section(13)

In [0]:
def spacy_lemma_demo2():
    """Print what spaCy's vocabulary lemmatizer returns for the noun 'ducks'."""
    import spacy
    from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB

    # NOTE(review): `spacy.lemmatizer` and `vocab.morphology.lemmatizer` look
    # like spaCy 2.x APIs -- confirm the installed version before running.
    pipeline = spacy.load('en')
    lemmatize = pipeline.vocab.morphology.lemmatizer

    # Call the lemmatizer directly with the word and its part of speech.
    noun_lemmas = lemmatize('ducks', NOUN)
    print(noun_lemmas)

In [0]:
# run to read the next section
ide.reader.view_section(15)

In [0]:
def spacy_lemma_demo3():
    """Print the lemmas of 'ducks' from a Lemmatizer built on a custom rule table."""
    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups

    # NOTE(review): constructing Lemmatizer from a Lookups object looks like
    # the spaCy 2.x API -- confirm the installed version before running.

    # add a custom conversion for all nouns
    # (presumably each pair rewrites a matching suffix: "s" -> "")
    noun_rules = {"noun": [["s", ""]]}
    tables = Lookups()
    tables.add_table("lemma_rules", noun_rules)

    custom_lemmatizer = Lemmatizer(tables)
    print(custom_lemmatizer("ducks", "NOUN"))

In [0]:
# run to read the next section
ide.reader.view_section(17)

# Review

In [0]:
# run to read the next section
ide.reader.view_section(18)

# Lesson

In [0]:
# run to read the next section
ide.reader.view_section(19)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(20)

In [0]:
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# once you are ready -- run this 
# ide.tester.download_solution()