# NLP Solutions
We have five exercises in this section. The exercises are:

1. Build your own tokenizer by implementing two regular-expression-based functions: one that splits text into words and one that splits it into sentences.
2. Get part-of-speech tags from Trump's speech.
3. Get the nouns in the last 10 sentences of Trump's speech and list them grouped by sentence. Use spaCy.
4. Build your own Bag of Words implementation using the tokenizer created before.
5. Build a 5-gram model and clean up the results.

## Exercise 1: Build your own tokenizer by implementing two regular-expression-based functions: one that splits text into words and one that splits it into sentences.

In [None]:
# Sample paragraph used to exercise the tokenizers below.
# NOTE(review): the pieces are concatenated directly, so there is no
# whitespace between the end of one source line and the start of the next
# (e.g. "later.Well,") -- confirm this is intended.
text = (
    "Here we go again. I was supposed to add this text later."
    "Well, it's 10.p.m. here, and I'm actually having fun making this course. :o"
    "I hope you are getting along fine with this presentation, I really did try."
    "And one last sentence, just so you can test you tokenizers better."
)

In [None]:
import re

def tokenize_words(text: str) -> list:
    """Tokenize text into whitespace-delimited words using a regex.

    Parameters
    ----------
    text: str
            Text to be tokenized

    Returns
    -------
    List[str]
            Words in order of appearance. Punctuation stays attached to
            the word it touches (e.g. ``"again."``). Empty or
            whitespace-only input yields an empty list.

    """
    # Matching runs of non-whitespace (instead of splitting on one literal
    # space) copes with tabs, newlines and repeated spaces, and never
    # produces empty tokens.
    return re.findall(r'\S+', text)

# Show the heading and the word tokens of the sample text on separate lines.
print("Tokenized words:", tokenize_words(text), sep="\n")

In [None]:
def tokenize_sentence(text: str) -> list:
    """Tokenize text into sentences using regex.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Abbreviations followed by whitespace (such as
    ``"p.m. "``) are therefore treated as sentence ends as well.

    Parameters
    ----------
    text: str
            Text to be tokenized

    Returns
    -------
    List[str]
            Sentences in order of appearance, each keeping its closing
            punctuation. Empty or whitespace-only input yields an empty
            list.

    """
    # Trim first so leading/trailing whitespace cannot produce an empty
    # "sentence" at either end of the result.
    stripped = text.strip()
    if not stripped:
        return []
    # The lookbehind keeps the punctuation with its sentence; \s+ (rather
    # than spaces only) also splits sentences separated by newlines or tabs.
    return re.split(r'(?<=[.!?])\s+', stripped)

# Show the heading and the sentence tokens of the sample text on separate lines.
print("Tokenized sentences:", tokenize_sentence(text), sep="\n")

## Exercise 2: Get part-of-speech tags from Trump's speech.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Read the whole speech; the context manager closes the file handle as soon
# as the text has been loaded (the original left it open).
with open("./datasets/trump.txt", "r", encoding="utf-8") as file:
    trump = file.read()

# Split the speech into word tokens and tag each one with its part of speech.
# The tagged list is left as the final expression so the notebook displays it.
words = word_tokenize(trump)
nltk.pos_tag(words)

## Exercise 3: Get the nouns in the last 10 sentences of Trump's speech and list them grouped by sentence. Use spaCy.

In [None]:
import spacy

# Read the whole speech; the context manager closes the file handle as soon
# as the text has been loaded (the original left it open).
with open("./datasets/trump.txt", "r", encoding="utf-8") as file:
    trump = file.read()

nlp = spacy.load("en_core_web_sm")
doc = nlp(trump)

# Keep only the final ten sentences found by spaCy's sentence segmentation.
sents = list(doc.sents)[-10:]
for sentence in sents:
    print("> " + str(sentence))
    print("Nouns:")
    # NOTE(review): noun_chunks yields noun *phrases* (e.g. determiner +
    # adjectives + noun), not single noun tokens -- confirm this is the
    # output the exercise wants.
    for noun in sentence.noun_chunks:
        print(">>" + str(noun))

## Exercise 4: Build your own Bag of Words implementation using the tokenizer created before.

In [None]:
import numpy as np
import spacy

class BagOfWords:
    """Basic binary bag-of-words (BoW) vectorizer built on a spaCy tokenizer.

    Every document is mapped to one row with one column per vocabulary
    word; a cell is 1 when the word occurs in the document and 0 otherwise.
    The vocabulary is ordered by first appearance in the fitted corpus.
    """

    # spaCy pipeline, loaded once when the class is defined and shared by
    # every instance (it is only read, never modified).
    __nlp = spacy.load("en_core_web_sm")

    def __init__(self):
        # Vocabulary state lives on the instance. As a class attribute the
        # list was shared by all vectorizers and grew on every fit.
        self.__bow_list = []   # column index -> word
        self.__bow_index = {}  # word -> column index, for O(1) lookups

    def __tokenize(self, text):
        """Return the words of *text*, skipping punctuation and whitespace tokens."""
        return [
            token.text
            for token in self.__nlp(str(text))
            if not (token.is_punct or token.is_space)
        ]

    def __build_list(self, tokenized_corpus):
        """Rebuild the vocabulary from already tokenized documents."""
        self.__bow_list = []
        self.__bow_index = {}
        for words in tokenized_corpus:
            for word in words:
                if word not in self.__bow_index:
                    self.__bow_index[word] = len(self.__bow_list)
                    self.__bow_list.append(word)

    def fit_transform(self, corpus: list):
        """Learn the vocabulary of ``corpus`` and encode each document.

        Calling this method again discards the previous vocabulary.

        Parameters
        ----------
        corpus: List[str]
                Corpus of texts to be transformed

        Returns
        -------
        List[List[int]]
                One row per document and one column per vocabulary word
                (see ``get_feature_names``); 1 marks a word that occurs in
                the document, 0 one that does not.

        """
        # Tokenize each document exactly once and separately: joining the
        # documents into one string (as before) glued the last word of one
        # document to the first word of the next.
        tokenized_corpus = [self.__tokenize(text) for text in corpus]
        self.__build_list(tokenized_corpus)

        width = len(self.__bow_list)
        corpus_bow_list = []
        for words in tokenized_corpus:
            row = [0] * width
            for word in words:
                row[self.__bow_index[word]] = 1
            corpus_bow_list.append(row)
        return corpus_bow_list

    def get_feature_names(self) -> list:
        """Return words corresponding to columns of matrix.

        Returns
        -------
        List[str]
                Vocabulary learned by the last ``fit_transform`` call, in
                column order. A copy is returned so callers cannot alter
                the vectorizer's internal state.

        """
        return list(self.__bow_list)

# Create a vectorizer and encode the corpus as a bag-of-words matrix.
vectorizer = BagOfWords()

# NOTE(review): `corpus` is not defined in the visible part of this file; it
# must already exist (presumably a list of strings) before this cell runs.
X = vectorizer.fit_transform(corpus)
print(X)

# Vocabulary words and vocabulary size. These are bare expressions, so they
# only produce output where the environment echoes expression values
# (presumably the notebook's last-expression display -- confirm).
vectorizer.get_feature_names()
len(vectorizer.get_feature_names())

## Exercise 5: Build a 5-gram model and clean up the results.

In [None]:
from nltk.book import *

# Token list of `text7`, a name expected to come from the
# `from nltk.book import *` star import above (presumably the Wall Street
# Journal sample, judging by the variable name -- confirm).
wall_street = text7.tokens

import re

# Working token list; read as a module-level global by cleanup() and
# build_ngrams() below.
tokens = wall_street

def cleanup(token_list=None):
    """Drop tokens that do not start with a letter, digit, '.', '!' or '?'.

    Only the first character of each token is inspected, so e.g. ``","``
    and ``"*-1"`` are removed while ``"Nov."`` and ``"."`` are kept.

    Parameters
    ----------
    token_list: Iterable[str], optional
            Tokens to filter. Defaults to the module-level ``tokens``.

    Returns
    -------
    List[str]
            The kept tokens, in their original order.

    """
    if token_list is None:
        # Backward-compatible default: filter the global token list.
        token_list = tokens
    compiled_pattern = re.compile(r"^[a-zA-Z0-9.!?]")
    return [token for token in token_list if compiled_pattern.match(token)]
# Replace the working token list with its filtered version; cleanup() reads
# the current module-level `tokens`.
tokens = cleanup()

def build_ngrams(token_list=None, n=None):
    """Build all contiguous n-grams of a token sequence.

    Parameters
    ----------
    token_list: Iterable[str], optional
            Tokens to slide over. Defaults to the module-level ``tokens``.
    n: int, optional
            Length of each n-gram. Defaults to the module-level ``N``.

    Returns
    -------
    List[List[str]]
            Every window of ``n`` consecutive tokens, in order. Empty when
            there are fewer than ``n`` tokens.

    """
    if token_list is None:
        token_list = tokens
    if n is None:
        n = N
    # Materialise once so any iterable works and every window is a list.
    token_list = list(token_list)
    return [token_list[i:i + n] for i in range(len(token_list) - n + 1)]

def ngram_freqs(ngrams, sep=None):
    """Count which token follows each (n-1)-token context.

    Parameters
    ----------
    ngrams: Iterable[Sequence[str]]
            N-grams; the last token of each is the "next word", the
            tokens before it form the context.
    sep: str, optional
            Separator used to join the context tokens into a dictionary
            key. Defaults to the module-level ``SEP``.

    Returns
    -------
    Dict[str, Dict[str, int]]
            Maps each joined context to a dictionary of follower token ->
            number of occurrences.

    """
    if sep is None:
        sep = SEP

    counts = {}
    for ngram in ngrams:
        token_seq = sep.join(ngram[:-1])
        last_token = ngram[-1]
        followers = counts.setdefault(token_seq, {})
        followers[last_token] = followers.get(last_token, 0) + 1
    return counts

def next_word(text, N, counts, sep=None):
    """Randomly pick the word that follows the last ``N - 1`` words of ``text``.

    Candidates are drawn with probability proportional to how often they
    followed that context in ``counts``.

    Parameters
    ----------
    text: str
            Text generated so far; it is split on whitespace.
    N: int
            Order of the n-gram model (context length is ``N - 1``).
    counts: Dict[str, Dict[str, int]]
            Context -> follower -> frequency, as built by ``ngram_freqs``.
    sep: str, optional
            Separator used to join the context words into a key of
            ``counts``. Defaults to the module-level ``SEP``.

    Returns
    -------
    str
            The chosen follower token.

    Raises
    ------
    KeyError
            If the context does not occur in ``counts``.

    """
    if sep is None:
        sep = SEP

    words = text.split()
    # A plain [-(N - 1):] slice returns the WHOLE list when N == 1 (because
    # -0 == 0), so the empty context is handled explicitly.
    context = words[-(N - 1):] if N > 1 else []
    token_seq = sep.join(context)
    choices = counts[token_seq]

    # random.choices does the weighted draw. The previous hand-rolled
    # roulette could fall through to `assert False` when uniform() returned
    # its upper bound.
    candidates = list(choices)
    weights = list(choices.values())
    return random.choices(candidates, weights=weights)[0]

In [None]:
import random

def clean_generated(generated):
    """Tidy up space-separated generated tokens into readable sentences.

    The first word of every sentence is capitalised and stand-alone
    ``.``, ``!`` and ``?`` tokens are attached to the preceding word.
    Tokens that merely contain a period (e.g. ``"Nov."``) are left alone.

    Parameters
    ----------
    generated: str
            Whitespace-separated tokens.

    Returns
    -------
    str
            The cleaned text, with words joined by single spaces.

    """
    terminators = {'.', '!', '?'}
    cleaned = []
    capitalize_next = True
    for word in generated.split():
        if word in terminators and cleaned:
            # Glue the punctuation to the previous word; the next word
            # starts a new sentence.
            cleaned[-1] += word
            capitalize_next = True
            continue
        if capitalize_next:
            # Upper-case only the first character. str.capitalize() would
            # also lower-case the rest of the word.
            word = word[0].upper() + word[1:]
            capitalize_next = False
        cleaned.append(word)
    return " ".join(cleaned)
   

# Order of the n-gram model: each word is predicted from the N - 1 before it.
N = 5

# Separator used to join tokens into context keys and into generated text.
SEP = " "

# Number of sentences to generate.
sentence_count = 5

ngrams = build_ngrams()

# Seed context for generation; set to None to start from a random context.
start_seq = "Was named a nonexecutive"

counts = ngram_freqs(ngrams)

if start_seq is None:
    # A randomly chosen key is used verbatim: lower-casing it (as the
    # original did) could turn it into a context that is not in `counts`.
    start_seq = random.choice(list(counts.keys()))
    generated = start_seq
else:
    generated = start_seq.lower()

sentences = 0
while sentences < sentence_count:
    word = next_word(generated, N, counts)
    generated += SEP + word
    # Count only stand-alone terminator tokens. Checking the tail of the
    # text with endswith() also counted abbreviations such as "Mr." or
    # "Nov." as sentence ends.
    if word in ('.', '!', '?'):
        sentences += 1


print(clean_generated(generated))