In [1]:
!pip install {razdel,pymorphy2}

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: razdel, pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844 razdel-0.5.0
[0m

![](https://i.imgur.com/qV1wBi5.jpg)

## <b>1 <span style='color:#F1A424'>|</span> Tokenisation</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#F1A424'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>

- Our starting point in our problem are python strings, containing unstructured sentences
- For these sentences to be interpreted by a machine learning model, we need to give them a numeric representation
- To achieve this, the very first step we need to do is to separate a string into multiple substrings, the process is called tokenisation
- We will look at a few ways we can do this; **character**, **sentence**, **word** & **subword** tokenisation

<br>

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>1.1 | </span></span></b> Character Tokenisers</b></p></div>

- The simplest tokenisation approach is **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">character tokenisation</mark>**, we can use python's inbuilt **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">list</mark>** class
- Convert each character into an integer (numericalisation)
- `token2idx` gives us a mapping from each character in the **vocabulary** to a unique integer

In [2]:
text = 'Tokenisation of text is a core task of NLP.'
tokenised_text = list(text)

# Character Tokenised list
print(f'\nNumber of tokens: {len(tokenised_text)}')
print(tokenised_text)

# Mapping Vecabulary dictionary
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenised_text)))}

print(f'\nLength of vocabulary: {len(token2idx)}')
print(token2idx)

# Let's represent text in numerical format 
input_ids = [token2idx[token] for token in tokenised_text]

print(f'\n{len(input_ids)} characters')
print(input_ids)


Number of tokens: 43
['T', 'o', 'k', 'e', 'n', 'i', 's', 'a', 't', 'i', 'o', 'n', ' ', 'o', 'f', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'a', ' ', 'c', 'o', 'r', 'e', ' ', 't', 'a', 's', 'k', ' ', 'o', 'f', ' ', 'N', 'L', 'P', '.']

Length of vocabulary: 18
{' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'T': 5, 'a': 6, 'c': 7, 'e': 8, 'f': 9, 'i': 10, 'k': 11, 'n': 12, 'o': 13, 'r': 14, 's': 15, 't': 16, 'x': 17}

43 characters
[5, 13, 11, 8, 12, 10, 15, 6, 16, 10, 13, 12, 0, 13, 9, 0, 16, 8, 17, 16, 0, 10, 15, 0, 6, 0, 7, 13, 14, 8, 0, 16, 6, 15, 11, 0, 13, 9, 0, 3, 2, 4, 1]


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>1.2 | </span></span></b> Sentence Tokenisers</b></p></div>

**<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Sentence tokenisation</mark>** is the process of splitting a text corpus into sentences that act as the first level of tokens the corpus is comprised of

In [3]:
''' Razdel Module '''

import razdel

paragraph = "Write paragaraph here to convert into tokens. This is the next sentence."

sentences = [sentence.text for sentence in razdel.sentenize(paragraph)]
sentences

['Write paragaraph here to convert into tokens.', 'This is the next sentence.']

In [4]:
''' NLTK Module '''

paragraph = "Write paragaraph here to convert into tokens. This is the next sentence."

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

sentences = nltk.sent_tokenize(paragraph)
print(sentences)

from nltk.tokenize import PunktSentenceTokenizer

paragraph = "Write paragaraph here to convert into tokens. This is the next sentence."
tokeniser = nltk.PunktSentenceTokenizer()
sentences = tokeniser.tokenize(paragraph)
print(sentences)

from nltk.tokenize import RegexpTokenizer

sentence_tokens = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'

tokeniser = nltk.RegexpTokenizer(pattern = sentence_tokens,
                                 gaps = True)
sentences = tokeniser.tokenize(paragraph)
print(sentences)

['Write paragaraph here to convert into tokens.', 'This is the next sentence.']
['Write paragaraph here to convert into tokens.', 'This is the next sentence.']
['Write paragaraph here to convert into tokens.', 'This is the next sentence.']


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
''' SpaCy Module '''
import spacy

paragraph = "Write paragaraph here to convert into tokens. This is the next sentence."

# Load Statistical Model
nlp = spacy.load("en_core_web_sm")
doc = nlp(paragraph)

# Sentence Tokeniser
sentences = [token.text for token in doc.sents]
sentences

['Write paragaraph here to convert into tokens.', 'This is the next sentence.']

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>1.3 | </span></span></b> Word Tokenisation</b></p></div>

**<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Word tokenisation</mark>** is the process of splitting, segmenting sentences into their constiuent words

In [6]:
''' NLTK Module '''

import nltk
from nltk.tokenize import word_tokenize

paragraph = "write paragaraph here to convert into tokens."

words = nltk.word_tokenize(paragraph)
print(words)

from nltk.tokenize import WordPunctTokenizer
  
tokenizer = WordPunctTokenizer()
words = tokenizer.tokenize(paragraph)
print(words)

# Split based on input Regular Expression

from nltk.tokenize import RegexpTokenizer
  
pattern = "[\w']+"
tokenizer = RegexpTokenizer(pattern)
words = tokenizer.tokenize(paragraph)
print(words)

# Splits based on whitespaces; tabs, newlines, spaces

from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
words = tokenizer.tokenize(paragraph)
print(words)

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens', '.']
['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens', '.']
['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens']
['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens.']


In [7]:
''' SpaCy Library '''

import spacy

paragraph = "write paragaraph here to convert into tokens."

# Load Statistical Model
nlp = spacy.load("en_core_web_sm")
doc = nlp(paragraph)

# Tokeniser
tokens = [token.text for token in doc]
tokens

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens', '.']

In [8]:
''' Keras Library '''

from keras.preprocessing.text import text_to_word_sequence

text_to_word_sequence(paragraph)

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens']

In [9]:
''' Gensim Libray '''

from gensim.utils import tokenize

words = list(tokenize(paragraph))
print(words)

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens']


In [10]:
''' razdel library '''
import razdel

paragraph = "write paragaraph here to convert into tokens."
tokens = [token.text for token in razdel.tokenize(paragraph)]
print(tokens)

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens', '.']


In [11]:
''' pymorphy2 Library '''

from pymorphy2.tokenizers import simple_word_tokenize 

paragraph = "write paragaraph here to convert into tokens."
tokens = simple_word_tokenize(paragraph)
print(tokens)

['write', 'paragaraph', 'here', 'to', 'convert', 'into', 'tokens', '.']


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>1.5 | </span></span></b> Subword Tokenisation | HuggingFace</b></p></div>

- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Subword tokenization</mark>** is to combine the best aspects of **<span style='color:#FFC300'>character</span>** & **<span style='color:#FFC300'>word</span>** tokenisation
- The main distinguishing feature of **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Subword tokenization</mark>** is that it is learned from a **pretraining corpus** using a mix of statistical rules and algorithms


- There are several **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Subword tokenization</mark>** algorithms that are commonly used in NLP
    - let’s start with `WordPiece`, which is used by the `BERT` and `DistilBERT` tokenizers
    

- `AutoTokenizer` class allows us to quickly load te tokeniser associated with a pretrained model
- Or we can load the Tokeniser manually from `transformers.DistilBertTokenizer`


In [12]:
# Load Tokeniser using AutoTokenizer
from transformers import AutoTokenizer

text = 'Tokenisation of text is a core task of NLP.'

# Load parameters of the tokeniser
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Show tokeniser information
tokenizer

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [13]:
# Load Tokeniser directly from transformers
from transformers import DistilBertTokenizer

model_ckpt = "distilbert-base-uncased"
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)
distilbert_tokenizer

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [14]:
print('')
print(f'Vocabulary size: {tokenizer.vocab_size}')
print(f'Max length: {tokenizer.model_max_length}')
print(f'Tokeniser model input names: {tokenizer.model_input_names}')


Vocabulary size: 30522
Max length: 512
Tokeniser model input names: ['input_ids', 'attention_mask']


In [15]:
print('Encoded text')
encoded_text = tokenizer(text)
print(encoded_text,'\n')

print('Tokens')
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens,'\n')

print('Convert tokens to string')
print(tokenizer.convert_tokens_to_string(tokens),'\n')

Encoded text
{'input_ids': [101, 19204, 6648, 1997, 3793, 2003, 1037, 4563, 4708, 1997, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 

Tokens
['[CLS]', 'token', '##isation', 'of', 'text', 'is', 'a', 'core', 'task', 'of', 'nl', '##p', '.', '[SEP]'] 

Convert tokens to string
[CLS] tokenisation of text is a core task of nlp. [SEP] 



## <b>2 <span style='color:#F1A424'>|</span> Traditional Feature Generation</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#F1A424'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>

- Having tokenised our sentence, the next step is to convert our substrings into numerical format; **encoding**
- There are several ways we can go about this, lets first get to know more traditional methods

<br>

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>2.1 |</span></span></b> Text/Categorical Encoders</b></p></div>

- Traditional approaches (count based) to feature engineering are part of Bag-of-Words methods
- They are effective methods for extracting features from text, however they miss vital information like semantics, structure, sequence & context around nearby words

**<span style='color:#FFC300'>Traditional methods</span>** for transforming/encoding text to numerical values:

- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">One-Hot Encoding Features</mark>** (`preprocessing.OneHotEncoder`)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Bag-of-Words Feature</mark>** (`feature_extraction.text.CountVectorizer`)
)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Bag-of-Words Feature (ngrams)</mark>** (`feature_extraction.text.CountVectorizer`)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">TF-IDF Feature</mark>**
(`feature_extraction.text.TfidfVectorizer`)

Other methods (More Encoders can be found in library **[cateogory_encoders](https://contrib.scikit-learn.org/category_encoders/index.html)**)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Ordinal Encoding</mark>** (`preprocessing.OrdinalEncoder`) [used for 2D encoding]
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Label Encoding</mark>** (`preprocessing.LabelEncoder`) [used for 1D encoding]
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Frequency Encoding</mark>** (`Collection.counter`)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Hashing Encoding</mark>** (`category_encoder.HashingEncoder`)
- **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Dictionary Encoding</mark>** (`feature_extraction.DictVectorizer`)

In [16]:
''' One-Hot Encoding Features'''
# Vertical direction -> number of words in the list (sentences are treated as words) 
# Horizontal direction -> unique name in list 
# Value represents the location where the word occurs 
# (eg. awesome -> 3rd & last string, 1 value @ its identifier location -> 3rd column (3rd in categories_))

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# tokenised or categorical data in column
words = np.array(['NLP','is','awesome','eh','NLP today','awesome'])
print(words)

encoder = OneHotEncoder(sparse=False)
vectors = encoder.fit_transform(words[:,None])
vectors

df_matrix = pd.DataFrame(vectors,columns=encoder.categories_)
df_matrix.values

['NLP' 'is' 'awesome' 'eh' 'NLP today' 'awesome']


array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [17]:
encoder.categories_

[array(['NLP', 'NLP today', 'awesome', 'eh', 'is'], dtype='<U9')]

In [18]:
''' Bag of Words Features'''
# Vertical direction -> number of sentences in corpus
# Horizontal diction -> Vocabulary ID 
# Value represents the number of words found in the sentence

from sklearn.feature_extraction.text import CountVectorizer

# corpus with multiple entries
corpus = [
    'Girl likes cat Tom',
    'Who likes the cat?',
    'Tom is a quiet cat'
]

vectoriser = CountVectorizer()
vectors = vectoriser.fit_transform(corpus)

df_matrix = pd.DataFrame(vectors.toarray(),
                         columns=vectoriser.vocabulary_)
df_matrix.values

array([[1, 1, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 1, 0]])

In [19]:
# The default regexp select tokens of 2 or more alphanumeric characters (punctuation is 
# completely ignored and always treated as a token separator).

print(vectoriser.tokenizer)
print(vectoriser.token_pattern)

None
(?u)\b\w\w+\b


In [20]:
# vocabulary IDs
vectoriser.vocabulary_

{'girl': 1,
 'likes': 3,
 'cat': 0,
 'tom': 6,
 'who': 7,
 'the': 5,
 'is': 2,
 'quiet': 4}

In [21]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [22]:
''' Bag of Words Features (custom tokeniser) '''
# We can utilise a custom tokeniser, if we already havent 
# modified our input documents of the corpus
# A common preprocessing approach is lammatisation of words
# ie return them to their basic form
# well combine it with nltks word tokeniser

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'Girl likes cat Tom',
    'Who likes the cat?',
    'Tom is a quiet cat'
]

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# lemmatisation preprocessing tokenisation
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vectoriser = CountVectorizer(tokenizer=LemmaTokenizer())  
vectors = vectoriser.fit_transform(corpus)

df_matrix = pd.DataFrame(vectors.toarray(),
                         columns=vectoriser.vocabulary_)
df_matrix.values

array([[0, 0, 1, 1, 0, 1, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 1],
       [0, 1, 1, 0, 1, 0, 1, 0, 1, 0]])

In [23]:
import pandas as pd

dict_corpus = {'Torchtext':"PyTorch's NLP and text processing library",
          'Flair':"Simple framework for NLP",
          'AllenNLP':'Library for designing and evaluating NLP models',
          'ParlAI':'Framework for sharing, training, and testing dialogue models',
          'NeMo':'Toolkit for conversational AI',
          'PyTorch NLP':'Basic utilities for NLP',
          'Translate':"Facebook's machine translation platform",
          'TorchAudio':"PyTorch's library for audio preprocessing"}

corpus = pd.DataFrame(dict_corpus.items(),columns=['library','description'])

In [24]:
print(vectoriser.tokenizer)

<__main__.LemmaTokenizer object at 0x7e98670406d0>


In [25]:
# vocabulary IDs
vectoriser.vocabulary_

{'girl': 3,
 'like': 5,
 'cat': 2,
 'tom': 8,
 'who': 9,
 'the': 7,
 '?': 0,
 'is': 4,
 'a': 1,
 'quiet': 6}

In [26]:
''' Bag of Words Features (n-grams)'''
# Vertical direction -> number of sentences in corpus
# Horizontal diction -> Vocabulary ID 
# Value represents the number of words found in the sentence

from sklearn.feature_extraction.text import CountVectorizer

# corpus with multiple entries
corpus = [
    'Girl likes cat Tom',
    'Who likes the cat?',
    'Tom is a quiet cat'
]

# ngram_range = (1,2) -> both unigrams & bigrams
vectoriser = CountVectorizer(ngram_range=(1,2))
vectors = vectoriser.fit_transform(corpus)

df_matrix = pd.DataFrame(vectors.toarray(),
                         columns=vectoriser.vocabulary_)
df_matrix.values

array([[1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1],
       [1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0]])

In [27]:
# vocabulary IDs
vectoriser.vocabulary_

{'girl': 2,
 'likes': 6,
 'cat': 0,
 'tom': 13,
 'girl likes': 3,
 'likes cat': 7,
 'cat tom': 1,
 'who': 15,
 'the': 11,
 'who likes': 16,
 'likes the': 8,
 'the cat': 12,
 'is': 4,
 'quiet': 9,
 'tom is': 14,
 'is quiet': 5,
 'quiet cat': 10}

In [28]:
''' TF-IDF features (TfidfVectorizer) '''
# corpus -> 3 lines -> 3 vectors
# length of vectors equal to the vocabulary length
# we can also use TfidfTransformer w/ CountVectorizer matrix

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'Girl likes cat Tom',
    'Who likes the cat?',
    'Tom is a quiet cat'
]

vectoriser = TfidfVectorizer(min_df=0., max_df=1., norm='l2',
                             use_idf=True, smooth_idf=True)
vectors = vectoriser.fit_transform(corpus)
df_matrix = pd.DataFrame(vectors.toarray(),
                         columns=vectoriser.vocabulary_)
df_matrix.values
# display(df_matrix)

array([[0.37311881, 0.63174505, 0.        , 0.4804584 , 0.        ,
        0.        , 0.4804584 , 0.        ],
       [0.34520502, 0.        , 0.        , 0.44451431, 0.        ,
        0.5844829 , 0.        , 0.5844829 ],
       [0.34520502, 0.        , 0.5844829 , 0.        , 0.5844829 ,
        0.        , 0.44451431, 0.        ]])

In [29]:
# Vocabulary Identifier
vectoriser.vocabulary_

{'girl': 1,
 'likes': 3,
 'cat': 0,
 'tom': 6,
 'who': 7,
 'the': 5,
 'is': 2,
 'quiet': 4}

In [30]:
''' Ordinal Encoder '''
from sklearn.preprocessing import OrdinalEncoder

words = np.array([['NLP','is','awesome','eh','NLP'],
                  ['NLP','is','very','interesting','eh']]).T

encoder = OrdinalEncoder()
corpus_encoded = encoder.fit_transform(words)
corpus_encoded

array([[0., 0.],
       [3., 3.],
       [1., 4.],
       [2., 2.],
       [0., 1.]])

In [31]:
encoder.categories_

[array(['NLP', 'awesome', 'eh', 'is'], dtype='<U11'),
 array(['NLP', 'eh', 'interesting', 'is', 'very'], dtype='<U11')]

In [32]:
''' Label Encoding '''
from sklearn.preprocessing import LabelEncoder

paragraph = "write paragaraph here to convert into tokens."
lst_paragraph = paragraph.split(' ')

label_encoder = LabelEncoder()
corpus_encoded = label_encoder.fit_transform(lst_paragraph)
corpus_encoded

array([6, 3, 1, 4, 0, 2, 5])

In [33]:
label_encoder.classes_

array(['convert', 'here', 'into', 'paragaraph', 'to', 'tokens.', 'write'],
      dtype='<U10')

In [34]:
''' Frequency Encoding '''
# Slightly more time consuming (just one approach)
# Extracting a constant subset from input text

import pandas as pd

paragraph = {"text":["Miss","Mrs","Mrs","Miss","Mrs"],
             "label":[0,1,1,0,0]}

df = pd.DataFrame(paragraph)

from collections import Counter

counter = Counter()
def extract_const_text(val):
    for i in range(len(val)):
        counter[val[i:i+4]] += 1
        
df['text'].apply(extract_const_text)
mapper = dict(counter.most_common())
df['text'].replace(mapper)
df

Unnamed: 0,text,label
0,Miss,0
1,Mrs,1
2,Mrs,1
3,Miss,0
4,Mrs,0


In [35]:
''' Hashing Encoder '''
# Useful if you don't know how many categories
# Hashes the categorical columns into n_components

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
import category_encoders as ce
import numpy as np

words = np.array([['NLP','is','awesome','eh','NLP'],
                  ['NLP','is','very','interesting','eh']]).T

encoder = ce.HashingEncoder()
encoder.fit_transform(words)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,0,2,0,0,0
1,0,2,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0
3,0,0,1,0,0,0,1,0
4,0,0,0,0,1,0,1,0


In [36]:
''' Dictionary Encoder '''
# If we have a dictinary, we can encode it
# strings are ohe, values are added directly

from sklearn.feature_extraction import DictVectorizer

measurements = [
     {'fox': 'white', 'length': 21.3},
     {'fox': 'red', 'length': 23.8},
     {'fox': 'orange', 'temperature': 25.2}]

vectoriser = DictVectorizer()
vectoriser.fit_transform(measurements).toarray()

array([[ 0. ,  0. ,  1. , 21.3,  0. ],
       [ 0. ,  1. ,  0. , 23.8,  0. ],
       [ 1. ,  0. ,  0. ,  0. , 25.2]])

In [37]:
vectoriser.get_feature_names_out()

array(['fox=orange', 'fox=red', 'fox=white', 'length', 'temperature'],
      dtype=object)

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>2.2 |</span></span></b> Understanding the TF-IDF Model</b></p></div>

### <b><span style='color:#F1A424'>Recreating TfidfVectorizer</span></b>

It's always useful to know the inner workings of common methods used in **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">NLP</mark>**

- Start off with a DataFrame containing a column of text (corpus)
- **Normalise** the corpus, **tokenise** & remove **stopwords**
- Compute the **<span style='color:#FFC300'>Term frequencies</span>** (TF) for the corpus (BOW)
- Compute the **<span style='color:#FFC300'>Document frequency</span>** (# of documents the term exists)
- Compute the **<span style='color:#FFC300'>Inverse Document Frequency</span>** & set it to the diagonal component
- Compute the **<span style='color:#FFC300'>TF-IDF</span>** matrix (using matrix multiplication)
- Compute the normalised **<span style='color:#FFC300'>TF-IDF</span>** matrix

In [38]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import re
from numpy.linalg import norm

''' Create Data '''

corpus = ['Girl likes cat Tom',
          'I like the cat',
          'The cat likes to stay at home']

corpus = pd.DataFrame(corpus,columns=['text'])
corpus = corpus['text']


''' Normalise Text '''
# TfidfTransformer will remove stop words & tokenise the data

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalise(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalise)
norm_corpus = normalize_corpus(corpus)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def get_TFIDF(corpus):

    # BOW
    cv = CountVectorizer(min_df=0.0, 
                         max_df=1.0)
    
    cv_matrix = cv.fit_transform(norm_corpus)

    # TF-IDF
    tt = TfidfTransformer(norm='l2', 
                          smooth_idf=True,
                          use_idf=True)

    tt_matrix = tt.fit_transform(cv_matrix)

    tt_matrix = tt_matrix.toarray()
    vocab = cv.get_feature_names_out()
    ldf = pd.DataFrame(tt_matrix, columns=vocab)
    return ldf.sort_index(axis=1).values

# For comparison
get_TFIDF(corpus)

array([[0.34520502, 0.5844829 , 0.        , 0.        , 0.44451431,
        0.        , 0.5844829 ],
       [0.50854232, 0.        , 0.        , 0.861037  , 0.        ,
        0.        , 0.        ],
       [0.34520502, 0.        , 0.5844829 , 0.        , 0.44451431,
        0.5844829 , 0.        ]])

In [40]:
# For comparison
get_TFIDF(corpus)

array([[0.34520502, 0.5844829 , 0.        , 0.        , 0.44451431,
        0.        , 0.5844829 ],
       [0.50854232, 0.        , 0.        , 0.861037  , 0.        ,
        0.        , 0.        ],
       [0.34520502, 0.        , 0.5844829 , 0.        , 0.44451431,
        0.5844829 , 0.        ]])

In [41]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import re
from numpy.linalg import norm
import scipy.sparse as sp

def get_TFIDF2(corpus,verbose=1):

    ''' Define Vocabulary Dictionary '''
    # create a blank dictionary

    lst_words = list(set([word for doc in [doc.split() 
                                           for doc in corpus] 
                             for word in doc]))
    # initialise dict
    def_feature_dict = {w: 0 for w in lst_words}

    ''' Compute the Term Frequencies (TF) (BOW)'''
    # compute Bag of Words 

    BoW = []
    for doc in corpus:
        bow_feature_doc = Counter(doc.split())
        all_features = Counter(def_feature_dict)
        bow_feature_doc.update(all_features)
        BoW.append(bow_feature_doc)
        
    BoW = pd.DataFrame(BoW)
    np_BoW = np.array([BoW])
    if(verbose == 1):
        print(f'BoW:\n {np_BoW}','\n')

    ''' Compute the Document Frequencies (DF) '''
    # with smoothing (df=df+1)

    features = list(BoW.columns)
    df = np.diff(sp.csc_matrix(BoW, copy=True).indptr)
    df = 1 + df
    
    if(verbose == 1):
        print(f'DF:\n {df}','\n')

    ''' Compute IDF (inverse document frequencies) '''
    # with smoothing (1+ ...)
    
    total_docs = 1 + len(corpus)
    idf = 1.0 + np.log(float(total_docs) / df)
    
    if(verbose == 1):
        print(f'IDF:\n {idf}','\n')

    ''' Add computed IDF terms to diagonal '''
    # so we can use it in matrix mult TF,IDF

    total_features = BoW.shape[1]
    idf_diag = sp.spdiags(idf, diags=0, 
                          m=BoW.shape[1], 
                          n=BoW.shape[1])
    idf_dense = idf_diag.todense()

    ''' Compute TF-IDF feature matrix '''

    tf = np.array(BoW, dtype='float64')
    tfidf = np.matmul(tf,idf_dense)
    
    if(verbose == 1):
        print(f'TFIDF:\n {tfidf}','\n')
    
    ''' Compute Matrix L2 Norm '''
    # If required L2 Normalisation

    norms = norm(tfidf, axis=1)
    norm_tfidf = tfidf / norms[:, None]
    
    if(verbose == 1):
        print(f'L2 TFIDF:\n {norm_tfidf}','\n')

    ldf = pd.DataFrame(norm_tfidf,columns=features)
    ldf.sort_index(axis=1)
    print('Sorted Data')
    print(ldf.values)

get_TFIDF2(norm_corpus)

BoW:
 [[[1 1 1 1 0 0 0]
  [0 0 1 0 1 0 0]
  [0 1 1 0 0 1 1]]] 

DF:
 [2 3 4 2 2 2 2] 

IDF:
 [1.69314718 1.28768207 1.         1.69314718 1.69314718 1.69314718
 1.69314718] 

TFIDF:
 [[1.69314718 1.28768207 1.         1.69314718 0.         0.
  0.        ]
 [0.         0.         1.         0.         1.69314718 0.
  0.        ]
 [0.         1.28768207 1.         0.         0.         1.69314718
  1.69314718]] 

L2 TFIDF:
 [[0.5844829  0.44451431 0.34520502 0.5844829  0.         0.
  0.        ]
 [0.         0.         0.50854232 0.         0.861037   0.
  0.        ]
 [0.         0.44451431 0.34520502 0.         0.         0.5844829
  0.5844829 ]] 

Sorted Data
[[0.5844829  0.44451431 0.34520502 0.5844829  0.         0.
  0.        ]
 [0.         0.         0.50854232 0.         0.861037   0.
  0.        ]
 [0.         0.44451431 0.34520502 0.         0.         0.5844829
  0.5844829 ]]


## <b>3 <span style='color:#F1A424'>|</span> Advanced Feature Generation</b> 

### <b><span style='color:#F1A424'>Embedding Models</span></b>

Tradition methods described in **<span style='color:#FFC300'>Section 2</span>** are effective methods for extracting features from text data


Due to their inherint nature of the models being an unstructured group of words, we lose potentially important information about our text
- **<span style='color:#FFC300'>Semantics</span>**, **<span style='color:#FFC300'>structure</span>**, **<span style='color:#FFC300'>sequence</span>** & **<span style='color:#FFC300'>context around neighbouring words</span>**
    
    
As a result, let's explore more sophisticated models which can capture this type of information:
    - These methods are based on **<span style='color:#FFC300'>vector representations</span>** **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>**
    
    
Let's explore two approaches common **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** approaches:

- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** (Continuous bag of words) model
- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">skip-gram</mark>** (CG) model
    
    

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>3.1 |</span></span></b> Continuous Bag of Words (CBOW) Model | PyTorch</b></p></div>

### <b><span style='color:#F1A424'>CBOW Model</span></b>

- The CBOW model architecture is a type of neural network 
- It's usually a shallow neural network, which aims to predict a target word, based on its **context of words**
- The madel aims to predict the missing word in the centre, given a set of words of its surrounding (eg. +/ n words)
- The CBOW model is trained by feeding it with large amounts of text data, and it learns to associate words that appear in similar contexts
- This allows the model to generate accurate word embeddings, which can be used for various NLP tasks 

### <b><span style='color:#F1A424'>Implementation of CBOW Model in PyTorch</span></b>

Our goal is to train a model from which we can extract **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** & use them for feature generation

The implementation will focus on six parts:
    
- Building a **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">corpus</mark>** vocabulary (string to number dictionary)
- Building a **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** (Context,Target) Vector (context are the words around the main word, get all possible cases & store numerical representation in tensor)
- Building a **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** Model Architecture
- Training the **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** Model 
- Extracting the **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** from the model
- Confirming the correctness of the **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** model

In [42]:
string = 'Today is a good day for taking a walk'
tokens = string.split()
print(tokens)

['Today', 'is', 'a', 'good', 'day', 'for', 'taking', 'a', 'walk']


In [43]:
token_set = set(tokens) # create all unique tokens
word2id = {word:idx for idx,word in enumerate(token_set)} # give unique identifier to each unique token
id2word = {idx:word for idx,word in enumerate(token_set)}
window = 2        # context window size
embeddings = 100  # number of embeddings to be used for representation
epochs = 100     # number of training iterations
lr = 0.001       # learning rate for CBOW 
vocab_size = len(token_set)  # size of vocabulary

In [44]:
import torch
import torch.nn as nn

def context_vector(tokens:list):
    # list of values for each token
    val_context = [word2id[word] for word in tokens] 
    return val_context
    
    
context_pairs = []

# loop through all possible cases 
for i in range(window,len(tokens) - window):
    
    context = []
    
    # words to the left
    for j in range(-window,0):
        context.append(tokens[i+j])
    
    # words to the right
    for j in range(1,window+1):
        context.append(tokens[i+j])
        
    context_pairs.append((context,tokens[i]))
    
# show all context pairs in document
print('context, target pairs\n')
for context in context_pairs:
    print(context)
    
# sample tensor conversion
print('\nfor pytorch; context, target word tensors\n')
for context,target in context_pairs:
    X = torch.tensor(context_vector(context))
    y = torch.tensor(word2id[target])
    print(X,y)

context, target pairs

(['Today', 'is', 'good', 'day'], 'a')
(['is', 'a', 'day', 'for'], 'good')
(['a', 'good', 'for', 'taking'], 'day')
(['good', 'day', 'taking', 'a'], 'for')
(['day', 'for', 'a', 'walk'], 'taking')

for pytorch; context, target word tensors

tensor([1, 4, 7, 6]) tensor(0)
tensor([4, 0, 6, 3]) tensor(7)
tensor([0, 7, 3, 2]) tensor(6)
tensor([7, 6, 2, 0]) tensor(3)
tensor([6, 3, 0, 5]) tensor(2)


In [45]:
from torch.optim import Adam

class CBOW(torch.nn.Module):
    
    def __init__(self,vocab_size,embed_dim):
        super(CBOW,self).__init__()
        
        self.embedding = nn.Embedding(vocab_size,embed_dim)
        self.linear = nn.Linear(embed_dim,vocab_size)
        self.active = nn.LogSoftmax(dim=-1)
        
    def forward(self,x):
        x = sum(self.embedding(x)).view(1,-1)
        x = self.linear(x)
        x = self.active(x)
        return x
    

model = CBOW(vocab_size,embeddings)    
criterion = nn.NLLLoss()
optimiser = Adam(model.parameters(),lr=lr)

In [46]:
# training loop

lst_loss = []
for epoch in range(epochs):
    
    loss = 0.0
    for context,target in context_pairs:
        
        X = torch.tensor(context_vector(context))
        y = torch.tensor([word2id[target]])        

        y_pred = model(X)
        loss += criterion(y_pred,y)
        
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
    lst_loss.append(float(loss.detach().numpy()))
        
print(lst_loss[-1])

0.1866779327392578


In [47]:
''' Test out our CBOW model '''

# tokenised text
temp = ['good','day','taking','a']

# convert to tensor
cont_vector = torch.tensor(context_vector(temp)) 

# prediction
y_pred = model(cont_vector)
print('model output:',y_pred)

pred_index = torch.argmax(y_pred[0]) # get largest argument
print(f'prediction word: ',id2word[pred_index.item()])

model output: tensor([[-5.2092, -6.3915, -5.8735, -0.0512, -7.7725, -6.9833, -3.5222, -4.7050]],
       grad_fn=<LogSoftmaxBackward0>)
prediction word:  for


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>3.2 |</span></span></b> Skip Gram (CG) Model | Keras</b></p></div>

### <b><span style='color:#F1A424'>SG Model </span></b>

- The **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">SG</mark>** (skip-gram) variant takes a **<span style='color:#FFC300'>target word</span>** and tries to predict the **<span style='color:#FFC300'>surrounding context words</span>** (which is the opposite of **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>**)

> - The "fake" task for skip-gram model would be, given a word, we’ll try to predict its neighboring words
> - We’ll define a neighboring word by the window size — a hyper-parameter

- We'll be using Keras, as we have a convenient function that generates **<span style='color:#FFC300'>skip-gram</span>** training data `keras.preprocessing.sequence.skipgrams`
- The data generated using `skipgrams` will be elaborated using an example below

### <b><span style='color:#F1A424'>Example Document </span></b>

Unlike **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>**, it seems like the **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">SG</mark>** model takes a bit more time to understand

- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">docment</mark>** **[The quick brown fox jumps over the red dog]**
- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">skip-grams</mark>** (Assume we use a window size of **2 words**, same as **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** (+/- n words)

<br>



![](https://media.geeksforgeeks.org/wp-content/uploads/word2vec_diagram-1.jpg)

**[Reference](https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/)** - Implement your own word2vec(skip-gram) model in Python



Looking at the top frame (words only the right) - very first case

- Target Word: **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">The</mark>** Neighbouring Words: (`quick`,`brown`) Training Samples (`The`,`quick`), (`The`,`brown`) will have target specified as `label=1`
- All other word combinations outside the outlined in black will have `label=0` (not all training samples are generated)

All other combinations are shown below:

In [48]:
import pandas as pd
from keras.preprocessing import text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
import numpy as np

corpus = ['The quick brown fox jumps over the red dog']
print(f'Corpus Length: {len(corpus)}')

# Tokenise & Get Vocab Size
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}
vocab_size = len(word2id) + 1 
embed_size = 10

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]
print(wids)

# # Generat


Corpus Length: 1
[[1, 2, 3, 4, 5, 6, 1, 7, 8]]


### <b><span style='color:#F1A424'>Input Data </span></b>

`data` Training Sample:

- We have a pair of input words (input target word & one context word)

`labels` Target Value Identifier:

- If it is a positive sample, the word has contextual meaning (context word, label = 1)
- If it is a negative sample, the word has no contextual meaning (random word, label = 0)

### <b><span style='color:#F1A424'>Implementation of SG Model in Keras</span></b>

Our goal is to train a model from which we can extract **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** & use them for feature generation

- Input data is passed on to **<span style='color:#FFC300'>two embedding layers</span>** (one into context & one into word model) which both have a size of (vocab_size,embed_size)
- Next, use merge layer to compute the dot product of the two embeddings & get the dot product value
- Then it is sent to the dense sigmoid layer (outputs 1/0)
- Compare the dense value to the actual label, compute the loss & back propagate the errors to adjust embedding layer weights
- Repeat the process for all (target,context) pairs for multiple epoch loops

In [49]:
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input, Dense
from keras.layers.merge import Dot
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
import gensim
import logging
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
logging.getLogger('tensorflow').setLevel(logging.FATAL)

import pandas as pd
from keras.preprocessing import text

# Let's utilise the TBBT plot text data
corpus = pd.read_csv('/kaggle/input/the-big-bang-theory-plots-all-seasons/The Big Bang Theory.csv')['plot'].tolist()
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2] # save all as a list
corpus = corpus[0] # string


# # Tokenise & Get Vocab Size
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
vocab_size = len(tokenizer.word_index) + 1


word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}
vocab_size = len(word2id) + 1 
embed_size = 20

# wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]
# print('Vocabulary Size:', vocab_size)
# print('Vocabulary Sample:', list(word2id.items())[:10])

In [50]:
''' Define SG Model '''

from keras.layers import Dot
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import Model

word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,input_length=1))
word_model.add(Reshape((embed_size, )))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,input_length=1))
context_model.add(Reshape((embed_size,)))

model_arch = Dot(axes=1)([word_model.output, context_model.output])
model_arch = Dense(1,activation="sigmoid")(model_arch)
model = Model([word_model.input,context_model.input], model_arch)
model.compile(loss="mean_squared_error", optimizer="rmsprop")

In [51]:
# Randomly Generated Initial Weights

print('First Iteration Word Embedding:')
word_embed_layer = model.layers[2]
weights = word_embed_layer.get_weights()[0][1:]
pd.DataFrame(weights, index=id2word.values()).head()

First Iteration Word Embedding:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
to,0.001018,0.030958,-0.021828,0.038249,-0.01439,0.009286,0.001913,0.039623,-0.007687,0.045438,0.001009,0.048673,-0.011482,0.013123,-0.047074,0.028881,0.039772,-0.014625,-0.035699,-0.023828
is,-0.017053,0.029733,0.036733,0.034252,0.026469,-0.001017,-0.001769,0.016534,0.0074,0.02483,-0.024753,0.009743,0.045727,0.003403,-0.019739,0.029668,0.049558,-0.042104,0.034967,-0.040312
her,-0.044901,0.007324,0.019523,-0.00605,-0.00944,0.000186,0.048875,-0.002762,-0.009452,-0.038289,0.02149,0.030297,0.031077,0.02188,0.031627,-0.016604,-0.047135,0.031179,-0.013803,-0.014999
and,0.01127,-0.018285,0.02143,-0.03948,0.009038,-0.041462,-0.003732,0.010398,-0.029211,0.009094,-0.030253,-0.009871,0.049645,0.03134,-0.018123,-0.031906,0.043259,-0.035269,0.009955,0.030515
their,0.042003,0.043692,-0.01731,-0.044624,-0.030629,-0.018605,0.046698,3e-05,-0.002818,-0.037878,0.003803,-0.003974,0.032519,-0.001324,0.03591,0.022881,-0.049265,-0.022948,0.035178,0.014026


In [52]:
''' Training Model '''

from tqdm.notebook import tqdm
from keras.preprocessing.sequence import skipgrams

lst_loss = []
for ii in tqdm(range(40)):
    total_loss = 0.0
    
    # Enumerate over tokenised text
    for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
        
        data, labels = skipgrams(sequence=doc, 
                                 vocabulary_size=vocab_size, 
                                 window_size=2, 
                                 negative_samples=5.)
        
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        
        if x:
            total_loss += model.train_on_batch(x, y)
            
#     print(ii,total_loss)
    lst_loss.append(total_loss)
    
pd.options.plotting.backend = "plotly"
df_loss = pd.DataFrame({'loss':lst_loss})
df_loss.plot(template='plotly_white',title='SG Model Loss')

  0%|          | 0/40 [00:00<?, ?it/s]

In [53]:
print('Last Iteration Word Embedding:')
word_embed_layer = model.layers[2]
weights = word_embed_layer.get_weights()[0][1:]
pd.DataFrame(weights, index=id2word.values()).head()

Last Iteration Word Embedding:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
to,0.001018,0.030958,-0.021828,0.038249,-0.01439,0.009286,0.001913,0.039623,-0.007687,0.045438,0.001009,0.048673,-0.011482,0.013123,-0.047074,0.028881,0.039772,-0.014625,-0.035699,-0.023828
is,-0.017053,0.029733,0.036733,0.034252,0.026469,-0.001017,-0.001769,0.016534,0.0074,0.02483,-0.024753,0.009743,0.045727,0.003403,-0.019739,0.029668,0.049558,-0.042104,0.034967,-0.040312
her,-0.044901,0.007324,0.019523,-0.00605,-0.00944,0.000186,0.048875,-0.002762,-0.009452,-0.038289,0.02149,0.030297,0.031077,0.02188,0.031627,-0.016604,-0.047135,0.031179,-0.013803,-0.014999
and,0.01127,-0.018285,0.02143,-0.03948,0.009038,-0.041462,-0.003732,0.010398,-0.029211,0.009094,-0.030253,-0.009871,0.049645,0.03134,-0.018123,-0.031906,0.043259,-0.035269,0.009955,0.030515
their,0.042003,0.043692,-0.01731,-0.044624,-0.030629,-0.018605,0.046698,3e-05,-0.002818,-0.037878,0.003803,-0.003974,0.032519,-0.001324,0.03591,0.022881,-0.049265,-0.022948,0.035178,0.014026


In [54]:
# Save Embeddings in w2v format
f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1,embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>3.3 |</span></span></b> Word2Vec | (Hybrid CBOW,CG) | Gensim</b></p></div> 

### <b><span style='color:#F1A424'>Efficient Impementation of Word2Vec</span></b>

- Custom implementations of **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** generation models are nice to play around with and understand the inner workings of common methods in NLP
- However, realistic problems tend to contain **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">corpus</mark>** with lots of documents, hence efficiency becomes quite important
- An efficient & very commonly utilised model that utilised both **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">CBOW</mark>** & **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">SG</mark>** is `gensim`'s **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Word2vec</mark>**, so it's worth being familar with it

### <b><span style='color:#F1A424'>Preparing Corpus</span></b>

We can choose to normalise the document, so our dictionary will contain less vocabulary

In [55]:
import nltk

''' Normalise Text '''

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalise(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

In [56]:
import pandas as pd
import numpy as np
import re

# Let's utilise the TBBT plot text data
corpus = pd.read_csv('/kaggle/input/the-big-bang-theory-plots-all-seasons/The Big Bang Theory.csv')['plot']

# Normalise corpus documents
normalize_corpus = np.vectorize(normalise)
norm_corpus = normalize_corpus(corpus)
print('First 4 Normalised Documents:')

norm_corpus[:4]

First 4 Normalised Documents:


array(['unsuccessful visit highiq sperm bank dr leonard hofstadter dr sheldon cooper return home find aspiring actress penny new neighbor across hall apartment sheldon thinks leonard immediately interested chasing dream never catch leonard invites penny sheldons apartment indian food asks use shower since broken wrapped towel gets meet visiting friends howard wolowitz wannabe ladies man tries hit rajesh koothrappali unable speak suffers selective mutism presence women leonard infatuated penny helping use shower agrees retrieve tv exboyfriend kurt however kurts physical superiority overwhelms leonards sheldons combined iq return without pants tv penny feeling bad offers take guys dinner initiating friendship',
       'sheldon leonard drop box flat pack furniture came penny sheldon deeply disturbed messy disorganized apartment later night penny sleeps obsessivecompulsive sheldon unable sleep sneaks apartment organize clean leonard finds reluctantly helps next morning penny furious discov

In [57]:
# Tokenize corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

print('First Tokenised corpus:\n')
print(tokenized_corpus[0])

First Tokenised corpus:

['unsuccessful', 'visit', 'highiq', 'sperm', 'bank', 'dr', 'leonard', 'hofstadter', 'dr', 'sheldon', 'cooper', 'return', 'home', 'find', 'aspiring', 'actress', 'penny', 'new', 'neighbor', 'across', 'hall', 'apartment', 'sheldon', 'thinks', 'leonard', 'immediately', 'interested', 'chasing', 'dream', 'never', 'catch', 'leonard', 'invites', 'penny', 'sheldons', 'apartment', 'indian', 'food', 'asks', 'use', 'shower', 'since', 'broken', 'wrapped', 'towel', 'gets', 'meet', 'visiting', 'friends', 'howard', 'wolowitz', 'wannabe', 'ladies', 'man', 'tries', 'hit', 'rajesh', 'koothrappali', 'unable', 'speak', 'suffers', 'selective', 'mutism', 'presence', 'women', 'leonard', 'infatuated', 'penny', 'helping', 'use', 'shower', 'agrees', 'retrieve', 'tv', 'exboyfriend', 'kurt', 'however', 'kurts', 'physical', 'superiority', 'overwhelms', 'leonards', 'sheldons', 'combined', 'iq', 'return', 'without', 'pants', 'tv', 'penny', 'feeling', 'bad', 'offers', 'take', 'guys', 'dinner',

In [58]:
from gensim.models import word2vec as w2v

# Set Model Parameters
feature_size = 100           # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 1           # Minimum word count                        
sample = 1e-3                # Downsample setting for frequent words

# Word2Vec Model
w2v_model = w2v.Word2Vec(tokenized_corpus, 
                         vector_size=feature_size, 
                         window=window_context, 
                         min_count=min_word_count,
                         sample=sample, 
                         epochs=50)

### <b><span style='color:#F1A424'>Individual Vector Embeddings</span></b>
- Each **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embedding</mark>** has **feature_size** components
- By stacking invididual word embeddings, we can obtain a feature matrix that contains the invidual word **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embeddings</mark>**
- Once we have selected the desired list of word **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">embeddings</mark>**, we can conduct any form of statistical (eg. mean) evaluation for each **document**, so we will obtain a single vector per document

In [59]:
vocab_len = len(w2v_model.wv)
print(f'Vocabulary size: {vocab_len}')

print('First 10 words in vocabulary:')
print(w2v_model.wv.index_to_key[:10])

Vocabulary size: 5817
First 10 words in vocabulary:
['sheldon', 'leonard', 'penny', 'howard', 'amy', 'raj', 'bernadette', 'reference', 'title', 'sheldons']


In [60]:
lst_words = ['sheldon','amy','raj','leonard','penny','bernadette','howard']

np_list = []
for word in lst_words:
    np_list.append(w2v_model.wv[word])
    
# Calculate mean array of selected document words
X = pd.DataFrame(np.stack(np_list).T,columns = lst_words)
X.mean(axis=1)

0    -0.931894
1     0.204056
2    -0.731927
3    -0.268958
4    -0.402197
        ...   
95    0.660655
96    0.511408
97   -1.004928
98   -1.314493
99   -0.118664
Length: 100, dtype: float32

### <b><span style='color:#F1A424'>Averaged Sentence Embedding</span></b>

- Let's use **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">Word2vec</mark>** model for the creation of **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">embeddings</mark>** (**vector representation for words**)
- Every word will have **100 components** & keep only words which appear 20 times (min_word_count)
- For each document, we **sum the component vectors** and divide by their total number to get the **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">averaged sentence vector</mark>**

In [61]:
# Get average embedding vector for each text
def doc_vectorizer(doc, model):
    doc_vector = []
    num_words = 0
    for word in doc:
        try:
            if num_words == 0:
                doc_vector = model.wv[word]
            else:
                doc_vector = np.add(doc_vector, model.wv[word])
            num_words += 1
        except:
            # pass if word is not found
            pass
     
    return np.asarray(doc_vector) / num_words

X = []
for doc in tokenized_corpus:
    X.append(doc_vectorizer(doc, w2v_model))
    
print(f'Sentences: {len(X)}')
print(f'Each sentence has {X[0].shape} dimensions')

Sentences: 279
Each sentence has (100,) dimensions


### <b><span style='color:#F1A424'>Word Embedding Visualisation</span></b>

- Having `feature_size` of given components, we can find which **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">embeddings</mark>** that are closer to one another
- Given such a high dimension space, we can use **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">dimensionality reduction</mark>** methods such as TSNE, which will allow us to visualise the **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">embeddings</mark>** in two dimensions

In [62]:
# View similar words based on gensim's model
print('Similar Words')
similar_words = {search_term: [item[0] for item in w2v_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['sheldon','amy','raj','leonard','penny','bernadette','howard']}
similar_words

Similar Words


{'sheldon': ['alarming', 'retorts', 'willing', 'compromises', 'risky'],
 'amy': ['expect', 'mating', 'want', 'snuggle', 'bunny'],
 'raj': ['lucy', 'briefs', 'amazed', 'emily', 'claire'],
 'leonard': ['overhears', 'fast', 'threatened', 'continuing', 'elaborate'],
 'penny': ['proposes', 'feelings', 'leonard', 'later', 'confesses'],
 'bernadette': ['howard', 'soothe', 'bernadettes', 'halley', 'sing'],
 'howard': ['bernadette', 'bernadettes', 'money', 'heartbeat', 'bear']}

In [63]:
# Lower dimensionality visualisation of embeddings (100->2)
import plotly.express as px
from sklearn.manifold import TSNE
import warnings; warnings.filterwarnings('ignore')

words = sum([[k] + v for k, v in similar_words.items()], [])
wvs = w2v_model.wv[words]

tsne = TSNE(n_components=2, 
            random_state=0, 
            n_iter=10000)

X = tsne.fit_transform(wvs)
labels = words
    
px.scatter(X[:, 0], X[:, 1],text=labels,
           template='plotly_white',
           width=800,
           title='Word Embedding Visualisation')

## <b>4 <span style='color:#F1A424'>|</span> Transformer Pipelines</b> 

### <b><span style='color:#F1A424'>Transformer Introduction</span></b>

- In 2017, researchers at google published a paper that proposed a **<span style='color:#FFC300'>neural network architecture</span>** for sequence modeling called the **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">transformer</mark>**
- The architecture outperforms **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">RNN</mark>** on machine translation tasks both in terms of translation quality and training cost
- An effective transfer learning approach called **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">ULMFiT</mark>** showed that training **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">LSTM</mark>** on a very large and diverse corpus could produce good text classifiers with very little labelled data
- These advanced were the catalysts for the two most well known tranformers **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">GPT</mark>** (Generative Pretrained Transformer) & **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">BERT</mark>** (Bidirectional Encoder Representations from Transformers)

### <b><span style='color:#F1A424'>Huggingface Transformer Library</span></b>

- **<span style='color:#FFC300'>Huggingface Transformers</span>** library offers a standardised interface to a wide range of transformer models (typically utilising PyTorch or Tensorflow)
- **<span style='color:#FFC300'>Huggingface transformers</span>** library actually allows us to interact with the library at various levels of simplification/abstraction
- **<span style='color:#FFC300'>Huggingface transformers</span>**' `pipeline` offers a very easy way to demonstrate some of the most common **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** problems with very little code (like **<span style='color:#FFC300'>fastai</span>**)

In [64]:
# Load the pipline 
from transformers import pipeline

### <b><span style='color:#F1A424'>Common problems in NLP</span></b>

- As per usual, let's create a document that will be used to demonstate what types of problems are commonly encountered in **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>**
- The document is self explanatory, it's a review of a customer who had mistakenly been sent an incorrect shipment via Amazon

**<span style='color:#FFC300'>Explored NLP Pipelines</span>**:
- Text Classification
- Named Entity Recognition
- Question Answering
- Summarisation
- Translation
- Text Generation

In [65]:
text =    """
          Dear Amazon, last week I ordered an Optimus Prime
          action figure from your online store in Germany. 
          Unfortunately, when I opened the package, 
          I discovered to my horror that I had been sent 
          an action figure of Megatron instead! As a lifelong
          enemy of the Decepticons, I hope you can understand 
          my dilemma. To resolve the issue, 
          I demand an exchange of Megatron for the Optimus Prime 
          figure I ordered. Enclosed are copies of my records 
          concerning this purchase. I expect to hear from you soon. 
          Sincerely, Bumblebee.
          """

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.1 |</span></span></b> Text Classification</b></p></div> 

### <b><span style='color:#F1A424'>Sentiment Analysis</span></b>

- A typical problem in **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** is **<span style='color:#FFC300'>sentiment analysis</span>**, classifying text into tones, positive or negative
- A more thorough **<span style='color:#FFC300'>sentiment analysis</span>** can be found in **[Twitter Emotion Classification](https://www.kaggle.com/code/shtrausslearning/twitter-emotion-classification)**
- Such applications gives us information about what a customer things about a certain product and so on
- Lets load the classification pipeline, using `pipeline` with the argument **<span style='color:#FFC300'>text-classification</span>** (when we don't add a model argument, a **[default model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)** is loaded/downloaded)

In [66]:
classifier = pipeline('text-classification')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

If we visit the **[model link](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)**, we can note that it has been setup as a **<span style='color:#FFC300'>binary classifier</span>**

- We will obtain either a **positive** or **negative sentiment** (ie. binary classification) or label
- The model predicts quite confidently that the corpus is negative (which is true)
- We can see how little code we need to write when utilising the **<span style='color:#FFC300'>huggingface</span>** `pipeline`

In [67]:
import pandas as pd

outputs = classifier(text)
pd.DataFrame(outputs)

Unnamed: 0,label,score
0,NEGATIVE,0.901546


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.2 |</span></span></b> Named Entity Recognition (NER)</b></p></div> 

- Another **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** problem is Named Entity Recognition **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">NER</mark>**
- **<mark style="background-color:#FFC300 ;color:white;border-radius:5px;opacity:1.0">NER</mark>** is a sort of multiclass classification problem, the model will find text in the corpus which corresponds to a particular group
- Real world objects like products, places, people etc are called **<span style='color:#FFC300'>named entities</span>**

In [68]:
tagger = pipeline('ner',
                  aggregation_strategy='simple')
outputs = tagger(text)
pd.DataFrame(outputs)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)


Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Unnamed: 0,entity_group,score,word,start,end
0,ORG,0.87901,Amazon,16,22
1,MISC,0.990859,Optimus Prime,47,60
2,LOC,0.999755,Germany,111,118
3,MISC,0.55657,Mega,262,266
4,PER,0.590256,##tron,266,270
5,ORG,0.669692,Decept,317,323
6,MISC,0.498349,##icons,323,328
7,MISC,0.775362,Megatron,436,444
8,MISC,0.987854,Optimus Prime,453,466
9,PER,0.812096,Bumblebee,621,630


- For the output we get a classified group of words, in this case we have entity_groups **<span style='color:#FFC300'>ORG</span>** (organisation), **<span style='color:#FFC300'>MISC</span>**, **<span style='color:#FFC300'>LOC</span>** (location) & **<span style='color:#FFC300'>PER</span>** (person)
- The **score** tells us about how confident the model was about the named entities it identified

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.3 |</span></span></b> Question Answering</b></p></div> 

### <b><span style='color:#F1A424'>Extracting Answers from Context</span></b>

- Another **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** problem is answering of a user question
- We can ask the model a question, based on a particular **<span style='color:#FFC300'>context</span>** (which is our **<span style='color:#FFC300'>corpus</span>**)
- There are a few forms of **<span style='color:#FFC300'>Question Answering</span>** approaches
- This particular one **extracts an answer directly from context** and is called **<span style='color:#FFC300'>Extractive</span>** **<span style='color:#FFC300'>Question Answering</span>**

In [69]:
import pandas as pd

# load the relevant pipeline
reader = pipeline("question-answering") 

# define a question
question = "What does the customer want?"

# model output
outputs = reader(question=question, 
                 context=text) 
pd.DataFrame([outputs])

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Unnamed: 0,score,start,end,answer
0,0.631292,421,444,an exchange of Megatron


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.4 |</span></span></b> Summarisation</b></p></div> 

### <b><span style='color:#F1A424'>Smartly Reducing Text Content</span></b>

- With **<span style='color:#FFC300'>Extractive Question Answering</span>** can extract relevant information quickly from a customer’s feedback
- What can we do if you get a vert long list of customer feedback, and there is no time to read them all? Let’s see if a **<span style='color:#FFC300'>summarisation</span>** model can help
- The goal of **<span style='color:#FFC300'>Summarisation</span>** is to take a **long text as input** and **generate a short version** with all the relevant facts
- This is a much more complicated task than the previous ones since it requires the model
to generate coherent text

In [70]:
summarizer = pipeline("summarization") 
outputs = summarizer(text,
                     min_length=45,
                     max_length=56, 
                     clean_up_tokenization_spaces=True) 
summary = outputs[0]['summary_text']

print(f'original text: {len(text)}')
print(f'summarised text: {len(summary)}')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

original text: 642
summarised text: 226


- Aside from "The Decepticons are a long-time enemy of the Decepticon", the summaration does make sence 

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.5 |</span></span></b> Translation</b></p></div> 

### <b><span style='color:#F1A424'>Sequence to Sequence</span></b>

- Another common task in **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** is **<span style='color:#FFC300'>Text Translation</span>**
- A task where the output consists of generated text, generated from input text (eg. translation of from one language to another)
- Let's translate the original `text` into **French**

In [71]:
translator = pipeline("translation_en_to_fr",
                      model="Helsinki-NLP/opus-mt-en-fr")

outputs = translator(text, 
                     clean_up_tokenization_spaces=True,
                     min_length=100)

translation = outputs[0]['translation_text']

print(f'Translation:\n {translation}')

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Translation:
 Cher Amazon, la semaine dernière j'ai commandé une figure d'action Optimus Prime de votre boutique en ligne en Allemagne. Malheureusement, quand j'ai ouvert le paquet, j'ai découvert à mon horreur que j'avais été envoyé une figure d'action de Megatron à la place! En tant qu'ennemi à vie des Decepticons, j'espère que vous pouvez comprendre mon dilemme. Pour résoudre le problème, j'exige un échange de Megatron contre la figure d'Optimus Prime que j'ai commandé.


<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>4.6 |</span></span></b> Text Generation</b></p></div> 

- Another **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** problem is **<span style='color:#FFC300'>generation of text</span>**
- If we wanted to be able to provide faster replies to customer feedback, by having access to an autocomplete
function, we can do this with **<span style='color:#FFC300'>text generation</span>** 

In [72]:
generator = pipeline("text-generation")
response = "Customer service response: Dear Bumblebee, I am sorry to hear that your order was mixed up, we will"
gen_text = generator(response, max_length=50) 
print(f'\n',gen_text[0]['generated_text'])

No model was supplied, defaulted to gpt2 (https://huggingface.co/gpt2)


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Customer service response: Dear Bumblebee, I am sorry to hear that your order was mixed up, we will have you back on course at your cost. I am still sending out orders. Thank you for your patience. -- Jeff C.


## <b>5 <span style='color:#F1A424'>|</span> Sentiment Analysis</b>   
    
### <b><span style='color:#F1A424'>Methods</span></b>
    
There are two approaches to **<span style='color:#FFC300'>sentiment analysis</span>**
- A **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">rule based</mark>** algorithm created by a human
- A **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">model</mark>** approach, which learns from data

### <b><span style='color:#F1A424'>Rule Based Approaches</span></b>

- A common approach to rule based sentiment analysis is to search the text and **<span style='color:#FFC300'>find specific keywords</span>** which are in a **<span style='color:#FFC300'>mapping dictionary</span>** (which would contain the keyword & corresponding sentiment value)
- Going through the entire text, the found keywords would be summed and we would obtain a sentiment score for the entire sentence

### <b><span style='color:#F1A424'>Machine Learning Approaches</span></b>

- Another approach relies on **<span style='color:#FFC300'>labeled data</span>** (positive,negative,...)
- **<span style='color:#FFC300'>Sentences</span>** with a labeled sentiment value are converted to **<span style='color:#FFC300'>numerical representation</span>** (Eg. using methods in **<span style='color:#FFC300'>Section 2,3</span>**)
- A model then uses this numerical data, together with labelled target values in order to train a model that will predicts sentiment of each text

<br>

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>5.1 |</span></span></b> Rule Based Approaches</b></p></div> 

### <b><span style='color:#F1A424'>VADER</span></b>

- One of the very first successful **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">rule based</mark>** sentment analysis algorithms
- Valence Aware Dictionary for Sentiment Reasoning **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">VADER</mark>**
- We'll utilise the library `vaderSentiment`, which is the original library, `nltk` also has implemented the **[VADER](/https://www.nltk.org/api/nltk.sentiment.vader.html)** algorithm

In [73]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
[0mNote: you may need to restart the kernel to use updated packages.


In [74]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sa = SentimentIntensityAnalyzer()

print('Sentiment Values for specific lexicon (single words):')
for key in list(sa.lexicon)[:10]:
    print(key, sa.lexicon[key])
    
print('\nSentiment Values for specific lexicon (multiple words):')
print([(tok,score) for tok, score in sa.lexicon.items() if " " in tok])

Sentiment Values for specific lexicon (single words):
$: -1.5
%) -0.4
%-) -1.5
&-: -0.4
&: -0.7
( '}{' ) 1.6
(% -0.9
('-: 2.2
(': 2.3
((-: 2.1

Sentiment Values for specific lexicon (multiple words):
[("( '}{' )", 1.6), ("can't stand", -2.0), ('fed up', -1.8), ('screwed up', -1.5)]


In [75]:
sentiment1 = sa.polarity_scores(text="Python is very readable and it's great for NLP.")
print(sentiment1)

sentiment2 = sa.polarity_scores(text='Python is a a bad choice for most application.')
print(sentiment2)

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}
{'neg': 0.304, 'neu': 0.696, 'pos': 0.0, 'compound': -0.5423}


- **<span style='color:#FFC300'>great</span>** gives a more positive tone compared to **<span style='color:#FFC300'>bad</span>**
- The downside of the **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">VADER</mark>** is that it **ignores any words which aren't in its lexicon** `sa.lexicon`

<div style="color:white;display:fill;border-radius:8px;background-color:#323232;font-size:150%; letter-spacing:1.0px"><p style="padding: 12px;color:white;"><b><b><span style='color:white'><span style='color:#F1A424'>5.2 |</span></span></b> Machine Learning Approaches</b></p></div> 

### <b><span style='color:#F1A424'>Naive Bayes </span></b>

- For the machine learning model, we'll use a **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">Naive Bayes</mark>** model
- Model weights that are generated during training are similar to those used to map scores in the rule based **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">VADER</mark>** model
- We need to use an example dataset, so we'll use **Hutto's movie** sentiment data
- A more sophisticated approach can be found in notebook **[Twitter Emotion Classification](https://www.kaggle.com/code/shtrausslearning/twitter-emotion-classification)** using Transformers

In [76]:
import pandas as pd
import numpy as np
pd.set_option('display.width',75)
from nltk.tokenize import casual_tokenize
from collections import Counter

movies = pd.read_csv('/kaggle/input/huttodata/hutto_movies.csv',index_col='id')
movies

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.266667,The Rock is destined to be the 21st Century's ...
2,3.533333,The gorgeously elaborate continuation of ''The...
3,-0.600000,Effective but too tepid biopic
4,1.466667,If you sometimes like to go to the movies to h...
5,1.733333,"Emerges as something rare, an issue movie that..."
...,...,...
10601,-0.062500,Well made but mush hearted.
10602,-1.500000,A real snooze.
10603,-0.625000,No surprises.
10604,1.437500,We’ve seen the hippie turned yuppie plot befor...


In [77]:
# Using the BoW feature extraction approach from section 2

from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer()
vectors = vectoriser.fit_transform(movies.text.tolist())

df_matrix = pd.DataFrame(vectors.toarray(),
                         columns=vectoriser.vocabulary_)
df_matrix

Unnamed: 0,the,rock,is,destined,to,be,21st,century,new,conan,...,abysmally,ame,drudgery,snubbing,degenerates,hogwash,slummer,rashomon,dipsticks,muttering
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Train a Naive Bayes Binary Classifier

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb = nb.fit(df_matrix,movies.sentiment > 0)

movies['predicted_sentiment'] = nb.predict_proba(df_matrix)[:,1] * 8 - 4   # adjust sentiment value
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
print(f'abs error: {movies.error.mean().round(3)}')

abs error: 1.846


In [79]:
# Let's analyse the positive sentiment results

movies['sentiment_ispositive'] = (movies.sentiment > 0).astype(int)
movies['predicted_ispositive'] = (movies.predicted_sentiment > 0).astype(int)
movies['''sentiment predicted_sentiment sentiment_ispositive predicted_ispositive'''.split()].head()

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,2.403965,1,1
2,3.533333,3.998899,1,1
3,-0.6,-3.38826,0,0
4,1.466667,1.569746,1,1
5,1.733333,3.90493,1,1


In [80]:
(movies.predicted_ispositive == movies.sentiment_ispositive).sum()/len(movies)

0.9282413955681282

## <b>6 <span style='color:#F1A424'>|</span> Text Clusterisation</b> 

### <b><span style='color:#F1A424'>News Clusterisation </span></b>

- An interesting application in Natural Language Processing **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">NLP</mark>** is **<span style='color:#FFC300'>text clusterisation</span>**
- In this section, we'll attempt to group together various new articles from four categories & visualise the result in two dimensional space
- We will be working with **a subset**, which takes into account **four categories** (to reduce the problem size)
- `fetch_20newsgroups` is used to download news data in the form of a dictionary

In [81]:
# Load relevant libraries
import numpy as np
np.random.seed(0)
import pandas as pd
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
import seaborn as sns; sns.set(style='whitegrid')

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os

# Choose 4 categories
categories = [
    'rec.sport.hockey',      # Hockey
    'talk.politics.mideast', # Middle East Political News 
    'comp.graphics',         # Computer Graphics
    'sci.crypt'              # Cryptography
]

# Download data
dataset = fetch_20newsgroups(subset='all', 
                             categories=categories,
                             shuffle=True, 
                             random_state=42)

# # save pickle file
# with open('20news.pickle','wb') as handle:
#     pickle.dump(dataset,handle,protocol=pickle.HIGHEST_PROTOCOL)

# open pickle file
# with open('20news.pickle','rb') as handle:
    # dataset_read = pickle.load(handle)

# Let's look at the dictionary keys
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

- Our data contains 3903 documents spread across 4 different categories

In [82]:
# Let's look at the keys & size of our data
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

# Define target variable
labels = dataset.target
names = dataset.target_names

3903 documents
4 categories


### <b><span style='color:#F1A424'>Data Preprocessing </span></b>

- Let's look at one example, data which contains news about hockey games from the NHL
- We need to transform texts into numerical values using `CountVectorizer`
- Filter our stop words `stopwords`, which are too common words and dont have any value by themselves

In [83]:
# One example text
print(f'Document has {len(dataset.data[0])} characters\n')
print(dataset.data[0])

Document has 2721 characters

From: c5ff@jupiter.sun.csd.unb.ca (COOK  Charlie)
Subject: NHL Summary parse results for games played Mon, April 19, 1993
Organization: University of New Brunswick
Lines: 79

Toronto                          1 1 1--3
Detroit                          1 4 1--6
First period
     1, Detroit, Yzerman 1 (Gallant, Ciccarelli) 4:48.
     2, Toronto, Cullen 1 (Clark, Gill) 10:44.
Second period
     3, Detroit, Sheppard 1 (Probert, Coffey) pp, 5:04.
     4, Detroit, Burr 1 (Racine) sh, 6:42.
     5, Detroit, Chiasson 1 (Coffey) pp,11:00.
     6, Detroit, Howe 1 (Yzerman, Drake) 14:46.
     7, Toronto, Gilmour 1 (Borschevsky, Ellett) pp, 19:59.
Third period
     8, Detroit, Racine 1 (Primeau, Drake) 5:10.
     9, Toronto, Lefebvre 1 (Cullen, Pearson) 7:45.

Detroit: 6    Power play: 6-2   Special goals:  pp: 2  sh: 1  Total: 3
Scorer            G    A   Pts
---------------  ---  ---  ---
Burr               1    0    1
Chiasson           1    0    1
Ciccarelli        

In [84]:
# Create an object which will tokenise our data
analyzer = CountVectorizer(stop_words='english').build_analyzer()

# Tokenise data
docs = []
for document in dataset.data:
    tokens = analyzer(document.replace('_', ''))
    docs.append(tokens)

# Lets look at the formed tokens for one document
print(f"Tokenised data contains {len(docs[0])} words\n")
print(docs[0], end='\n')

# Confirm the number of documents has stayed the same
print(f'{len(docs)} documents parsed')

Tokenised data contains 180 words

['c5ff', 'jupiter', 'sun', 'csd', 'unb', 'ca', 'cook', 'charlie', 'subject', 'nhl', 'summary', 'parse', 'results', 'games', 'played', 'mon', 'april', '19', '1993', 'organization', 'university', 'new', 'brunswick', 'lines', '79', 'toronto', 'detroit', 'period', 'detroit', 'yzerman', 'gallant', 'ciccarelli', '48', 'toronto', 'cullen', 'clark', 'gill', '10', '44', 'second', 'period', 'detroit', 'sheppard', 'probert', 'coffey', 'pp', '04', 'detroit', 'burr', 'racine', 'sh', '42', 'detroit', 'chiasson', 'coffey', 'pp', '11', '00', 'detroit', 'howe', 'yzerman', 'drake', '14', '46', 'toronto', 'gilmour', 'borschevsky', 'ellett', 'pp', '19', '59', 'period', 'detroit', 'racine', 'primeau', 'drake', '10', 'toronto', 'lefebvre', 'cullen', 'pearson', '45', 'detroit', 'power', 'play', 'special', 'goals', 'pp', 'sh', 'total', 'scorer', 'pts', 'burr', 'chiasson', 'ciccarelli', 'coffey', 'drake', 'gallant', 'howe', 'primeau', 'probert', 'racine', 'sheppard', 'yzerman

### <b><span style='color:#F1A424'>Vectorisation of Text </span></b>
- We looked at embedding methods in **<span style='color:#FFC300'>Section 3</span>**
- Let's use **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">Word2Vec</mark>** model for the creation of **<mark style="background-color:#FFC300;color:white;border-radius:5px;opacity:1.0">embeddings</mark>** (**vector representation for words**)
- Every word will have **50 components** & keep only words which appear 20 times (min_count)
- For each text, we **sum the component vectors** and divide by their total number; averaged sentence vector

In [85]:
!pip install -U gensim

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.0.1
    Uninstalling gensim-4.0.1:
      Successfully uninstalled gensim-4.0.1
Successfully installed gensim-4.2.0
[0m

In [86]:
from gensim.models import Word2Vec

# Train the vectoriser model 
# The output will be a feature vector for each word
model = Word2Vec(docs, 
                 min_count=20, 
                 vector_size=50)

# Get average embedding vector for each text
def doc_vectorizer(doc, model):
    doc_vector = []
    num_words = 0
    for word in doc:
        try:
            if num_words == 0:
                doc_vector = model.wv[word]
            else:
                doc_vector = np.add(doc_vector, model.wv[word])
            num_words += 1
        except:
            # pass if word is not found
            pass
     
    return np.asarray(doc_vector) / num_words

X = []
for doc in docs:
    X.append(doc_vectorizer(doc, model))

- We obtain a vector made up of 50 components, component averaged for each text, lets check one example:

In [87]:
# How our documents look
print('Averaged text w2v representstion of the hockey score document:')
print(X[0])

# check the size of document vector
print(np.array(X).shape)

Averaged text w2v representstion of the hockey score document:
[-1.6716752  -0.11647164  0.7472527   0.9269589   0.17632335  0.01014612
  1.018923    0.69532    -0.9778698  -0.03308142 -0.20705809 -0.7368745
 -0.5933816   0.32716438  0.0973087  -0.17402852 -0.2317784  -0.14911069
 -0.14395085 -0.5214422  -0.1303203  -0.02557528  1.0683396  -0.64349717
  0.19657405  0.28445783 -0.92583865 -0.06371204 -0.31289467 -0.6510486
 -0.1486652   0.32125127  0.28050053  0.4535382  -1.2328303  -0.13323756
 -0.27970943 -0.20315778  0.6250774  -0.81554705  0.46289375 -0.47906813
 -0.53048176  0.02681099  0.3159594   0.399262   -0.5722732  -0.9323026
 -0.30813485 -0.05658304]
(3903, 50)


### <b><span style='color:#F1A424'>Dimensionality Reduction </span></b>
- Clusterisation will work better if instead of 50 components, we have less
- Lets use a manifold learning unsupervised learning model `TSNE`
- As we reduce dimensions, the model can retain the closeness of elements


In [88]:
# t-SNE
from sklearn.manifold import TSNE

# Create t-SNE object
tsne = TSNE(n_components=2, random_state=0) 

# Transform our data from 50 components to 2 
X = tsne.fit_transform(X) 
print(np.array(X).shape)
print(np.array(X)[:10])

(3903, 2)
[[ 74.07277   -19.305073 ]
 [ 73.64727     9.518189 ]
 [-21.392765    1.1420317]
 [-39.3321      7.871047 ]
 [-15.548744   -1.9471322]
 [  4.614679   11.8018265]
 [  6.944525   17.566723 ]
 [-41.351994   18.92087  ]
 [  2.7967808 -30.771679 ]
 [-59.629375    6.2531796]]


### <b><span style='color:#F1A424'>Clusterisation </span></b>

- For clusterisation, lets use `kmeans` with 4 components,as we have four classes, this will group our unlabelled data into 4 groups

In [89]:
# Create kmeans object
kmeans = KMeans(n_clusters=4)

# Obtain the prediction
y_pred = kmeans.fit(X).predict(X)

# Coordinates of centroids
print ("Cluster Centroids:\n", kmeans.cluster_centers_)

Cluster Centroids:
 [[-37.573822    -0.75575083]
 [ -7.1976366   45.75536   ]
 [ 52.965427     2.9969554 ]
 [ -4.623557   -36.692543  ]]


### <b><span style='color:#F1A424'>Clusterisation Evaluation </span></b>

- Lets check different evaluation metrics for our unsupervised learning model
- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">Silhouette score</mark>** can be used without knowing the values of the real class
- **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">Homogeneity</mark>**, **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">completeness</mark>** and **<mark style="background-color:#393939;color:white;border-radius:5px;opacity:1.0">v-score</mark>** show that the clusters are quite complete, if we have the target values, we can use these metrics

In [90]:
print ("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, y_pred, metric='euclidean'))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, y_pred))
print("completeness: %0.3f" % metrics.completeness_score(labels, y_pred))
print("V-meaure: %0.3f" % metrics.v_measure_score(labels, y_pred))

Silhouette Coefficient: 0.507
Homogeneity: 0.772
completeness: 0.775
V-meaure: 0.774


### <b><span style='color:#F1A424'>Visualisation of Clusters </span></b>

- Lets check how well our model grouped different news texts

In [91]:
data = pd.DataFrame(np.concatenate([X,y_pred[:,None]],axis=1),
                    columns=['x','y','colour'])
data0 = pd.DataFrame(np.concatenate([X,labels[:,None]],axis=1),
                     columns=['x','y','colour'])
data0['colour'] = data0['colour'].replace({3:3,0:2,2:0,1:1})

data['colour'] = data['colour'].replace(dict(zip([0,1,2,3],names)))
data0['colour'] = data0['colour'].replace(dict(zip([0,1,2,3],names)))

In [92]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

fig = px.scatter(data,x='x',y='y',color='colour')
fig.update_layout(template='plotly_white')
fig.update_traces(marker=dict(size=6,
                              line=dict(width=0.75,
                                        color='black')),
                  selector=dict(mode='markers'))
fig.update_layout(coloraxis_showscale=False,title='Text Cluster Prediction',width=600)
# fig.update_layout(coloraxis = {'colorscale':'rainbow'})
fig.show()

fig = px.scatter(data0,x='x',y='y',color='colour')
fig.update_layout(template='plotly_white')
fig.update_traces(marker=dict(size=6,
                              line=dict(width=0.75,
                                        color='black')),
                  selector=dict(mode='markers'))
fig.update_layout(title='Cluster Ground Truth',width=600)
fig.show()
