## MARC 2022 Training Workshop on Machine Learning and NLP 
## Part II: NLP

### Jiangang Hao, ETS, contact: <jhao@ets.org>
----

### 1. Load packages

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import nltk
from spellchecker import SpellChecker
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/jhao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jhao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### 2. Text preprocessing and Ngram 

In [5]:
text = 'The class is over. I hopep it is intersting to you. Please let me knoww if not.'

In [6]:
#change to lower case
text.lower()

'the class is over. i hopep it is intersting to you. please let me knoww if not.'

In [7]:
# word tokenization
word_tokens = word_tokenize(text)
print(word_tokens)

['The', 'class', 'is', 'over', '.', 'I', 'hopep', 'it', 'is', 'intersting', 'to', 'you', '.', 'Please', 'let', 'me', 'knoww', 'if', 'not', '.']


In [8]:
# remove stop words and punctuations
stopword_list = stopwords.words('english')
punctuation_list = list(string.punctuation)
cleaned_text = [txt for txt in word_tokenize(text.lower()) if txt not in stopword_list+punctuation_list]
print(cleaned_text)

['class', 'hopep', 'intersting', 'please', 'let', 'knoww']


In [9]:
# typo correction
spell = SpellChecker()
corrected_text = [spell.correction(wd) for wd in cleaned_text]
print(corrected_text)

['class', 'hope', 'interesting', 'please', 'let', 'knoww']


In [10]:
# part of speech tagging
pos_tag(corrected_text)

[('class', 'NN'),
 ('hope', 'NN'),
 ('interesting', 'VBG'),
 ('please', 'JJ'),
 ('let', 'VB'),
 ('knoww', 'VB')]

In [11]:
# Stemming the words
porter = PorterStemmer()
stem_words = [porter.stem(txt) for txt in corrected_text]
list(zip(corrected_text,stem_words))

[('class', 'class'),
 ('hope', 'hope'),
 ('interesting', 'interest'),
 ('please', 'pleas'),
 ('let', 'let'),
 ('knoww', 'knoww')]

In [12]:
# ngram representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [13]:
# sentence tokenization
sentence_list = sent_tokenize(text)
print(sentence_list)

['The class is over.', 'I hopep it is intersting to you.', 'Please let me knoww if not.']


In [14]:
# applying the stop words removal and typo correction
correct_sentence_list = []
for sent in sentence_list:
    correct_sentence_list.append(' '.join([spell.correction(wd) for wd in word_tokenize(sent.lower()) \
                                  if wd not in stopword_list+punctuation_list]))
   

In [15]:
correct_sentence_list

['class', 'hope interesting', 'please let knoww']

In [17]:
#unigram
vectorizer = CountVectorizer(ngram_range=(1,1)) 
X = vectorizer.fit_transform(correct_sentence_list)
df = pd.DataFrame(X.toarray())
df.columns = vectorizer.get_feature_names_out()
df

Unnamed: 0,class,hope,interesting,knoww,let,please
0,1,0,0,0,0,0
1,0,1,1,0,0,0
2,0,0,0,1,1,1


In [50]:
# Tf-Idf transformation of unigram
vectorizer = TfidfVectorizer(ngram_range=(1,1)) 
X = vectorizer.fit_transform(correct_sentence_list)
df = pd.DataFrame(X.toarray())
df.columns = vectorizer.get_feature_names_out()
df.round(2)

Unnamed: 0,class,hope,interesting,knoww,let,please
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.71,0.71,0.0,0.0,0.0
2,0.0,0.0,0.0,0.58,0.58,0.58


In [21]:
#bigram
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(sentence_list)
df = pd.DataFrame(X.toarray())
df.columns = vectorizer.get_feature_names_out()
df

Unnamed: 0,class is,hopep it,if not,intersting to,is intersting,is over,it is,knoww if,let me,me knoww,please let,the class,to you
0,1,0,0,0,0,1,0,0,0,0,0,1,0
1,0,1,0,1,1,0,1,0,0,0,0,0,1
2,0,0,1,0,0,0,0,1,1,1,1,0,0


### 3. Latent Semantic Analysis
Here is a great tutorial for more details for using Gensim: <https://www.datacamp.com/tutorial/discovering-hidden-topics-python>

In [48]:
from sklearn.decomposition import TruncatedSVD

In [65]:
# specify the number of topics
num_components=2 

# create SVD object
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

In [66]:
# fit SVD model on data
lsa.fit_transform(X)

# Get Singular values and Components 
Sigma = lsa.singular_values_ 
V_transpose = lsa.components_.T

In [67]:
V_transpose

array([[ 4.19518474e-17, -1.03829427e-16],
       [ 7.07106781e-01,  9.03795930e-17],
       [ 7.07106781e-01,  3.55817821e-17],
       [-1.21774228e-16,  5.77350269e-01],
       [-1.21774228e-16,  5.77350269e-01],
       [-1.21774228e-16,  5.77350269e-01]])

In [63]:
# Print the topics with their terms
terms = vectorizer.get_feature_names_out()

In [92]:
# Print the topics

def print_topics(lsa_model):
    for index, component in enumerate(lsa_model.components_):
        zipped = zip(terms, component)
        top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:5]
        top_terms_list=list(dict(top_terms_key).keys())
        print("Topic "+str(index)+": ",top_terms_list)
print_topics(lsa)

Topic 0:  ['hope', 'interesting', 'class', 'knoww', 'let']
Topic 1:  ['knoww', 'let', 'please', 'hope', 'interesting']


### 4. Neural Embedding

In [22]:
# word vectors (word2vec)
import gensim.downloader as api
from scipy.spatial.distance import cosine

In [23]:
#loading the 100 dimension word vector dictionary trained on twitter data. https://nlp.stanford.edu/projects/glove/
model = api.load("glove-twitter-100")



In [25]:
# get the vector of the word cat
model.get_vector('cat')

array([ 0.38446  , -0.45507  ,  0.45351  ,  0.4301   , -0.050908 ,
       -0.26414  ,  0.43253  , -0.3166   ,  0.32214  ,  0.0064333,
       -0.47066  ,  0.95335  , -3.2063   ,  0.010913 , -0.27565  ,
        1.1732   ,  0.52033  , -0.045973 ,  0.094254 , -0.53846  ,
        0.0035668,  0.11934  , -0.17815  , -0.58093  ,  0.65081  ,
       -0.48746  , -0.50961  ,  0.42771  , -0.30638  ,  0.32385  ,
        0.33687  , -0.1717   , -0.39104  , -0.19038  ,  0.37016  ,
       -0.50396  ,  0.041969 , -0.20517  ,  0.3223   ,  0.41217  ,
       -0.42191  , -0.26359  , -0.1773   , -0.35658  ,  0.52145  ,
        0.57282  ,  0.60204  ,  0.74369  ,  0.33377  , -0.45041  ,
        0.015978 , -0.12575  ,  0.29786  , -0.77635  ,  0.23759  ,
        0.63821  ,  0.63726  ,  1.0079   ,  0.13714  , -0.031928 ,
       -0.21299  ,  0.52348  ,  0.67934  , -0.1427   , -0.64236  ,
       -0.47996  , -0.87915  ,  0.17501  ,  0.64517  ,  0.3778   ,
        0.53493  , -0.29723  , -0.25206  , -0.757    ,  0.3364

In [26]:
# get the most similar words as cat
model.most_similar('dinosaur')

[('dinosaurs', 0.7145547866821289),
 ('turtle', 0.6854123473167419),
 ('monkey', 0.6693367958068848),
 ('unicorn', 0.6604658961296082),
 ('t-rex', 0.6527642011642456),
 ('jurassic', 0.6525610089302063),
 ('penguin', 0.6506965756416321),
 ('extinct', 0.6369237303733826),
 ('pig', 0.6347753405570984),
 ('frog', 0.6319982409477234)]

In [21]:
# cosine similarity between cat and tiger
1-cosine(model.get_vector('cat'), model.get_vector('tiger'))

0.6474888920783997

In [22]:
#cosine similarity between cat and kitten
1-cosine(model.get_vector('cat'), model.get_vector('kitten'))

0.7936834692955017

In [23]:
#cosine similarit between cat and car
1-cosine(model.get_vector('cat'), model.get_vector('car'))

0.5291033983230591

### 5. Deep Learning Language Models
<https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads>

In [46]:
# fill the blank task
from transformers import pipeline
unmasker = pipeline('fill-mask', model='xlm-roberta-base')

In [47]:
pd.DataFrame(unmasker("Hello I'm Jiangang, and I am running a <mask> to teach people machine learning."))


Unnamed: 0,score,token,token_str,sequence
0,0.213851,15411,course,"Hello I'm Jiangang, and I am running a course ..."
1,0.130649,18507,class,"Hello I'm Jiangang, and I am running a class t..."
2,0.127468,1528,program,"Hello I'm Jiangang, and I am running a program..."
3,0.093742,13452,project,"Hello I'm Jiangang, and I am running a project..."
4,0.068059,10696,school,"Hello I'm Jiangang, and I am running a school ..."


In [68]:
# another one, A: ordinary, B: stubborn, C: skeptical, D. fascinating, E. unobtrusive
pd.DataFrame(unmasker("It is ironic and somehow tragic that good people are often dull while evil people can be endlessly <mask>"))


Unnamed: 0,score,token,token_str,sequence
0,0.479797,5,.,It is ironic and somehow tragic that good peop...
1,0.098909,27,...,It is ironic and somehow tragic that good peop...
2,0.032577,4127,good,It is ironic and somehow tragic that good peop...
3,0.020705,17723,happy,It is ironic and somehow tragic that good peop...
4,0.01682,17110,sad,It is ironic and somehow tragic that good peop...


In [82]:
#sentence generation
import warnings
warnings.filterwarnings('ignore')
from transformers import pipeline, set_seed
#generator = pipeline('text-generation', model='gpt2')
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')
set_seed(43)

In [90]:
prompt='Today, I am giving a traing workshop on machine learning and NLP. I am going to '
print(generator(prompt, max_length=120, num_return_sequences=1)[0].get('generated_text'))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Today, I am giving a traing workshop on machine learning and NLP. I am going to 
be working with a group from Stanford, and will be speaking to students
over the course of the course. I am very excited and looking forward to
working with a great group of people, and I hope to make some new
contributions in the coming week. I would like to ask for any recommendations
of topics that I should study if I do get this opportunity. I know that
studying computational linguistics is a very active field right now, so I
will
