## MARC 2022 Training Workshop on Machine Learning and NLP 
## Part II: NLP

### Jiangang Hao, ETS, contact: <jhao@ets.org>
----

### 1. Load packages

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import nltk
from spellchecker import SpellChecker
import string
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

### 2. Text preprocessing and n-grams

In [None]:
# Sample text for the preprocessing demo. The misspellings ("hopep", "intersting",
# "knoww") look deliberate: a later cell runs typo correction on these tokens.
text = 'The class is over. I hopep it is intersting to you. Please let me knoww if not.'

In [None]:
# Change to lower case. The result is only displayed; `text` itself is not reassigned.
text.lower()

In [None]:
# Word tokenization: split the raw (not lower-cased) text into tokens with
# NLTK's word_tokenize and print the resulting list.
word_tokens = word_tokenize(text)
print(word_tokens)

In [None]:
# Remove stop words and punctuation from the lower-cased tokens.
stopword_list = stopwords.words('english')
punctuation_list = list(string.punctuation)
# Build the exclusion set once. The previous version re-concatenated the two
# lists and scanned them linearly for every single token.
excluded_tokens = set(stopword_list) | set(punctuation_list)
cleaned_text = [tok for tok in word_tokenize(text.lower()) if tok not in excluded_tokens]
print(cleaned_text)

In [None]:
# Typo correction.
spell = SpellChecker()
# SpellChecker.correction() may return None when it finds no candidate
# (documented behaviour since pyspellchecker 0.7). Fall back to the original
# token so the tagging and stemming cells always receive strings.
corrected_text = [spell.correction(wd) or wd for wd in cleaned_text]
print(corrected_text)

In [None]:
# Part-of-speech tagging of the spell-corrected tokens; the cell displays the
# value returned by nltk's pos_tag.
pos_tag(corrected_text)

In [None]:
# Reduce every corrected word to its Porter stem, then show (word, stem) pairs
# side by side.
porter = PorterStemmer()
stem_words = list(map(porter.stem, corrected_text))
list(zip(corrected_text, stem_words))

In [None]:
# ngram representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [None]:
# Sentence tokenization: split the original text into sentences; the vectorizer
# cells below are fitted on lists of sentences.
sentence_list = sent_tokenize(text)
print(sentence_list)

In [None]:
# Apply stop-word/punctuation removal and typo correction to every sentence,
# then re-join the surviving tokens into one cleaned string per sentence.
# The exclusion set is built once instead of concatenating the lists per token.
tokens_to_skip = set(stopword_list) | set(punctuation_list)
correct_sentence_list = [
    # correction() may return None when it has no candidate (pyspellchecker >= 0.7);
    # keep the original token in that case so ' '.join never receives None.
    ' '.join(spell.correction(wd) or wd
             for wd in word_tokenize(sent.lower())
             if wd not in tokens_to_skip)
    for sent in sentence_list
]
   

In [None]:
# Display the cleaned, spell-corrected sentences.
correct_sentence_list

In [None]:
# Unigram counts of the cleaned sentences, shown as a DataFrame whose columns
# are labelled with the vectorizer's feature names.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(correct_sentence_list)
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

In [None]:
# TF-IDF weighted unigrams of the cleaned sentences, rounded to two decimals
# for display only (df keeps the unrounded values).
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(correct_sentence_list)
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df.round(2)

In [None]:
# Bigram counts, shown as a DataFrame labelled with the vectorizer's feature names.
# NOTE(review): unlike the unigram cells, this one is fitted on the raw
# `sentence_list` (stop words and typos included) rather than on
# `correct_sentence_list` — confirm that this is intentional.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(sentence_list)
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

### 3. Latent Semantic Analysis
For more background on LSA, here is a great tutorial (it uses Gensim, whereas the cells below use scikit-learn): <https://www.datacamp.com/tutorial/discovering-hidden-topics-python>

In [None]:
from sklearn.decomposition import TruncatedSVD
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Documents from communication in a collaborative task: each string is one chat
# message, kept verbatim (informal spelling included) as input for the LSA example.
doc_list=["So, apt. A has mail + packages delivered directly to the tenants, guaranteed same rent for 2 years, and free wifi", 
          "I like a as well","I did pick C as the best however A also has several friends that live in the same building", 
          "all utilities are included, and it includes 2 full bathrooms", "A or C is my pick, because B's rent apparently usually increases 20% after the 1st yr",
          "It also has onsite laundry which is clean and well maintained and available 24 7", "I think I'd go with A, then", "I think that the availability of  maintenance makes C the best",
          "Can we all agree that B is the worst?", "Apartment B has tenants next door with a salt water aquarium, cell phone service connectivity is weak, and there is only one full bathroom in the apartment",
          "C says maintenance isn't always handled promptly, so I'm worried about it", "B is def worst", "so yeah I dislike B", "A,C,B then?", "that weak cell phone service is enough to make me not even consider it lol",
          "Yeah utilities arent included for b", "A, C, B is definitely what I'd do, yeah", "I'd agree", "Apartment C offers a discount for local college students, the complex is located on land that used to be a farm, and the landlord owns a pizza restaurant",
          "i agree with A, C, B", "The 20% rent increase after a year for B is a big oof"]

In [None]:
# Number of messages (documents) in the corpus.
len(doc_list)

In [None]:
# Apply the same preprocessing as before to every document: lower-case,
# tokenize, drop stop words and punctuation, spell-correct, and re-join.
# The exclusion set is built once instead of concatenating the lists per token.
lsa_tokens_to_skip = set(stopword_list) | set(punctuation_list)
doc_list_correct = [
    # correction() may return None when it has no candidate (pyspellchecker >= 0.7);
    # keep the original token in that case so ' '.join never receives None.
    ' '.join(spell.correction(wd) or wd
             for wd in word_tokenize(sent.lower())
             if wd not in lsa_tokens_to_skip)
    for sent in doc_list
]

In [None]:
# Get the document-term matrix: TF-IDF weighted unigrams of the preprocessed
# documents, used as input for the SVD below.
vectorizer_lsa = TfidfVectorizer(ngram_range=(1,1)) 
X_lsa = vectorizer_lsa.fit_transform(doc_list_correct)

In [None]:
# Preview the first 5 rows of the document-term matrix as a dense DataFrame.
# No column labels are passed, so columns show positional indices, not terms.
pd.DataFrame(X_lsa.toarray()).head(5)

In [None]:
# Specify the number of topics (SVD components) and create the SVD object.
# random_state is fixed so repeated runs give the same decomposition.
num_components=10
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

In [None]:
# Fit the SVD model on the TF-IDF matrix. The transformed matrix returned by
# fit_transform is discarded here; only the fitted attributes are read below.
lsa.fit_transform(X_lsa)

# Get singular values and components.
# NOTE(review): scikit-learn documents components_ as (n_components, n_features),
# i.e. already V^T, so the transpose below would be V despite the name V_T — confirm.
Sigma = lsa.singular_values_ 
V_T = lsa.components_.T


In [None]:
# Explained variance ratio of each LSA component, in component order.
# Uses the explicit fig/ax interface and labels the axes so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(lsa.explained_variance_ratio_, 'bo-')
# The trailing semicolon keeps the list returned by ax.set out of the cell output.
ax.set(title='Explained variance by LSA component',
       xlabel='Component index',
       ylabel='Explained variance ratio');

In [None]:
# Collect the vocabulary terms of the TF-IDF vectorizer; print_topics uses them
# to label the entries of each component.
terms = vectorizer_lsa.get_feature_names_out()

In [None]:
# Print the topics

def print_topics(lsa_model, feature_names=None, n_top=5):
    """Print the highest-weighted terms of every component of a fitted model.

    Parameters
    ----------
    lsa_model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic holding one weight
        per term.
    feature_names : sequence of str, optional
        Term belonging to each position of a component row. When omitted, the
        notebook-level ``terms`` variable is used, which is what this function
        always relied on before the parameter existed.
    n_top : int, optional
        Number of terms listed per topic (default 5).

    Notes
    -----
    Terms are ranked by signed weight, largest first, so terms with large
    negative weights are never listed.
    """
    if feature_names is None:
        # Backward-compatible fallback to the global vocabulary.
        feature_names = terms
    for index, component in enumerate(lsa_model.components_):
        ranked = sorted(zip(feature_names, component), key=lambda pair: pair[1], reverse=True)
        top_terms_list = [term for term, _ in ranked[:n_top]]
        print("Topic "+str(index)+": ",top_terms_list)
# Print the top terms of each component of the fitted LSA model.
print_topics(lsa)

### 4. Neural Embedding

In [None]:
# word vectors (word2vec)
import gensim.downloader as api
from scipy.spatial.distance import cosine

In [None]:
# Load the 100-dimensional GloVe word vectors trained on Twitter data
# (https://nlp.stanford.edu/projects/glove/) through gensim's downloader API.
model = api.load("glove-twitter-100")

In [None]:
# Get the embedding vector of the word "cat".
model.get_vector('cat')

In [None]:
# Get the words most similar to "cat" according to the embedding model.
model.most_similar('cat')

In [None]:
# Cosine similarity between "cat" and "tiger". cosine() comes from
# scipy.spatial.distance and is a distance, hence the 1 - ... conversion.
1-cosine(model.get_vector('cat'), model.get_vector('tiger'))

In [None]:
# Cosine similarity between "cat" and "kitten" (1 minus scipy's cosine distance).
1-cosine(model.get_vector('cat'), model.get_vector('kitten'))

In [None]:
# Cosine similarity between "cat" and "car" (1 minus scipy's cosine distance).
1-cosine(model.get_vector('cat'), model.get_vector('car'))

### 5. Deep Learning Language Models
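Pretrained models can be browsed on the Hugging Face Hub (note that this link is filtered to automatic speech recognition models): <https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads>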

In [None]:
# Fill-in-the-blank task: build a 'fill-mask' pipeline backed by the
# xlm-roberta-base model.
from transformers import pipeline
unmasker = pipeline('fill-mask', model='xlm-roberta-base')

In [None]:
# Show the pipeline's predictions for the <mask> position as a table.
pd.DataFrame(unmasker("Hello I'm Jiangang, and I am running a <mask> to teach people machine learning."))


In [None]:
# Another fill-mask example. The sentence comes with five answer options:
# A. ordinary, B. stubborn, C. skeptical, D. fascinating, E. unobtrusive.
pd.DataFrame(unmasker("It is ironic and somehow tragic that good people are often dull while evil people can be endlessly <mask>"))


In [None]:
# Sentence generation: build a 'text-generation' pipeline and fix the seed so
# the sampled continuation is repeatable.
from transformers import pipeline, set_seed
# Alternative model, left disabled:
#generator = pipeline('text-generation', model='gpt2')
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')
set_seed(43)

In [None]:
# Generate a single continuation of the prompt and print its text.
# The prompt previously read "traing workshop"; the typo is fixed so the model
# is conditioned on the intended sentence (the generated text will change).
prompt = 'Today, I am giving a training workshop on machine learning and NLP. I am going to'
generated = generator(prompt, max_length=120, num_return_sequences=1)
print(generated[0].get('generated_text'))