# Text Processing - Yelp 2021 - Part 3

This notebook covers:
* Word Embedding Models
* Word2Vec
* Doc2Vec
* Bert
* Fasttext

## Imports and Global Settings

In [1]:
# Common Libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Main NLP libraries
import nltk
import gensim
# Word2Vec
from gensim.models import Word2Vec, word2vec
import gensim.downloader as api
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Fasttext
import csv
import fasttext
from gensim.utils import simple_preprocess
# Bert - DistilBert
import torch
import transformers as ppb

pd.set_option('display.float_format', lambda x: '%.5f' % x)



## Import Data

In [2]:
file_location = "../data/full_data/analytics_ready/"

In [3]:
filename_train = "text_data_train.json"
filename_test = "text_data_test.json"

In [4]:
# 5523992 training records available
# 1382379 testing records available
num_records_to_load = 1000

In [5]:
train = pd.read_json(file_location + filename_train, nrows=num_records_to_load, orient="records", lines=True)
test = pd.read_json(file_location + filename_test, nrows=num_records_to_load, orient="records", lines=True)

## Data Overview

In [6]:
train.head(5)

Unnamed: 0,review_id,review_stars,review_text,target_ufc_bool,target_ufc_count
0,---zlFD4Kgfatr0SbDh_zg,4,Been looking for a halfway decent Chinese/Amer...,False,0
1,--BcxYRlOpG0v7nVQWseYA,4,I visited Kyma last week for the first time an...,False,0
2,--KO46TSxWzv32x00s5w9Q,5,It might be the most expensive gelato I've eve...,False,0
3,--XNrIWxRUafMsGqzB5o0g,5,"Love this place! They have great antiques, be...",True,1
4,--aGgQu9HVva6F9fB2-0ew,4,Great salad and cold sandwich.. The soup is am...,False,0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1000 non-null   object
 1   review_stars      1000 non-null   int64 
 2   review_text       1000 non-null   object
 3   target_ufc_bool   1000 non-null   object
 4   target_ufc_count  1000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 39.2+ KB


In [8]:
print('Training Counts')
train.target_ufc_bool.value_counts()

Training Counts


True     522
False    478
Name: target_ufc_bool, dtype: int64

In [9]:
print('Testing Counts')
test.target_ufc_bool.value_counts()

Testing Counts


True     528
False    472
Name: target_ufc_bool, dtype: int64

In [10]:
print('Training Percent')
train.target_ufc_bool.value_counts(normalize=True)

Training Percent


True    0.52200
False   0.47800
Name: target_ufc_bool, dtype: float64

In [11]:
print('Testing Percent')
test.target_ufc_bool.value_counts(normalize=True)

Testing Percent


True    0.52800
False   0.47200
Name: target_ufc_bool, dtype: float64

## Word2Vec

https://www.kaggle.com/jungealexander/word2vec-and-random-forest-classification

### Preprocessing for Word2Vec

In [12]:
# Load the punkt tokenizer used for splitting reviews into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [13]:
def review_to_wordlist(review, remove_stopwords=False):
    """
    Convert a review to a list of words. Removal of stop words is optional.
    """
    # remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review)
    
    # convert to lower case and split at whitespace
    words = review_text.lower().split()
    
    # remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return words

In [14]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """
    Split review into list of sentences where each sentence is a list of words.
    Removal of stop words is optional.
    """
    # use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # each sentence is furthermore split into words
    sentences = []    
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
            
    return sentences

In [15]:
training_text = []
for review in train['review_text']:
    training_text += review_to_sentences(review, tokenizer)

In [16]:
training_text[3]

['i',
 'visited',
 'kyma',
 'last',
 'week',
 'for',
 'the',
 'first',
 'time',
 'and',
 'really',
 'enjoyed',
 'it']

### Training Word2Vec Model

In [17]:
# Set values for various word2vec parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 1    # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
context = 10          # Context window size

w2v_model = Word2Vec(sentences=training_text,
                     workers=num_workers, vector_size=num_features,
                     min_count=min_word_count, window=context)

### Pre-Trained Word2Vec Model

In [18]:
# goog_w2v_model = api.load('word2vec-google-news-300')

### Preprocessing for Classification

In [19]:
def make_feature_vec(words, model, num_features, local_trained_model):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0
    if local_trained_model:
        index2word_set = set(model.wv.index_to_key)  # words known to the model
        for word in words:
            if word in index2word_set: 
                nwords += 1
                feature_vec = np.add(feature_vec,model.wv.get_vector(word))
    else:
        index2word_set = set(model.index_to_key)
        for word in words:
            if word in index2word_set: 
                nwords += 1
                feature_vec = np.add(feature_vec, model[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

In [20]:
def get_avg_feature_vecs(reviews, model, num_features, local_trained_model):
    """
    Calculate average feature vectors for all reviews
    """
    counter = 0
    review_feature_vecs = np.zeros((len(reviews),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features, local_trained_model)
        counter += 1
    return review_feature_vecs

In [21]:
# calculate average feature vectors for training and test sets
clean_train_reviews = []
for review in train['review_text']:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))
trainDataVecs = get_avg_feature_vecs(clean_train_reviews, w2v_model, num_features, True)

clean_test_reviews = []
for review in test['review_text']:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
testDataVecs = get_avg_feature_vecs(clean_test_reviews, w2v_model, num_features, True)

### Train Classifier

In [22]:
# Fit a random forest to the training data
forest = RandomForestClassifier(n_estimators = 100)

forest = forest.fit(trainDataVecs, train['target_ufc_bool'])

### Predict

In [23]:
w2v_cls_results = forest.predict(testDataVecs)

### Metrics

In [24]:
print(classification_report(test['target_ufc_bool'], w2v_cls_results))

              precision    recall  f1-score   support

       False       0.54      0.54      0.54       472
        True       0.59      0.58      0.58       528

    accuracy                           0.56      1000
   macro avg       0.56      0.56      0.56      1000
weighted avg       0.56      0.56      0.56      1000



## Fasttext

https://fasttext.cc/docs/en/supervised-tutorial.html  
https://towardsdatascience.com/fasttext-for-text-classification-a4b38cbff27c

### Preprocessing

In [25]:
ft_train = train.copy()
ft_test = test.copy()

In [26]:
ft_train['review_text'] = ft_train['review_text'].apply(lambda x: ' '.join(simple_preprocess(x)))
ft_train['target_ufc_bool'] = ft_train['target_ufc_bool'].apply(lambda x: '__label__' + x)

ft_test['review_text'] = ft_test['review_text'].apply(lambda x: ' '.join(simple_preprocess(x)))
ft_test['target_ufc_bool'] = ft_test['target_ufc_bool'].apply(lambda x: '__label__' + x)

In [27]:
ft_train[['target_ufc_bool', 'review_text']].to_csv('ft_train.txt', 
                                                  index = False, 
                                                  sep = ' ',
                                                  header = None, 
                                                  quoting = csv.QUOTE_NONE, 
                                                  quotechar = "", 
                                                  escapechar = " ")

ft_test[['target_ufc_bool', 'review_text']].to_csv('ft_test.txt', 
                                                 index = False, 
                                                 sep = ' ',
                                                 header = None, 
                                                 quoting = csv.QUOTE_NONE, 
                                                 quotechar = "", 
                                                 escapechar = " ")

### Train Model

In [28]:
ft_model = fasttext.train_supervised(input='ft_train.txt', wordNgrams = 2, epoch=25, lr=1.0)

### Test Model

In [29]:
ft_model.test('ft_test.txt')

(1000, 0.585, 0.585)

### Get Predictions

In [30]:
ft_test['predictions'] = ft_test['review_text'].apply(ft_model.predict)

In [31]:
def get_quality_prob(x):
    if x[0][0] == '__label__True':
        return round(x[1][0], 5)
    elif x[0][0] == '__label__False':
        return round(1 - x[1][0], 5)

In [32]:
ft_test['ft_quality_prob'] = ft_test['predictions'].apply(get_quality_prob)

In [33]:
ft_test.head()

Unnamed: 0,review_id,review_stars,review_text,target_ufc_bool,target_ufc_count,predictions,ft_quality_prob
0,--p3d1axlnA7ka_p6hO-QQ,5,oh we love this place discovered it years ago ...,__label__False,0,"((__label__True,), [0.9469317197799683])",0.94693
1,-1v3W4XqQcIe44_I1lZYyA,5,didn smell anything bad like the other reviewe...,__label__True,5,"((__label__True,), [0.9846104979515076])",0.98461
2,-21y2QEKfhjxh2algH_0nQ,5,wow this place is good am so glad that work in...,__label__True,1,"((__label__True,), [0.999253511428833])",0.99925
3,-358vecdAUh6ECkNfawvHw,5,for crepe you can choose up to flavors the men...,__label__False,0,"((__label__True,), [0.6119228601455688])",0.61192
4,-3_NmlYMibrapNEnS_gfcg,5,ve visited kyma twice now with approximately o...,__label__True,1,"((__label__True,), [0.5569931864738464])",0.55699


## BERT

https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

!!! Model overloads memory !!!

In [34]:
bert_train = train.copy()
bert_test = test.copy()

In [35]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
tokenized_train = bert_train['review_text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=500)))
tokenized_test = bert_test['review_text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=500)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [37]:
max_len = 0
for i in tokenized_train.values:
    if len(i) > max_len:
        max_len = len(i)

padded_train = np.array([i + [0]*(max_len-len(i)) for i in tokenized_train.values])

max_len = 0
for i in tokenized_test.values:
    if len(i) > max_len:
        max_len = len(i)

padded_test = np.array([i + [0]*(max_len-len(i)) for i in tokenized_test.values])

In [38]:
attention_mask_train = np.where(padded_train != 0, 1, 0)
attention_mask_test = np.where(padded_test != 0, 1, 0)

In [None]:
input_ids_train = torch.tensor(padded_train)  
attention_mask_train = torch.tensor(attention_mask_train)

with torch.no_grad():
    last_hidden_states_train = model(input_ids_train, attention_mask=attention_mask_train)

In [None]:
input_ids_test = torch.tensor(padded_test)  
attention_mask_test = torch.tensor(attention_mask_test)

with torch.no_grad():
    last_hidden_states_test = model(input_ids_test, attention_mask=attention_mask_test)

In [None]:
bert_train_features = last_hidden_states_train[0][:,0,:].numpy()
bert_test_features = last_hidden_states_test[0][:,0,:].numpy()

### Train Classifier

In [None]:
# Fit a random forest to the training data
bert_forest = RandomForestClassifier(n_estimators = 100)

bert_forest = bert_forest.fit(bert_train_features, train['target_ufc_bool'])

### Predict

In [None]:
bert_cls_results = bert_forest.predict(bert_test_features)

### Metrics

In [None]:
print(classification_report(test['target_ufc_bool'], bert_cls_results))