# Text Processing - Yelp 2021 - Part 3

This notebook covers:
* Word Embedding Models
* Word2Vec
* Doc2Vec
* BERT
* Deep Learning Classification

## Imports and Global Settings

In [1]:
# Common Libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Main NLP libraries
import nltk
import gensim
# Word2Vec
from gensim.models import Word2Vec
import gensim.downloader as api


# Render every float in pandas output with exactly five decimal places.
pd.options.display.float_format = lambda value: '%.5f' % value



## Import Data

In [2]:
# Name of the input file and the relative directory that holds it.
# The two strings are concatenated when the data is loaded, so the
# directory keeps its trailing slash.
filename = "text_data.json"
file_location = "../data/full_data/analytics_ready/"

In [3]:
# The full file holds 6907890 records; only this many are read from the top.
num_records_to_load = 1_000

In [4]:
# Read the first `num_records_to_load` lines of the line-delimited JSON file
# (one record per line) into a DataFrame.
df = pd.read_json(
    file_location + filename,
    orient="records",
    lines=True,
    nrows=num_records_to_load,
)

## Dataframe Pre-Processing

In [5]:
# Peek at the first five rows (the default for `head`).
df.head()

Unnamed: 0,review_id,review_stars,review_text,target_ufc_bool,target_ufc_count
0,---zlFD4Kgfatr0SbDh_zg,4,Been looking for a halfway decent Chinese/Amer...,False,0
1,--BcxYRlOpG0v7nVQWseYA,4,I visited Kyma last week for the first time an...,False,0
2,--KO46TSxWzv32x00s5w9Q,5,It might be the most expensive gelato I've eve...,False,0
3,--XNrIWxRUafMsGqzB5o0g,5,"Love this place! They have great antiques, be...",True,1
4,--aGgQu9HVva6F9fB2-0ew,4,Great salad and cold sandwich.. The soup is am...,False,0


In [6]:
# Summarise column dtypes, non-null counts and memory usage of the loaded sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1000 non-null   object
 1   review_stars      1000 non-null   int64 
 2   review_text       1000 non-null   object
 3   target_ufc_bool   1000 non-null   object
 4   target_ufc_count  1000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 39.2+ KB


In [7]:
# Class balance of the target label.
df["target_ufc_bool"].value_counts()

True     546
False    454
Name: target_ufc_bool, dtype: int64

## Splitting Text

In [8]:
# Whole corpus, plus the label/text pair reused by the embedding sections.
corpus = df["review_text"]
text_target = df[["target_ufc_bool", "review_text"]]

# The label column is object dtype and is matched against the strings
# "True" / "False" to split the reviews into the two classes.
Q_corpus = df.loc[df["target_ufc_bool"] == "True", "review_text"]
NQ_corpus = df.loc[df["target_ufc_bool"] == "False", "review_text"]

print(f'Corpus Size: Total:{corpus.size}, Quality:{Q_corpus.size}, Not Quality:{NQ_corpus.size}')

Corpus Size: Total:1000, Quality:546, Not Quality:454


## Word2Vec

### Preprocessing

In [None]:
# Work on an independent (deep) copy so the Word2Vec columns added below
# do not touch `text_target`.
w2v_df = text_target.copy(deep=True)

In [None]:
def preprocess_text(text, run_stemm=False, run_lemm=False, remove_num=True, stopwords=None):
    """Normalise one review into a lower-cased, whitespace-separated string.

    NOTE(review): this notebook never defined or imported `preprocess_text`,
    so this cell raised NameError on a fresh kernel. The implementation is
    reconstructed from the keyword arguments used at the call site below.
    TODO: confirm it matches the helper used in the earlier parts of this
    series, and replace it with that helper if one exists.

    Parameters
    ----------
    text : object
        Raw review text; coerced with `str()` so non-string values do not fail.
    run_stemm : bool
        Apply NLTK's Porter stemmer to every token.
    run_lemm : bool
        Apply NLTK's WordNet lemmatizer to every token (requires the NLTK
        `wordnet` corpus to be available locally).
    remove_num : bool
        Drop runs of digits.
    stopwords : iterable of str or None
        Tokens to discard; `None` keeps every token.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    cleaned = str(text).lower().strip()
    # Drop apostrophes first so contractions ("don't") stay a single token.
    cleaned = re.sub(r"['’]", "", cleaned)
    if remove_num:
        cleaned = re.sub(r"\d+", " ", cleaned)
    # Remaining punctuation becomes whitespace so "chinese/american" splits in two.
    cleaned = re.sub(r"[^\w\s]", " ", cleaned)
    tokens = cleaned.split()

    if stopwords is not None:
        stop_set = set(stopwords)
        tokens = [tok for tok in tokens if tok not in stop_set]

    if run_stemm:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(tok) for tok in tokens]

    if run_lemm:
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    return " ".join(tokens)


# Word2Vec is trained on surface forms here: no stemming, no lemmatisation,
# no stop-word removal; only digits are dropped.
w2v_df['review_text_clean'] = w2v_df['review_text'].apply(
    preprocess_text,
    run_stemm=False,
    run_lemm=False,
    remove_num=True,
    stopwords=None,
)

In [None]:
# Tokenise each cleaned review on whitespace; every row becomes a list of words.
w2v_df['clean_text_lst'] = w2v_df['review_text_clean'].map(str.split)

In [None]:
# Check the cleaned-text and token-list columns on the first five rows.
w2v_df.head(5)

### All Text Combined

In [None]:
# Two views of the cleaned corpus:
#   all_text     - every cleaned review joined into one space-separated string
#   all_text_lst - one token list per review, the input format Word2Vec takes
all_text = " ".join(w2v_df['review_text_clean'])
all_text_lst = w2v_df['clean_text_lst'].tolist()

# Preview only the first few tokenised reviews; displaying the whole list
# floods the notebook output and bloats the saved file.
all_text_lst[:3]

In [None]:
# Train Word2Vec on the tokenised reviews.
#   min_count=2 - ignore words that occur fewer than two times in the corpus
#   seed=42     - fix the random initialisation so results can be reproduced
#   workers=1   - single training thread; gensim needs this (plus a fixed
#                 PYTHONHASHSEED) for a fully deterministic run. Raise it to
#                 trade reproducibility for speed on larger samples.
word2vec_model = Word2Vec(sentences=all_text_lst, min_count=2, seed=42, workers=1)

In [None]:
# Ten nearest neighbours of "good" in the learned embedding space.
word2vec_model.wv.most_similar(positive='good', topn=10)

In [None]:
# Cosine similarity between the vectors of two words. The original line was
# missing the method name and did not parse. A KeyError here means one of the
# words is not in the model vocabulary (e.g. it fell below `min_count`).
word2vec_model.wv.similarity('good', 'beef')

In [None]:
# Look up the gensim-data catalogue entry for the pretrained Google News vectors.
api.info(name='word2vec-google-news-300')

In [None]:
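# Disabled on purpose: loading the pretrained Google News vectors overloads the
# machine (presumably because of the model's download size / memory footprint —
# TODO confirm before re-enabling).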
# pre_trained_word2vec_model = api.load('word2vec-google-news-300')