In [62]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from thinc.api import require_gpu, set_gpu_allocator
from tqdm.notebook import tqdm

In [63]:
# Activate the GPU *before* loading the pipeline: spaCy only places a pipeline on
# the GPU if the GPU was activated prior to `spacy.load`. Run top to bottom, the
# original notebook loaded the transformer first and activated the GPU much later.
# Routing CuPy allocations through PyTorch's allocator keeps the two libraries
# from competing for GPU memory while the transformer runs.
set_gpu_allocator("pytorch")
require_gpu(0)  # raises if GPU 0 cannot be used

# Transformer-based English pipeline used for the tagging / NER cells below.
nlp = spacy.load("en_core_web_trf")

In [4]:
# Load the hotel-review dataset (a `Review` text column and a `Rating` column).
# The path is relative to the notebook's working directory.
REVIEWS_CSV = "tripadvisor_hotel_reviews.csv"
df = pd.read_csv(REVIEWS_CSV)

In [5]:
# Preview the first three rows to check that the file was parsed as expected.
df.head(3)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3


In [6]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


# spaCy Linguistic Features

* Text: The original word text.
* Lemma: The base form of the word.
* POS: The simple UPOS part-of-speech tag.
* Tag: The detailed part-of-speech tag.
* Dep: Syntactic dependency, i.e. the relation between tokens.
* Shape: The word shape – capitalization, punctuation, digits.
* is alpha: Does the token consist of alphabetic characters?
* is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [41]:
# Series of raw review texts; later cells pick other samples from it.
reviews = df["Review"]

# Select one sample by index label (the frame uses the default RangeIndex, so the
# label equals the row position) and run it through the loaded pipeline.
review = reviews.loc[12]
doc = nlp(review)

In [44]:
# `doc` is the parsed review from the previous cell (it holds the whole document);
# it is reused here rather than running the pipeline on the same text a second time.
#
# Every element of a `Doc` is a spaCy token object exposing its linguistic
# annotations as attributes. Only the first token is inspected to keep the output
# short: `doc[:1]` is a span over at most one token, so nothing is printed for an
# empty document.
# The printed labels are kept exactly as before so the recorded output still matches.
for token in doc[:1]:
    print(
        f"Text : {token.text}\n"
        f"Lemma Form : {token.lemma_}\n"
        f"Parts Of Speech : {token.pos_}\n"
        f"POS Tag : {token.tag_}\n"
        f"Syntatic Dependency : {token.dep_}\n"
        f"Word Shape : {token.shape_}\n"
        f"Alphabet Check : {token.is_alpha}\n"
        f"Stopword Check : {token.is_stop}"
    )

Text : nice
Lemma Form : nice
Parts Of Speech : ADJ
POS Tag : JJ
Syntatic Dependency : amod
Word Shape : xxxx
Alphabet Check : True
Stopword Check : False


In [53]:
# Entity extraction.
# `doc.ents` yields the named entities found in the document; an entity can cover
# several words (e.g. "new central library"), so each item is a span, not a token.
for ent in doc.ents:
    print(f"Entity : {ent.text}\t Label : {ent.label_}")
    


Entity : kimpton	 Label : ORG
Entity : 5th	 Label : ORDINAL
Entity : new central library	 Label : ORG
Entity : benaroya concert hall	 Label : ORG
Entity : steep	 Label : ORG
Entity : 29	 Label : CARDINAL
Entity : evening	 Label : TIME


In [59]:
# Visualise the document with displaCy's entity style ("ent").
displacy.render(doc, style="ent")

# spaCy Transformers

In [56]:
from thinc.api import set_gpu_allocator, require_gpu

# To prevent memory allocation issues on the GPU

In [57]:
# Use the PyTorch allocator for GPU memory to prevent memory allocation issues,
# then require GPU 0 (the call returned True when this cell was recorded).
# NOTE(review): per the spaCy docs the GPU has to be activated before a pipeline is
# loaded, so this must run before any `spacy.load(...)` call that should use the GPU.
set_gpu_allocator("pytorch")
require_gpu(0)

True

In [61]:
# Switch to a different sample (index label 110) and display its raw text.
review = reviews.loc[110]

review

"fun charming just needed issues stayed marqueen days march tourist trip seattle intention hitting tourist sites having traveled seattle business times, marqueen location perfect seattle center space needle emp sci fi museum children museum theaters intiman seattle rep mccaw hall key arena, charming neighborhood good restaurants great not good selection does wear days rooms appointed having kitchen great, ate room going larry market right near good fresh meals.the staff friendly helpful seen awhile, knowledgeable friendly fake, quick recommendation need, not say good things staff, really stay worthwhile, weekend staff not friendly helpful.the drawbacks bed uncomfortable slept pillows flat hard, does loud event key arena bars street especially weekend avoid room floor facing street costs, n't like stairs avoid room floor, no elevator walking day gets tiring, clear day views floor room facing street gorgeous.you not close downtown pike place waterfront, did walk couple times n't terrible

In [72]:
# Load the large English pipeline for the word-vector cells that follow.
# NOTE: this rebinds `nlp`, replacing the `en_core_web_trf` pipeline loaded at the
# top of the notebook; every later `nlp(...)` call uses `en_core_web_lg`.
# NOTE(review): presumably chosen because `en_core_web_lg` ships static word vectors
# while the transformer pipeline does not — confirm against the spaCy model docs.
nlp = spacy.load("en_core_web_lg")

In [73]:
# Re-parse the currently selected review with the pipeline now bound to `nlp`,
# then display the token at position 1 (the second token).
doc = nlp(review)
doc[1]

charming

In [75]:
# Shape of the vector attached to the second token.
doc[1].vector.shape

(300,)

In [86]:
# Parse each probe word on its own and keep its first (only) token:
# v1 = "man", v2 = "king", v3 = "queen", v4 = "woman".
v1, v2, v3, v4 = (nlp(word)[0] for word in ("man", "king", "queen", "woman"))

In [89]:
# Similarity of v1 to each of v2, v3 and v4, in that order.
tuple(v1.similarity(other) for other in (v2, v3, v4))

(0.41661593317985535, 0.3541485369205475, 0.8273442983627319)