In [7]:
import pandas as pd
import scattertext as st
import spacy
from pprint import pprint
from collections import Counter
import re
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer

# Load the Yelp coffee-shop review data and preview six random rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where that exact file exists.
reviews_csv_path = 'C:\\Users\\lilyx\\DS-Unit-4-Sprint-1-NLP\\module1-text-data\\data\\yelp_coffeeshop_review_data.csv'
shops = pd.read_csv(reviews_csv_path)
shops.sample(6)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating
2580,Flightpath Coffeehouse,3/11/2014 26 check-ins Flightpath has quickly...,4.0 star rating
6977,My Sweet Austin,2/27/2016 All I can say is PERFECTION! We sho...,5.0 star rating
3332,Jo’s Coffee,10/16/2015 1 check-in Yummy yum yum! :) I ha...,5.0 star rating
7453,Hot Mama’s Cafe,3/9/2013 1 check-in Underneath the sign for H...,2.0 star rating
3900,Lola Savannah Coffee Lounge,1/25/2016 Out of the 45+ coffee houses that I...,5.0 star rating
891,Legend Coffee,7/1/2015 I love a good coffee so came over to...,5.0 star rating


In [3]:
# Strip the leading date from every review: split the text on whitespace and
# keep everything after the first word. The column holds lists of words after
# this cell, so it is not safe to run twice in a row.
def splitty(row):
    """Return the row's review text as a list of words without its first word."""
    return row['full_review_text'].split()[1:]


shops['full_review_text'] = shops.apply(splitty, axis=1)

shops.sample(3)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating
3566,Patika,"[So, my, coffee, standards, are, high, to, sta...",5.0 star rating
3600,Cenote,"[Cenote, has, been, my, go-to, cafe, since, I,...",3.0 star rating
4755,Venezia Italian Gelato,"[Simply, awesome., This, is, the, place, for, ...",5.0 star rating


In [4]:
# Re-assemble each review's word list into a single space-separated string.
def jointty(row):
    """Join the words in the row's review column with single spaces."""
    return ' '.join(str(word) for word in row['full_review_text'])


shops['full_review_text'] = shops.apply(jointty, axis=1)
shops.sample(3)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating
6378,"Strange Brew, Austin Coffee",Decent coffee and easy laid back atmosphere. I...,3.0 star rating
6023,La Tazza Fresca,Pretty good coffee and really great sandwiches...,4.0 star rating
5513,Radio Coffee & Beer,They have a great selection of beer and have a...,2.0 star rating


In [5]:
# Lowercase every review so later stop-word matching is case-insensitive.
def lowercase(row):
    """Return the row's review text converted to lowercase."""
    return str.lower(row['full_review_text'])


shops['full_review_text'] = shops.apply(lowercase, axis=1)
shops.sample(3)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating
1997,Epoch Coffee,10 check-ins listed in coffee thursday this pl...,4.0 star rating
5716,Trianon Coffee,3 check-ins my monday morning stop for fresh r...,3.0 star rating
5206,TOMS,2 check-ins i finally was able to check this p...,4.0 star rating


In [1]:
# Load the spaCy English pipeline and build the stop-word set used to filter
# review tokens. Requires the imports cell (spacy, Tokenizer) to have run first.
nlp = spacy.load("en_core_web_md")

# A Tokenizer built from the vocab alone has no prefix/suffix/infix rules, so it
# splits on whitespace only: punctuation stays attached to words ("nice.", "it.").
# That is why punctuated variants such as 'it.' appear in the list below.
tokenizer = Tokenizer(nlp.vocab)

# Notebook-specific additions to spaCy's default English stop words.
CUSTOM_STOP_WORDS = {
    # Domain words that appear in nearly every review or add noise.
    'coffee', 'place', 'check-in', 'ins', 'male', 'deaf', 'told', 'want',
    'cuz', 'u', 'l',
    # Pronouns, articles and contractions. The apostrophes were previously
    # typed as double quotes ('i"m', 'don"t', 'i"ve'), which never matched
    # any real token.
    'I', 'i', "i'm", "i've", "don't", 'it', "it's", 'it.', 'its',
    'they', 'the', 'this', 'him', 'her', 'on',
    # Entries containing spaces. NOTE(review): with the whitespace-only
    # tokenizer above a token cannot contain an internal space, so these
    # presumably never match; kept so the set is otherwise unchanged.
    '= =', 'u want', 'told him', '1 check', 'the ', ' the', ' ',
    # Stand-alone symbols and digits.
    '=', '-', '!', '/', '.', ',',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
}

STOP_WORDS = nlp.Defaults.stop_words.union(CUSTOM_STOP_WORDS)

# STOP_WORDS

NameError: name 'spacy' is not defined

In [6]:
# Tokenize every review and drop stop words, producing one token list per review.
tokens = []

for doc in tokenizer.pipe(shops['full_review_text'], batch_size=800):
    # Lowercase BEFORE the stop-word lookup so the check and the stored token
    # agree. The old code tested the raw text and lowercased afterwards, which
    # would let "The" or "Coffee" through if the text were not already lowercase.
    lowered_words = (token.text.lower() for token in doc)
    tokens.append([word for word in lowered_words if word not in STOP_WORDS])

# One list of kept tokens per row, in the same order as the reviews.
shops['tokens'] = tokens

In [7]:
# Spot-check three random rows to eyeball the newly added 'tokens' column.
shops.sample(3)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating,tokens
1138,The Factory - Cafe With a Soul,this place is really nice. they have done a gr...,5.0 star rating,"[nice., great, job..., type, spend, hours, in...."
5357,Cuvée Coffee Bar,you can find me at cuvee almost every other mo...,5.0 star rating,"[find, cuvee, morning., great, baristas, aweso..."
5242,Genuine Joe,room nice. plain coffee meh. they have intrigu...,4.0 star rating,"[room, nice., plain, meh., intriguing, over-th..."


In [8]:
# Collapse each row's token list into one space-separated string; the result is
# stored in 'joined_tokens', which the scattertext corpus uses as its text column.
def jointty(row):
    """Join the row's tokens with single spaces."""
    return ' '.join(map(str, row['tokens']))


shops['joined_tokens'] = shops.apply(jointty, axis=1)
shops.sample(3)

Unnamed: 0,coffee_shop_name,full_review_text,star_rating,tokens,joined_tokens
4958,Once Over Coffee Bar,"i like the location, and i like some of the pe...",3.0 star rating,"[like, location,, like, people, work, there......","like location, like people work there... like ..."
6819,Teo,2 check-ins the gelato is on the pricy side co...,2.0 star rating,"[check-ins, gelato, pricy, compares, gelato, p...",check-ins gelato pricy compares gelato places ...
6871,Teo,11 check-ins best espresso in town! get the do...,5.0 star rating,"[11, check-ins, best, espresso, town!, double,...",11 check-ins best espresso town! double machia...


In [9]:
# Build the scattertext corpus from the cleaned reviews: documents are
# categorised by 'star_rating' and their text comes from 'joined_tokens',
# parsed with the spaCy pipeline loaded earlier.
corpus_factory = st.CorpusFromPandas(
    shops,
    category_col='star_rating',
    text_col='joined_tokens',
    nlp=nlp,
)
corpus = corpus_factory.build()

In [10]:
# Score every term against the ' 5.0 star rating ' category (scaled F-score)
# and print the ten highest-scoring terms.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(' 5.0 star rating ')

ranked_by_high = term_freq_df.sort_values(by='highratingscore', ascending=False)
top_high_terms = list(ranked_by_high.index[:10])
pprint(top_high_terms)

['= =',
 'best gelato',
 'venezia',
 'coast',
 "anderson 's",
 'italy',
 'love love',
 'incredible',
 'pound',
 'sara']


In [11]:
# Score every term against the ' 1.0 star rating ' category (scaled F-score)
# and print the ten highest-scoring terms.
term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(' 1.0 star rating ')

ranked_by_poor = term_freq_df.sort_values(by='poorratingscore', ascending=False)
top_poor_terms = list(ranked_by_poor.index[:10])
pprint(top_poor_terms)

['particularly fabulous',
 'milk splendas',
 'understand business',
 'stars cuz',
 'splendas',
 'cancelled order',
 'policies',
 'understanding',
 'cancelled',
 'male']


In [12]:
# Display the term-frequency table sorted so the terms with the highest
# 'poorratingscore' come first. The sorted frame is only displayed, not
# assigned, so term_freq_df itself keeps its original order.
term_freq_df.sort_values(by= 'poorratingscore', ascending = False)

Unnamed: 0_level_0,5.0 star rating freq,4.0 star rating freq,2.0 star rating freq,3.0 star rating freq,1.0 star rating freq,highratingscore,poorratingscore
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
particularly fabulous,0,0,0,0,8,0.347828,1.000000e+00
milk splendas,0,0,0,0,6,0.386662,9.997953e-01
understand business,0,0,0,0,6,0.386662,9.997953e-01
stars cuz,0,0,0,0,6,0.386662,9.997953e-01
splendas,0,0,0,1,6,0.367032,9.997839e-01
cancelled order,0,0,0,1,6,0.367032,9.997839e-01
policies,1,0,0,0,6,0.399203,9.997839e-01
understanding,1,0,0,2,7,0.338671,9.997676e-01
cancelled,0,0,1,1,6,0.347828,9.997044e-01
male,2,3,1,0,12,0.227176,9.996268e-01


In [13]:
# Render the interactive scattertext explorer for the ' 1.0 star rating '
# category against all other ratings, labelling each document with its
# coffee shop name.
html = st.produce_scattertext_explorer(corpus,
         category=' 1.0 star rating ',
         category_name='Poor Yelp Rating',
         not_category_name='High Yelp Rating',
         width_in_pixels=1000,
         metadata=shops['coffee_shop_name'])

# Write the page inside a context manager so the file handle is flushed and
# closed deterministically; the bare open(...).write(...) never closed it.
with open("Yelp-CoffeeShop-Visualization.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))

# Keep the byte count as the cell's displayed output, as before.
bytes_written

4840268