In [1]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [2]:
import pandas as pd

import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

from sklearn.feature_extraction.text import CountVectorizer

import spacy # texthero, spacy, and nltk do not seem to align; pick 1-2

import string

In [3]:
replace_str1 = '<span class="css-1egxyvc" data-font-weight="bold">croissant</span>'
replace_str2 = '<span class="css-1egxyvc" data-font-weight="bold">croissants</span>'
replace_str3 = '<span class="css-1egxyvc" data-font-weight="bold">Croissant</span>'

In [4]:
reviews = []

with open("../archive/reviews.txt", "r") as the_file:
    while True:
        line = the_file.readline()
        if line == "":
            break
        elif len(line) > 3: # \n
            line = line.replace(replace_str1, "croissant")
            line = line.replace(replace_str2, "croissant")
            line = line.replace(replace_str3, "croissant")
            reviews.append(line)

print(len(reviews))

90


In [5]:
print(reviews[1:4])

['lost larson bakery : These are the best cinnamon rolls ever. The croissant are also delicious and the service is very efficient”\n', 'bang bang pie and biscuits : reminded me of a croissant) and light. The actual filling was very good and you can clearly taste”\n', 'good ambler : minutes. - Ham and cheese croissant: croissant was super flaky and the sesame seeds were a nice touch”\n']


In [6]:
# I like this version better but pandas words differently, so later we conform
dict_reviews = {review.split(' : ')[0]: review.split(' : ')[1] for review in reviews}

names = list(dict_reviews.keys())
words = list(dict_reviews.values())
index = ['idx'+str(idx) for idx in range(0, len(names))]
dict_for_pd = {'idx': index, 'BakeryName': names, 'Review': words}

df = pd.DataFrame.from_dict(dict_for_pd)
df.set_index("idx", inplace = True)
df.head()

Unnamed: 0_level_0,BakeryName,Review
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
idx0,hendrickx belgian bread crafter,The almond chocolate croissant and french coun...
idx1,lost larson bakery,These are the best cinnamon rolls ever. The cr...
idx2,bang bang pie and biscuits,reminded me of a croissant) and light. The act...
idx3,good ambler,minutes. - Ham and cheese croissant: croissant...
idx4,p%C3%A2tisserie coralie,is a croissant desert. Search no more. Very fr...


In [7]:
# some peering
pd.options.display.max_colwidth = None
df[df['Review'].str.contains("flak")]

Unnamed: 0_level_0,BakeryName,Review
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
idx3,good ambler,minutes. - Ham and cheese croissant: croissant was super flaky and the sesame seeds were a nice touch”\n
idx16,hoosier mama pie company,and cream admixture and a flaky buttery crust almost like that of a croissant. Indecisive and gluttonous as”\n
idx33,caffe umbria,"are the highlight of the show, you absolutely have to get a croissant - it was so flaky and buttery and the muffin was also moist and had a hint of cinnamon!!!”\n"
idx39,"abc bakery and deli norridge?hrid=SDxAypM6HgjzV47KHlp0TQ&amp;osq=croissant"">more</a></span></p","of the seductive sweets. I chose a nutella croissant! Fluffy, flaky, sticky hazelnut spread glory. Yes”\n"


In [8]:
# hoosier mama is not a review on a croissant
# not a huge nutella fan, it's just sugar. too easy
# caffe umbria and good ambler are the winners based on this.
# but let's do more with these reviews
# while we r here. 

In [9]:
df[df['Review'].str.contains("artisan")]

Unnamed: 0_level_0,BakeryName,Review
idx,Unnamed: 1_level_1,Unnamed: 2_level_1


In [10]:
# tokenizing, filtering stop words, stemming, and lower casing

stop_words = set(stopwords.words("english"))
stop_words = stop_words.union({'!', '!!!', '!!!”', '!)', '!),', '!?', '!”', '"', '">', '.', "'", '\n'\
                               '&', '(', ')', '),', ').', '+', ',', '-', '--', '&', '...', '.”', '/',
                               '”', '…', '=', '/', '<'})

stemmer = SnowballStemmer('english')

In [11]:
df['Review_Prepped'] = df.Review.apply(lambda xyz: 
                                       [stemmer.stem(word.lower()) 
                                        for word in wordpunct_tokenize(xyz) 
                                        if word.lower() not in stop_words])

In [12]:
df.head()

Unnamed: 0_level_0,BakeryName,Review,Review_Prepped
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
idx0,hendrickx belgian bread crafter,The almond chocolate croissant and french country bread were quite authentic and wonderfully rustic.”\n,"[almond, chocol, croissant, french, countri, bread, quit, authent, wonder, rustic]"
idx1,lost larson bakery,These are the best cinnamon rolls ever. The croissant are also delicious and the service is very efficient”\n,"[best, cinnamon, roll, ever, croissant, also, delici, servic, effici]"
idx2,bang bang pie and biscuits,reminded me of a croissant) and light. The actual filling was very good and you can clearly taste”\n,"[remind, croissant, light, actual, fill, good, clear, tast]"
idx3,good ambler,minutes. - Ham and cheese croissant: croissant was super flaky and the sesame seeds were a nice touch”\n,"[minut, ham, chees, croissant, :, croissant, super, flaki, sesam, seed, nice, touch]"
idx4,p%C3%A2tisserie coralie,"is a croissant desert. Search no more. Very friendly staff, great tea and coffee. A truly welcome addition to the neighborhood. Best of luck.”\n","[croissant, desert, search, friend, staff, great, tea, coffe, truli, welcom, addit, neighborhood, best, luck]"


semantic analysis ?

if someone writes a review on a croissant in chicago, 
what are they likely to say about it. .. https://www.nltk.org/howto/classify.html
or https://www.nltk.org/howto/probability.html

>>> from nltk.classify import SklearnClassifier
>>> from sklearn.naive_bayes import BernoulliNB
>>> from sklearn.svm import SVC
>>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
...               ({"a": 5, "b": 2, "c": 1}, "ham"),
...               ({"a": 0, "b": 3, "c": 4}, "spam"),
...               ({"a": 5, "b": 1, "c": 1}, "ham"),
...               ({"a": 1, "b": 4, "c": 3}, "spam")]
>>> classif = SklearnClassifier(BernoulliNB()).train(train_data)
>>> test_data = [{"a": 3, "b": 2, "c": 1},
...              {"a": 0, "b": 3, "c": 7}]
>>> classif.classify_many(test_data)
['ham', 'spam']
>>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
>>> classif.classify_many(test_data)
['ham', 'spam']

In [13]:
nlp = spacy.load("en_core_web_sm")

items_to_return = []

def many_things(string_of_text):
    """
    uses the spacy package to 
    label words in case its important
    to differentiate can and can in the following
    sent. as ex. . .
    
    i can kick the can down the road
    
    this actually does not work lol
    """
    nlp_mjx = nlp(string_of_text)
    for ent in nlp_mjx.ents:
        info = str(ent.text) + "|" + str(ent.label_)
        items_to_return.append(info)
    return items_to_return

#df['spacy_NLP_ids'] = df['Review'].apply(lambda mjx: many_things(mjx))
#df['spacy_NLP_ids_str'] = df['spacy_NLP_ids'].apply(lambda bbg: " ".join(bbg))

In [14]:
df['Review_Prepped_str'] = df['Review_Prepped'].apply(lambda bbg: " ".join(bbg))
df['word_count'] = df['Review_Prepped_str'].apply(lambda x : nltk.FreqDist(nltk.word_tokenize(x))) #previously Review

In [15]:
pd.concat(map(pd.Series, df['word_count'])).index.value_counts().to_dict()

{'croissant': 85,
 'chocol': 14,
 'coffe': 14,
 'good': 12,
 'latt': 12,
 'chees': 12,
 'almond': 11,
 'delici': 11,
 'also': 10,
 'ham': 8,
 'best': 8,
 'like': 8,
 'great': 7,
 'sandwich': 7,
 'made': 7,
 'one': 7,
 '”': 7,
 'pastri': 6,
 'got': 6,
 'cake': 6,
 '…': 6,
 'ever': 6,
 'class': 5,
 'friend': 5,
 'get': 5,
 'well': 5,
 'breakfast': 5,
 'go': 5,
 'scone': 5,
 'realli': 5,
 'think': 5,
 'flaki': 5,
 'tast': 5,
 'bread': 5,
 'recommend': 5,
 'even': 4,
 '<': 4,
 'cinnamon': 4,
 'tri': 4,
 'seat': 4,
 'awesom': 4,
 'day': 4,
 'amaz': 4,
 'fresh': 4,
 'blueberri': 4,
 '>': 4,
 'outsid': 4,
 'css': 4,
 'world': 4,
 'bold': 4,
 'everyth': 4,
 'span': 4,
 "''": 4,
 'muffin': 4,
 'data': 4,
 'donut': 4,
 '/': 4,
 'enjoy': 4,
 'staff': 4,
 '1egxyvc': 4,
 '=': 4,
 'font': 4,
 'weight': 4,
 '$': 4,
 'tea': 4,
 'place': 4,
 'order': 4,
 'servic': 3,
 'alway': 3,
 'back': 3,
 ';': 3,
 'favorit': 3,
 'need': 3,
 'select': 3,
 'local': 3,
 'danish': 3,
 'super': 3,
 'cooki': 3,
 'return'

In [16]:
# N E ways

df['label'] = ""
df.to_csv("unlabeled_df.csv")

In [17]:
###

In [18]:
cv = CountVectorizer(analyzer=lambda abc: abc)
counted_values = cv.fit_transform(df['Review_Prepped']).toarray() #previously ReviewPrepped
cv_df = pd.DataFrame(counted_values, columns=cv.get_feature_names())
cv_df.iloc[0:5, 0:5]

#https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db

Unnamed: 0,$,12,14,16oz,18
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
