In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from collections import Counter
from IPython.display import display_html
import os
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
sns.set(style="whitegrid")

# supervised learning models
from sklearn import ensemble
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# deep neutral network
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# text manipulation tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
import spacy
#import wordcould

In [7]:
PATH = 'data/'
biz=pd.read_csv(os.path.join(PATH,"yelp_business.csv"))
reviews=pd.read_csv(os.path.join(PATH,"yelp_review.csv"))

In [8]:
col = ['neighborhood', 'address', 'latitude', 'longitude', 'stars']
biz.drop(columns=col, inplace=True)    

biz.categories = biz.categories.apply(lambda x: x.split(";"))

biz = biz[
    (biz.city == 'Cleveland') & 
    (biz.state == 'OH') & 
    (biz.is_open == 1) & 
    (biz.categories.apply(
        lambda x: True if 'Restaurants' in x else False
    ))]

In [9]:


df = pd.merge(reviews, biz, how='inner', on='business_id')
df.head()



Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,name,city,state,postal_code,review_count,is_open,categories
0,OPZsR2jCG72uoDNjU71DQQ,qYbWTWH5leltA0bzWAOnmA,meXjqyhTNLFmknY39y2sMg,5,2014-09-11,Solid beers -- Christmas Ale defines my holida...,1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv..."
1,fxGwEiSYDtAen8BNuVGGxg,8Az_JgEpXqAii_5EDkw2tw,meXjqyhTNLFmknY39y2sMg,3,2013-10-13,Meh. It was OK. A bartender the night before...,0,1,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv..."
2,Gweb4pADeQ26WnaiKEZ7GQ,T9tEic49JZjN4nCUcDvrRQ,meXjqyhTNLFmknY39y2sMg,4,2014-01-15,"Oh Christmas Ale, oh Christmas Ale, how lovely...",1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv..."
3,P1vhwPI56SeZEz10ywaS7w,W1p8_CFW5FISSihmQo5Qzw,meXjqyhTNLFmknY39y2sMg,3,2012-02-09,What is the big deal about this place? The foo...,2,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv..."
4,1kQvQlBX0V5_rGddBh9-rQ,Y_PP05RRdzbKRYfDCCfh8w,meXjqyhTNLFmknY39y2sMg,5,2017-04-30,Great Lakes Brewing Company is one of my favor...,0,0,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv..."


In [5]:
# removes punctation, stops words, and lemmatizes
def lemma(text):
    lem_text = []
    
    for token in text:
        if (not token.is_punct and 
            not token.is_stop and 
            not token.like_email and
            not token.like_url and
            token.lemma not in nlp.Defaults.stop_words): 
            lem_text.append(token.lemma_.lower())
    
    return " ".join(lem_text)

In [6]:
# i never have this installed
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

[38;5;2m[+] Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


## Super Slow Way

In [12]:
%%time

# let's create lemmatized sentences

df['lem_join'] = df.text.apply(lambda x: lemma(nlp(x)))

df.head()



Wall time: 21min 18s


Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,name,city,state,postal_code,review_count,is_open,categories,proc,lem_join_kd,lem_join
0,OPZsR2jCG72uoDNjU71DQQ,qYbWTWH5leltA0bzWAOnmA,meXjqyhTNLFmknY39y2sMg,5,2014-09-11,Solid beers -- Christmas Ale defines my holida...,1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Solid, beers, --, Christmas, Ale, defines, my...",solid beer christmas ale define holiday season...,solid beer christmas ale define holiday season...
1,fxGwEiSYDtAen8BNuVGGxg,8Az_JgEpXqAii_5EDkw2tw,meXjqyhTNLFmknY39y2sMg,3,2013-10-13,Meh. It was OK. A bartender the night before...,0,1,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Meh, ., , It, was, OK, ., , A, bartender, t...",meh ok bartender night tell cleveland eat ...,meh ok bartender night tell cleveland eat ...
2,Gweb4pADeQ26WnaiKEZ7GQ,T9tEic49JZjN4nCUcDvrRQ,meXjqyhTNLFmknY39y2sMg,4,2014-01-15,"Oh Christmas Ale, oh Christmas Ale, how lovely...",1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Oh, Christmas, Ale, ,, oh, Christmas, Ale, ,,...",oh christmas ale oh christmas ale lovely taste...,oh christmas ale oh christmas ale lovely taste...
3,P1vhwPI56SeZEz10ywaS7w,W1p8_CFW5FISSihmQo5Qzw,meXjqyhTNLFmknY39y2sMg,3,2012-02-09,What is the big deal about this place? The foo...,2,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(What, is, the, big, deal, about, this, place,...",big deal place food overprice beer,big deal place food overprice beer
4,1kQvQlBX0V5_rGddBh9-rQ,Y_PP05RRdzbKRYfDCCfh8w,meXjqyhTNLFmknY39y2sMg,5,2017-04-30,Great Lakes Brewing Company is one of my favor...,0,0,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Great, Lakes, Brewing, Company, is, one, of, ...",great lakes brewing company favorite place wor...,great lakes brewing company favorite place wor...


In [12]:
# we can't have lists in feather format
#df.drop(columns='categories', inplace=True)  

# create feather backup
df.reset_index(drop=True).to_feather('yelp_reviews.feather')

## super fast way

This is almost 5 times faster. My computer has 12 cores and 32 GB of ram. You may need to adjust `n_process` and `bach_size` so you can fit your computer

In [10]:
%%time
df['proc'] = list(nlp.pipe(iter(df.text), n_process=12, batch_size=1000))
df['lem_join_kd'] = df.proc.apply(lambda x: lemma(x))

# save it here

Wall time: 4min 36s


In [11]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,name,city,state,postal_code,review_count,is_open,categories,proc,lem_join_kd
0,OPZsR2jCG72uoDNjU71DQQ,qYbWTWH5leltA0bzWAOnmA,meXjqyhTNLFmknY39y2sMg,5,2014-09-11,Solid beers -- Christmas Ale defines my holida...,1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Solid, beers, --, Christmas, Ale, defines, my...",solid beer christmas ale define holiday season...
1,fxGwEiSYDtAen8BNuVGGxg,8Az_JgEpXqAii_5EDkw2tw,meXjqyhTNLFmknY39y2sMg,3,2013-10-13,Meh. It was OK. A bartender the night before...,0,1,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Meh, ., , It, was, OK, ., , A, bartender, t...",meh ok bartender night tell cleveland eat ...
2,Gweb4pADeQ26WnaiKEZ7GQ,T9tEic49JZjN4nCUcDvrRQ,meXjqyhTNLFmknY39y2sMg,4,2014-01-15,"Oh Christmas Ale, oh Christmas Ale, how lovely...",1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Oh, Christmas, Ale, ,, oh, Christmas, Ale, ,,...",oh christmas ale oh christmas ale lovely taste...
3,P1vhwPI56SeZEz10ywaS7w,W1p8_CFW5FISSihmQo5Qzw,meXjqyhTNLFmknY39y2sMg,3,2012-02-09,What is the big deal about this place? The foo...,2,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(What, is, the, big, deal, about, this, place,...",big deal place food overprice beer
4,1kQvQlBX0V5_rGddBh9-rQ,Y_PP05RRdzbKRYfDCCfh8w,meXjqyhTNLFmknY39y2sMg,5,2017-04-30,Great Lakes Brewing Company is one of my favor...,0,0,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...","(Great, Lakes, Brewing, Company, is, one, of, ...",great lakes brewing company favorite place wor...


## Stuck Notebook

I just copied and pasted a lot of this until the last few cells

In [14]:


df = pd.read_feather('yelp_reviews.feather')
df['target'] = df['stars'].apply(lambda x: 'positive' if x >= 4 else 'negative')



In [15]:


# declare X & Y
Y = df['target']
X = df['lem_join']

# let's stratify the data so we get a fair balance
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y
)



In [16]:


def BoW_generator(text):
    
    # get the top 2000 words
    n_top_words = 2000
    count_vec = CountVectorizer(max_features=n_top_words)
    mask = count_vec.fit_transform(text)
    
    # create a dataframe
    word_counts = pd.DataFrame(
        mask.toarray().reshape(-1, n_top_words), 
        columns=count_vec.get_feature_names()
    )  
    
    return word_counts



In [17]:


# get the top 2000 words
n_top_words = 2000
count_vec = CountVectorizer(max_features=n_top_words)

# apply to X values
X_train_bow = count_vec.fit_transform(X_train)
X_test_bow = count_vec.transform(X_test)



In [18]:
tfidf_vec = TfidfVectorizer(
    max_df=0.5, 
    min_df=2,
    stop_words='english', 
    lowercase=True,
    use_idf=True,
    norm=u'l2',
    smooth_idf=True
)

In [19]:


# applying to X values
X_train_tfidf=tfidf_vec.fit_transform(X_train)
X_test_tfidf=tfidf_vec.transform(X_test)



In [20]:
def model_metrics(model, X, y):
    model_sum = {}
    
    # get cross validation score & mean of CV score
    cv_score = cross_val_score(model, X, y, cv=10, n_jobs=-1)
    
    model_sum['cv_scores'] = list(cv_score)
    model_sum['cv_mean'] = np.mean(cv_score)
    
    # get confusion matrix metrics
    y_pred = model.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    
    model_sum['true negatives'] = tn
    model_sum['false positives'] = fp
    model_sum['false negatives'] = fn
    model_sum['true positives'] = tp
    model_sum['accuracy'] = (tp+tn)/len(y_pred)
    model_sum['f1 score'] = 2*tp/(2*tp+fp+fn)
    model_sum['class_report'] = classification_report(y, y_pred)
    
    return model_sum

In [21]:
def format_metrics(model_sum):
    # print cross-validation scores
    print("Cross Validation Scores\n" + 23*"=" + "\n{}\n".format(model_sum['cv_scores']))
    print("Average CV = {}\n".format(model_sum['cv_mean']))
    
    # print classification report
    print("Classification Report\n" + 21*"=" + "\n" + model_sum['class_report'])
    
    # print confusion matrix results
    print("Confusion Matrix\n" + 16*"=" + '\nTrue Positives = {}\nTrue Negatives'\
          '= {}\nFalse Positives = {}\nFalse Negatives = {}'\
          .format(model_sum['true positives'], 
                  model_sum['true negatives'], 
                  model_sum['false positives'], 
                  model_sum['false negatives']))

In [22]:
print(tf.__version__)
print(tf.executing_eagerly())

2.1.0
True


### Warning!!!!
You had version of 1.13 of TF. This is pretty old. You should be using version 2.x. 

I made an `environment.yml` file with my conda install. I ran this entire notebook on windows 10 with it. There were no installation issues.

This repo should have everything you need in it. Clone it and create an environment with my build. See instructions here:
https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file

#### TLDR: 
```
git clone git@github.com:kd2718/yelp_getting_started.git
cd yelp_getting_started
conda env create -f environment.yml -n yelp
```

In [31]:
# Modified your function. X is more generic
def create_keras(X):
    input_dim = X.shape[1]
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
                         optimizer='adam', 
                         metrics=['accuracy'], 
                  run_eagerly=True)
    
    #print(model.summary())
    return model

# partials will always call the function with the designated input
from functools import partial

bow_keras = partial(create_keras, X_train_bow)
tfidf_keras = partial(create_keras,X_train_tfidf)

In [35]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# just running 2 epochs for speed
keras_model1 = KerasClassifier(
    build_fn=bow_keras, 
    epochs=2, 
    batch_size=1000, 
    verbose=0)

cross_val_score(keras_model1, X_train_bow.toarray(), y_train.factorize()[0], cv=3)

array([0.8547231 , 0.8533774 , 0.84887892])

In [36]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
keras_model2 = KerasClassifier(
    build_fn=tfidf_keras, 
    epochs=2, 
    batch_size=1000, 
    verbose=0)

cross_val_score(keras_model1, X_train_bow.toarray(), y_train.factorize()[0], cv=3)

array([0.84038514, 0.84866804, 0.85696208])

In [14]:
from ernie import SentenceClassifier, Models

In [12]:
from transformers import TFAutoModel

In [13]:
!which python

/home/koryd/miniconda3/bin/python


In [27]:
classifier = SentenceClassifier(model_name=Models.BertBaseUncased, max_length=64, labels_no=2)

In [21]:
classifier.load_dataset(df.text, validation_split=0.2)

AttributeError: 'Series' object has no attribute 'columns'

In [73]:
df_polar = df[df.stars.isin( [1,5])][['text', 'stars']]

df_polar = df_polar.sample(1000)

In [53]:
from spacy.pipeline import SentenceSegmenter
sent = SentenceSegmenter('english')

In [179]:
import spacy
from spacy.lang.en import English

nlp = English()  # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
doc = nlp.pipe(df_polar.text, n_process=12, batch_size=100)
#col_list = ['sents', 'score']
#df_sents = pd.DataFrame(columns=col_list)
#df_polar['splits'] = list(doc)





#next(doc).sents
#for sent in doc.sents:
#    print(sent.text)

In [180]:
df_polar_sents = pd.DataFrame(columns=['sent', 'score', 'og_text'])
for dfp, dc in zip(df_polar.iterrows(), doc):

    for sent in dc.sents:
        strr = dfp[1].stars
        strr = 1 if strr == 5 else 0
        snt = sent.string
        df_temp = pd.Series({'sent': snt, 'score': strr, 'og_text':dfp[1].text })
        df_polar_sents = df_polar_sents.append(df_temp, ignore_index=True)
doc.close()

In [181]:
df_polar_sents

Unnamed: 0,sent,score,og_text
0,We visited the restaurant for my birthday on S...,0,We visited the restaurant for my birthday on S...
1,We were seated in the bar.,0,We visited the restaurant for my birthday on S...
2,Our server was not friendly or attentative du...,0,We visited the restaurant for my birthday on S...
3,"\nAs far as the food, some of our food came c...",0,We visited the restaurant for my birthday on S...
4,My food was inedible.,0,We visited the restaurant for my birthday on S...
...,...,...,...
7522,"\nJodi Rae, who owns and runs Daily Press is a...",1,I come here several times a month and always l...
7523,She is always friendly and happy to see us.,1,I come here several times a month and always l...
7524,I always enjoy seeing her cute little dog.,1,I come here several times a month and always l...
7525,\nThanks for opening up in our neighborhood Jo...,1,I come here several times a month and always l...


In [182]:
classifier.load_dataset(df_polar_sents[['sent', 'score']], validation_split=0.2)

In [183]:
classifier.fine_tune(epochs=4, learning_rate=2e-5, training_batch_size=32, validation_batch_size=64)

Train for 188 steps, validate for 23 steps
 30/188 [===>..........................] - ETA: 17:51 - loss: 0.5633 - accuracy: 0.7478

KeyboardInterrupt: 