In [19]:
#make compatible with Python 2 and Python 3
from __future__ import print_function, division, absolute_import

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# regular expressions, text parsing, and ML classifiers
import re
import nltk
import bs4 as bs
import numpy as np
import pandas as pd
 

# download NLTK classifiers
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator

eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Process data

In [21]:
reviews_data = pd.read_csv('austin_review.csv')
austin_rest = pd.read_csv('austin_rest.csv')

In [22]:
## takes in a resturant name (user inputs), or cusine type
## match the Business id from resturant df to review df, get all reviews, and return a filtered review df

def filter_reviews(rest_name, cusine=None, rest_df=austin_rest, rev_df=reviews_data):
    rest_id = austin_rest.loc[austin_rest.name == rest_name].business_id.values[0]
    train = rev_df.loc[reviews_data.business_id == rest_id]
    return train

In [23]:
train = filter_reviews("Franklin Barbecue")

## Preparing data for classification

In [24]:
#from google.colab import drive
#drive.mount('/content/drive')

In [25]:
#from google.colab import drive
#drive.mount('/content/drive')

In [26]:
def review_cleaner(review, lemmatize=True, stem=False):
    '''
        Clean and preprocess a review.
            1. Remove HTML tags
            2. Extract emoticons
            3. Use regex to remove all special characters (only keep letters)
            4. Make strings to lower case and tokenize / word split reviews
            5. Remove English stopwords
            6. Lemmatize
            7. Rejoin to one string
        
        @review (type:str) is an unprocessed review string
        @return (type:str) is a 6-step preprocessed review string
    '''
    
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    cleaned_reviews=[]
    for i,review in enumerate(train['text']):
        # batching step notification
        if( (i+1)%1000 == 0 ):
            print("Done with %d reviews" %(i+1))
        
        
        #1. Remove HTML tags
        review = bs.BeautifulSoup(review).text    

        #2. Use regex to find emoticons
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', review)

        #3. Remove punctuation
        review = re.sub("[^a-zA-Z]", " ",review)

        #4. Tokenize into words (all lower case)
        review = review.lower().split()

        #5. Remove stopwords
        eng_stopwords = set(stopwords.words("english"))
        
        #6. Lemmatize 
        clean_review=[]
        for word in review:
            if word not in eng_stopwords:
                if lemmatize is True:
                    word=wnl.lemmatize(word)
                elif stem is True:
                    if word == 'oed':
                        continue
                    word=ps.stem(word)
                clean_review.append(word)

        #7. Join the review to one sentence
        review_processed = ' '.join(clean_review+emoticons)
        cleaned_reviews.append(review_processed)
    

    return(cleaned_reviews)

## Train and validate sentiment analysis model using Random Forest Classifier (RFC)

In [27]:
from sklearn import metrics                          # evaluating model
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#CountVectorizer can actucally handle a lot of the preprocessing for us
from sklearn.feature_extraction.text import CountVectorizer

# seed
np.random.seed(0)

In [28]:
def train_predict_sentiment(cleaned_reviews, y=train["stars"], ngram=1, max_features=1000):
    '''
        This function will:
            1. split data into train and test set.
            2. get n-gram counts from cleaned reviews 
            3. train a random forest model using train n-gram counts and y (labels)
            4. test the model on your test split
            5. print accuracy of sentiment prediction on test and training data
            6. print confusion matrix on test data results

            To change n-gram type, set value of ngram argument
            To change the number of features you want the countvectorizer to generate, set the value of max_features argument
            
            @cleaned_review (type:str) is preprocessed string from review_cleaner()
            @return none
    '''

    print("Creating the bag of words model!\n")
    # CountVectorizer" is scikit-learn's bag of words tool, here we show more keywords 
    vectorizer = CountVectorizer(ngram_range=(1, ngram),
                                 analyzer = "word",   
                                 tokenizer = None,    
                                 preprocessor = None, 
                                 stop_words = None,   
                                 max_features = max_features) 
    
    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(cleaned_reviews, y, random_state=0, test_size=.2)

    # Then we use fit_transform() to fit the model / learn the vocabulary,
    # then transform the data into feature vectors.
    # The input should be a list of strings. .toarraty() converts to a numpy array
    
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees
    forest = RandomForestClassifier(n_estimators = 50) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the target variable
    forest = forest.fit(train_bag, y_train)

    # predict
    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)
    
    # validation
    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)
    
    print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    print()
    print('CONFUSION MATRIX:')
    print('         Predicted')
    print('          neg pos')
    print(' Actual')
    c=confusion_matrix(y_test, test_predictions)
    print('     neg  ',c[0])
    print('     pos  ',c[1])

    #Extract feature importance
    print('\nTOP TEN IMPORTANT FEATURES:')
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    top_10 = indices[:20]
    print([vectorizer.get_feature_names()[ind] for ind in top_10])

## Train and test  Model

<br>

**Preprocess data**

In [29]:
# Clean the reviews in the training set 'train' using review_cleaner function defined above
# Here we use the original reviews without lemmatizing and stemming
original_clean_reviews_lemmatize = review_cleaner(train['text'], lemmatize=True, stem=False)

Done with 1000 reviews
Done with 2000 reviews
Done with 3000 reviews
Done with 4000 reviews
Done with 5000 reviews


<br>

**Train RFC**

In [30]:
train_predict_sentiment(cleaned_reviews=original_clean_reviews_lemmatize, y=train["stars"], ngram=2, max_features=1000)

Creating the bag of words model!

Training the random forest classifier!

 The training accuracy is:  0.9995069033530573 
 The validation accuracy is:  0.7438423645320197

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [ 1  0  0  0 23]
     pos   [ 0  0  1  1 27]

TOP TEN IMPORTANT FEATURES:
['brisket', 'good', 'better', 'line', 'best', 'wait', 'bbq', 'ever', 'hour', 'star', 'franklin', 'food', 'rib', 'long', 'place', 'meat', 'would', 'get', 'time', 'pretty good']


In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from math import radians, cos, sin, asin, sqrt
#have to pip install geopandas in terminal before importing 
#have to pip install haversine in terminal before importing
!pip install haversine 
from haversine import haversine, Unit



In [32]:
### creating closest restaurants algorithm
#will be using data cleaned from before, open_restaurants
restaurants = pd.read_csv('open_restaurants.csv')
restaurants = restaurants.drop('is_open', axis = 1)
restaurants.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,name,latitude,longitude,stars,review_count,attributes,categories,hours
0,1,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,30.234533,-97.877262,4.5,395,"{'Ambience': ""{'touristy': False, 'hipster': F...","Bars, Nightlife, Cocktail Bars, Seafood, Resta...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ..."
1,2,bRsDZ44CD3uhGnRY3NeQhQ,Wendy's,30.441875,-97.746581,2.0,46,"{'RestaurantsPriceRange2': '1', 'OutdoorSeatin...","Fast Food, Restaurants, Burgers","{'Monday': '6:30-1:0', 'Tuesday': '6:30-1:0', ..."
2,5,Pk4ZwXwUU50BDn5gqw_rKg,Johnny Carino's,30.162081,-97.789132,3.0,136,"{'RestaurantsGoodForGroups': 'True', 'Business...","Italian, Salad, Pizza, Nightlife, Restaurants,...","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."
3,6,Ieelu69Y23nbjKG3OGfwnw,McDonald's,30.232133,-97.823183,1.5,9,"{'RestaurantsTakeOut': 'True', 'RestaurantsRes...","Restaurants, Coffee & Tea, Food, Burgers, Fast...","{'Monday': '7:0-22:30', 'Tuesday': '7:0-22:30'..."
4,7,IFB2K3BEZ2L_Mv5AbUD26Q,Chispas,30.266996,-97.745362,3.5,119,"{'RestaurantsTakeOut': 'True', 'OutdoorSeating...","Tex-Mex, Mexican, Tacos, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


In [33]:
#given a restaurant, want to create a list of restaurants within a given mile radius
#or rather a list of 10 closest restaurants
# will use imported haversine formula in form haversine((lat1, long1), (lat2, long2))
practice = 'Pk4ZwXwUU50BDn5gqw_rKg'
lat1 = restaurants[restaurants['business_id']==practice]['latitude']
long1 = restaurants[restaurants['business_id']==practice]['longitude']

coord1 = (lat1, long1)

practice2 = 'fTgnVCCu6k_Ds25Nz73s5Q'
lat2 = restaurants[restaurants['business_id']==practice2]['latitude']
long2 = restaurants[restaurants['business_id']==practice2]['longitude']

coord2 = (lat2, long2)
haversine(coord1, coord2, unit = 'mi')

7.882687089753406

In [34]:
# given business_id of restaurant return closest restaurants
# takes in business ID and integer of restaurants selected (5 or 10)
def closest_restaurants(business_ID, int seleted):
    counter = 0
    restaurant_list = []
    lat1 = restaurants[restaurants['business_id'] == business_ID]['latitude']
    long1 = restaurants[restaurants['business_id']== business_ID]['longitude']
    business_coord = (lat1, long1)
    remaining_IDs = list(restaurants[restaurants['business_id'] != business_ID]['business_id'])
    for ID in remaining_IDs:
        lat2 = restaurants[restaurants['business_id'] == ID]['latitude']
        long2 = restaurants[restaurants['business_id']== ID]['longitude']
        i_coord = (lat2, long2)
        #unit = 'mi' ensures distance is given in miles
        dist = haversine(business_coord, i_coord, unit = 'mi')
        if counter < selected:
            restaurant_list.append(ID)
        elif counter > selected:
            for i in range(0, selected):
                lat3 = restaurants[restaurants['business_id'] == restaurant_list[i]]['latitude']
                long3 = restaurants[restaurants['business_id']== restaurant_list[i]]['longitude']
                i_coord2 = (lat3, long3)
                i_dist = haversine(business_coord, i_coord2, unit = 'mi')
                if dist < i_dist:
                    restaurant_list[i] = ID
                    break
        counter += 1
    return restaurant_list

In [35]:
pip install anvil-uplink

Collecting argparse
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0


In [36]:
import anvil.server

anvil.server.connect("YIXTXTLMQSSUKJTGS4UCCS7G-NXLX35O2PI46VEKK")

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


In [37]:
lister = closest_restaurants('IFB2K3BEZ2L_Mv5AbUD26Q')
restaurants[restaurants['business_id'].isin(lister)]

Unnamed: 0.1,Unnamed: 0,business_id,name,latitude,longitude,stars,review_count,attributes,categories,hours
480,828,LE46JAgAQ6zLsotQ17i8Mg,Fourth & CO,30.26698,-97.745256,4.0,17,"{'BikeParking': 'True', 'WheelchairAccessible'...","Cocktail Bars, Nightlife, Chicken Wings, Resta...","{'Monday': '16:0-2:0', 'Tuesday': '16:0-2:0', ..."
2002,3487,4-RkxDM200qfSG756uVmtQ,Péché,30.26698,-97.745256,4.0,1135,"{'GoodForKids': 'False', 'RestaurantsGoodForGr...","Nightlife, Lounges, Cocktail Bars, Restaurants...","{'Monday': '17:0-2:0', 'Tuesday': '17:0-2:0', ..."
2285,3981,GZfHsTQAabxdXB_z30h6kw,DeSano Pizzeria Napoletana,30.26616,-97.745872,4.5,124,"{'BusinessAcceptsCreditCards': 'True', 'DogsAl...","Italian, Pizza, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '11:30-22:0',..."
2401,4185,9al-eBiJVbuhuFzzuQpiSQ,Gloria's Latin Cuisine,30.268909,-97.745333,3.5,348,"{'BusinessParking': ""{'garage': True, 'street'...","Mexican, Latin American, Salvadoran, Tex-Mex, ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
2499,4374,pCIdgI3QYpzgrtLAmDemuA,Truluck’s Ocean’s Finest Seafood and Crab,30.266884,-97.74498,4.5,843,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Buffets, Seafood, Steakhouses","{'Monday': '0:0-0:0', 'Tuesday': '16:30-21:0',..."
2575,4497,NL_Tg1VV5d9iztDqBrw5jQ,Jessica's Original Cheesesteak,30.267197,-97.744789,2.5,14,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sandwiches, Restaurants, Food, Street Vendors","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
2608,4547,74ZOTBit9H9Z_YAP7YaE1Q,Manuel's,30.265578,-97.743707,3.5,568,"{'GoodForKids': 'True', 'RestaurantsGoodForGro...","Restaurants, Mexican, Breakfast & Brunch","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
2620,4568,c-6SPBhWeA0xFrbnM0sjJg,Chi'Lantro,30.26583,-97.744641,4.5,6,"{'RestaurantsTakeOut': 'True', 'RestaurantsDel...","Korean, Barbeque, Restaurants","{'Monday': '7:0-15:0', 'Tuesday': '7:0-15:0', ..."
2735,4774,2vppZx0rTDZtCzw-NljdRQ,The Capital Grille,30.266248,-97.744589,4.0,338,"{'Ambience': ""{'romantic': False, 'intimate': ...","Wine Bars, Bars, Nightlife, American (Traditio...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
2768,4844,Fjt54fSz56goWef5n42HWA,Shiner's Saloon,30.267036,-97.743559,4.0,176,"{'RestaurantsGoodForGroups': 'True', 'NoiseLev...","Restaurants, American (Traditional), Nightlife...","{'Monday': '16:0-2:0', 'Tuesday': '16:0-2:0', ..."


In [38]:
s = pd.Series(restaurants[restaurants['business_id'].isin(lister)]['name'])
for value in s.items():
  print(value[1])

Fourth & CO
Péché
DeSano Pizzeria Napoletana
Gloria's Latin Cuisine
Truluck’s Ocean’s Finest Seafood and Crab
Jessica's Original Cheesesteak
Manuel's
Chi'Lantro
The Capital Grille
Shiner's Saloon


In [40]:
@anvil.server.callable
def find_competitors(text):
  selected_id = restaurants[restaurants['name'] == text]['business_id'].item()
  closest_ids = closest_restaurants(selected_id, 10)
  competitor_series = restaurants[restaurants['business_id'].isin(closest_ids)]['name']
  s = pd.Series(competitor_series)
  lister = []
  for value in s.items():
    lister = np.append(lister, value[1])

  return lister
    


In [39]:
s