In [1]:
#make compatible with Python 2 and Python 3
from __future__ import print_function, division, absolute_import

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# regular expressions, text parsing, and ML classifiers
import re
import nltk
import bs4 as bs
import numpy as np
import pandas as pd
 

# download NLTK classifiers
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator

eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lennonzheng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lennonzheng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lennonzheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lennonzheng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Process data

In [3]:
reviews_data = pd.read_csv("/Users/lennonzheng/Downloads/DataX/Project/yelp_dataset/austin_only/austin_review.csv")
austin_rest = pd.read_csv("/Users/lennonzheng/Downloads/DataX/Project/yelp_dataset/austin_only/austin_rest.csv")

In [30]:
## takes in a resturant name (user inputs), or cusine type
## match the Business id from resturant df to review df, get all reviews, and return a filtered review df

def filter_reviews(rest_name, cusine=None, rest_df=austin_rest, rev_df=reviews_data):
    rest_id = austin_rest.loc[austin_rest.name == rest_name].business_id.values[0]
    train = rev_df.loc[reviews_data.business_id == rest_id]
    return train

In [56]:
train = filter_reviews("Franklin Barbecue")

## Preparing data for classification

In [57]:
def review_cleaner(review, lemmatize=True, stem=False):
    '''
        Clean and preprocess a review.
            1. Remove HTML tags
            2. Extract emoticons
            3. Use regex to remove all special characters (only keep letters)
            4. Make strings to lower case and tokenize / word split reviews
            5. Remove English stopwords
            6. Lemmatize
            7. Rejoin to one string
        
        @review (type:str) is an unprocessed review string
        @return (type:str) is a 6-step preprocessed review string
    '''
    
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    cleaned_reviews=[]
    for i,review in enumerate(train['text']):
        # batching step notification
        if( (i+1)%1000 == 0 ):
            print("Done with %d reviews" %(i+1))
        
        
        #1. Remove HTML tags
        review = bs.BeautifulSoup(review).text    

        #2. Use regex to find emoticons
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', review)

        #3. Remove punctuation
        review = re.sub("[^a-zA-Z]", " ",review)

        #4. Tokenize into words (all lower case)
        review = review.lower().split()

        #5. Remove stopwords
        eng_stopwords = set(stopwords.words("english"))
        
        #6. Lemmatize 
        clean_review=[]
        for word in review:
            if word not in eng_stopwords:
                if lemmatize is True:
                    word=wnl.lemmatize(word)
                elif stem is True:
                    if word == 'oed':
                        continue
                    word=ps.stem(word)
                clean_review.append(word)

        #7. Join the review to one sentence
        review_processed = ' '.join(clean_review+emoticons)
        cleaned_reviews.append(review_processed)
    

    return(cleaned_reviews)

## Train and validate sentiment analysis model using Random Forest Classifier (RFC)

In [58]:
from sklearn import metrics                          # evaluating model
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#CountVectorizer can actucally handle a lot of the preprocessing for us
from sklearn.feature_extraction.text import CountVectorizer

# seed
np.random.seed(0)

In [59]:
def train_predict_sentiment(cleaned_reviews, y=train["stars"], ngram=1, max_features=1000):
    '''
        This function will:
            1. split data into train and test set.
            2. get n-gram counts from cleaned reviews 
            3. train a random forest model using train n-gram counts and y (labels)
            4. test the model on your test split
            5. print accuracy of sentiment prediction on test and training data
            6. print confusion matrix on test data results

            To change n-gram type, set value of ngram argument
            To change the number of features you want the countvectorizer to generate, set the value of max_features argument
            
            @cleaned_review (type:str) is preprocessed string from review_cleaner()
            @return none
    '''

    print("Creating the bag of words model!\n")
    # CountVectorizer" is scikit-learn's bag of words tool, here we show more keywords 
    vectorizer = CountVectorizer(ngram_range=(1, ngram),
                                 analyzer = "word",   
                                 tokenizer = None,    
                                 preprocessor = None, 
                                 stop_words = None,   
                                 max_features = max_features) 
    
    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(cleaned_reviews, y, random_state=0, test_size=.2)

    # Then we use fit_transform() to fit the model / learn the vocabulary,
    # then transform the data into feature vectors.
    # The input should be a list of strings. .toarraty() converts to a numpy array
    
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees
    forest = RandomForestClassifier(n_estimators = 50) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the target variable
    forest = forest.fit(train_bag, y_train)

    # predict
    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)
    
    # validation
    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)
    
    print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    print()
    print('CONFUSION MATRIX:')
    print('         Predicted')
    print('          neg pos')
    print(' Actual')
    c=confusion_matrix(y_test, test_predictions)
    print('     neg  ',c[0])
    print('     pos  ',c[1])

    #Extract feature importance
    print('\nTOP TEN IMPORTANT FEATURES:')
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    top_10 = indices[:20]
    print([vectorizer.get_feature_names()[ind] for ind in top_10])

## Train and test  Model

<br>

**Preprocess data**

In [60]:
# Clean the reviews in the training set 'train' using review_cleaner function defined above
# Here we use the original reviews without lemmatizing and stemming
original_clean_reviews_lemmatize = review_cleaner(train['text'], lemmatize=True, stem=False)

Done with 1000 reviews
Done with 2000 reviews
Done with 3000 reviews
Done with 4000 reviews
Done with 5000 reviews


<br>

**Train RFC**

In [61]:
train_predict_sentiment(cleaned_reviews=original_clean_reviews_lemmatize, y=train["stars"], ngram=2, max_features=1000)

Creating the bag of words model!

Training the random forest classifier!

 The training accuracy is:  0.9995069033530573 
 The validation accuracy is:  0.7438423645320197

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [ 1  0  0  0 23]
     pos   [ 0  0  1  1 27]

TOP TEN IMPORTANT FEATURES:
['brisket', 'good', 'better', 'line', 'best', 'wait', 'bbq', 'ever', 'hour', 'star', 'franklin', 'food', 'rib', 'long', 'place', 'meat', 'would', 'get', 'time', 'pretty good']
