# Install Packages

In [None]:
# pip install transformers
# pip install nltk

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.metrics import classification_report
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math



# Read Data

In [2]:
path_business = 'business_data.csv'
data_business = pd.read_csv(path_business)

In [3]:
## Drop unnecessary columns ##
data_business.drop(columns=['address', 'postal_code', 'stars', 'is_open', 'attributes', 'hours'], axis=1, inplace=True)

In [4]:
path_reviews = 'xaa0_cleaned.csv'
data_reviews = pd.read_csv(path_reviews)

# Create Labels 

In [5]:
## REMOVE 3.0 STAR REVIEWS ##
df_labeled = data_reviews[data_reviews.stars != 3.0].copy()

## Sentiment classification ##
## df.stars = 1.0 or 2.0 -> negative sentiment (label = 0)
## df.stars = 4.0 or 5.0 -> positive sentiment (label = 1)

df_labeled['sentiment'] = np.where(df_labeled['stars'] < 3.0, 0, 1)

## Shuffle Dataset ##
df_shuffled = shuffle(df_labeled, random_state = 42)

## Balance Dataset ##
df_negative = df_shuffled[df_shuffled.sentiment == 0][:100000]
df_positive = df_shuffled[df_shuffled.sentiment == 1][:100000]
df = pd.concat([df_negative, df_positive])

In [6]:
## Drop unnecessary columns ##
df.drop(df.columns[:2], axis=1, inplace=True)

In [7]:
df = pd.merge(df, data_business, left_on='business_id', right_on='business_id', how='left')

# Extract Revelant Rows

In [241]:
# Keep data that only contain these words (food)
def find_food(col):
    food_corpus = ['food','dish','burger','chicken','sauce',\
                'fry','taste','cheese','flavor','salad','love','try',\
                'sandwich','pizza', 'lunch', 'breakfast',\
                'dinner', 'delicious', 'hoagie', 'yummy']
    return any(w in col for w in food_corpus)


In [243]:
df_food = df[df['cleaned_no_stopwords'].apply(find_food)]

In [None]:
# Keep data that only contain these words (service)
def find_service(col):
    service_corpus = ['clean','location','staff','place',\
                    'well','friendly','service','order',\
                    'time', 'wait','come','customer', 'nice', \
                    'table', 'customer service',\
                    'waiter', 'waitress', 'he was', 'she was']
    return any(w in col for w in service_corpus)


In [None]:
df_service = df[df['cleaned_no_stopwords'].apply(find_service)]

# Split Dataset to Train and Test

### Food

In [244]:
def get_next_adj(pos_tag, index_target):
    for i, j in enumerate(pos_tag[index_target+1:]):
        if j in ['JJ', 'JJR', 'JJS', 'VBD', 'VBN', 'RB', 'RBR', 'RBS', 'NN', 'NNP']:
            return i+index_target+1

In [13]:
stop_words = set(stopwords.words('english'))

In [292]:
def get_food_sent(text):
    food_corpus = ['food','dish','burger','chicken','sauce',\
                'fry','taste','cheese','flavor','salad','love','try',\
                'sandwich','pizza', 'lunch', 'breakfast',\
                'dinner', 'delicious', 'hoagie', 'yummy']
    service_corpus = ['clean','location','staff','place',\
                    'well','friendly','service','order',\
                    'time', 'wait','come','customer', 'nice', \
                    'table', 'customer service',\
                    'waiter', 'waitress', 'he was', 'she was']
    s = ''
    for sentence in text.replace('!', '.').split("."):
        food = False
        service = False

        if any(w.lower() in sentence.lower() for w in food_corpus):
            food = True
        if any(r.lower() in sentence.lower() for r in service_corpus):
            service = True
        if food == True and service == False:
            s += sentence + '.'
        if food == True and service == True: # if contains both then do something 
            tokenized = sent_tokenize(sentence) # need to tokenize sentence that contains both 
            for i in tokenized:
                wordsList = nltk.word_tokenize(i)
                wordsList = [w for w in wordsList if not w in stop_words]
                tagged = nltk.pos_tag(wordsList)
            unzipped = list(zip(*tagged))
            for i in food_corpus:
                try: 
                    index_target = unzipped[0].index(i)
                    # if sentence starts with adjective # if food_words is last
                    # sometimes sentence will start with a differennt POS but follow immediately with an adjective
                    if index_target == 0: # if food_words is first
                        s += ' ' + ' '.join([unzipped[0][index_target], unzipped[0][index_target+1]]) + '.'
                    elif index_target == (len(unzipped[0])-1):
                        print()
                        s += ' ' + ' '.join([unzipped[0][index_target-1], unzipped[0][index_target]]) + '.'                    
                    elif unzipped[1][0] in ['JJ', 'JJR', 'JJS', 'VBD', 'VBN', 'RB', 'RBR', 'RBS', 'NN','NNP'] or unzipped[1][1] in ['JJ', 'JJR', 'JJS', 'VBD', 'VBN', 'RB', 'RBR', 'RBS', 'NN', 'NNP']: 
                        s += ' ' + ' '.join([unzipped[0][index_target-1], unzipped[0][index_target+1]]) + '.'
                    else: 
                        index_adj = get_next_adj(unzipped[1], index_target)
                        s += ' ' + ' '.join([unzipped[0][index_target], unzipped[0][index_adj]]) + '.'
                except: 
                    pass
    return s

In [85]:
# get_food_sent('We ordered breakfast two mornings -- it arrived quickly, was hot and the staff was very friendly.')

' ordered breakfast.'

In [None]:
df_food['food_sentences'] = df_food['text'].apply(get_food_sent)

In [247]:
df_food.reset_index(inplace=True)

In [248]:
# Due to BERT's long runtime, the train-test split will be different
dff_train_n = df_food[df_food.sentiment == 0][:4000]
dff_train_p = df_food[df_food.sentiment == 1][:4000]

dff_test_n = df_food[df_food.sentiment == 0][4000:]
dff_test_p = df_food[df_food.sentiment == 1][4000:]

dff_train = pd.concat([dff_train_n, dff_train_p])
dff_test = pd.concat([dff_test_n, dff_test_p])

### Service

In [312]:
def get_service_sent(text):
    food_corpus = ['food','dish','burger','chicken','sauce',\
                'fry','taste','cheese','flavor','salad','love','try',\
                'sandwich','pizza', 'lunch', 'breakfast',\
                'dinner', 'delicious', 'hoagie', 'yummy']
    service_corpus = ['clean','location','staff','place',\
                    'well','friendly','service','order',\
                    'time', 'wait','come','customer', 'nice', \
                    'table', 'customer service',\
                    'waiter', 'waitress', 'he was', 'she was']
    s = ''
    for sentence in text.replace('!', '.').split("."):
        food = False
        service = False

        if any(w.lower() in sentence.lower() for w in food_corpus):
            food = True
        if any(r.lower() in sentence.lower() for r in service_corpus):
            service = True
        if food == False and service == True:
            s += sentence + '.'
        if food == True and service == True: # if contains both then do something 
            tokenized = sent_tokenize(sentence) # need to tokenize sentence that contains both 
            for i in tokenized:
                wordsList = nltk.word_tokenize(i)
                wordsList = [w for w in wordsList if not w in stop_words]
                tagged = nltk.pos_tag(wordsList)
            unzipped = list(zip(*tagged))
            for i in service_corpus:
                try: 
                    index_target = unzipped[0].index(i)
                    # if sentence starts with adjective # if food_words is last
                    # sometimes sentence will start with a differennt POS but follow immediately with an adjective
                    if index_target == 0: # if food_words is first
                        s += ' ' + ' '.join([unzipped[0][index_target], unzipped[0][index_target+1]]) + '.'
                    elif index_target == (len(unzipped[0])-1):
                        s += ' ' + ' '.join([unzipped[0][index_target-1], unzipped[0][index_target]]) + '.'                    
                    elif unzipped[1][0] in ['JJ', 'JJR', 'JJS', 'VBD', 'VBN', 'RB', 'RBR', 'RBS', 'NN','NNP'] or unzipped[1][1] in ['JJ', 'JJR', 'JJS', 'VBD', 'VBN', 'RB', 'RBR', 'RBS', 'NN', 'NNP']: 
                        s += ' ' + ' '.join([unzipped[0][index_target], unzipped[0][index_target+1]]) + '.'
                    else: 
                        index_adj = get_next_adj(unzipped[1], index_target)
                        s += ' ' + ' '.join([unzipped[0][index_target], unzipped[0][index_adj]]) + '.'
                except: 
                    pass
    return s

In [95]:
df_service['service_sentences'] = df_service['text'].apply(get_service_sent)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_service['service_sentences'] = df_service['text'].apply(get_service_sent)


In [None]:
# text = 'When we sat down we had a great waiter but we were disappointed in the food.'

In [96]:
# Due to BERT's long runtime, the train-test split will be different
dfs_train_n = df_service[df_service.sentiment == 0][:5000]
dfs_train_p = df_service[df_service.sentiment == 1][:5000]

dfs_test_n = df_service[df_service.sentiment == 0][5000:]
dfs_test_p = df_service[df_service.sentiment == 1][5000:]

dfs_train = pd.concat([dfs_train_n, dfs_train_p])
dfs_test = pd.concat([dfs_test_n, dfs_test_p])

# BERT for food

In [249]:
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(dff_train['food_sentences'].values.tolist(), dff_train['sentiment'], test_size=.3, random_state = 0)


In [250]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [251]:
train_encodings = tokenizer(training_sentences, truncation=True, padding=True)
validation_encodings = tokenizer(validation_sentences, truncation=True, padding=True)


In [252]:
train_dataset = tf.data.Dataset.from_tensor_slices( (dict(train_encodings), training_labels) )
validation_dataset = tf.data.Dataset.from_tensor_slices( (dict(validation_encodings), validation_labels) )


In [253]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_215']
You should probably TRAIN this model on a down-stream task to be able to use 

In [254]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=1,
          batch_size=16,
          validation_data=validation_dataset.shuffle(100).batch(16))





<keras.callbacks.History at 0x7fafda87a280>

## Predictor for Food

In [198]:
def make_food_prediction(text):
    text = get_food_sent(text)
    p = tokenizer.encode(text,
        truncation=True,
        padding=True,
        return_tensors="tf")

    tf_output = model.predict(p)[0]
    tf_prediction = tf.nn.softmax(tf_output, axis=1)
    labels = ['Negative','Positive']
    label = tf.argmax(tf_prediction, axis=1)
    label = label.numpy()
    return labels[label[0]]

In [293]:
test_sentence = "burgers were gross but staff was nice"

In [294]:
make_food_prediction(test_sentence)



'Negative'

## Accuracy for Food

In [255]:
test_encodings = tokenizer(dff_test['food_sentences'].tolist(), truncation=True, padding=True)

In [257]:
test_dataset = tf.data.Dataset.from_tensor_slices( (dict(test_encodings), dff_test['sentiment'].tolist()) )


In [258]:
pred = model.predict(test_dataset)





   482/128878 [..............................] - ETA: 11:20:08

KeyboardInterrupt: 

In [162]:
dff_test['pred'] = tf.argmax(tf.nn.softmax(pred['logits'], axis=1), axis =1)

In [123]:
print(classification_report(dff_test['pred'], dff_test['sentiment']))


              precision    recall  f1-score   support

           0       0.90      0.76      0.83     69857
           1       0.76      0.90      0.83     59021

    accuracy                           0.83    128878
   macro avg       0.83      0.83      0.83    128878
weighted avg       0.84      0.83      0.83    128878



## Save the Model and Test

In [259]:
model.save_pretrained("food_model")

In [262]:
food_model = TFDistilBertForSequenceClassification.from_pretrained("food_model")

Some layers from the model checkpoint at food_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_215']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at food_model and are newly initialized: ['dropout_255']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [296]:
text = "the staff was really nice but the food was terrible"
text = get_food_sent(text)
p = tokenizer.encode(text,
    truncation=True,
    padding=True,
    return_tensors="tf")

tf_output = food_model.predict(p)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive']
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
labels[label[0]]



'Negative'

# BERT for Service

In [167]:
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(dfs_train['service_sentences'].values.tolist(), dfs_train['sentiment'], test_size=.3, random_state=0)


In [168]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [169]:
train_encodings = tokenizer(training_sentences, truncation=True, padding=True)
validation_encodings = tokenizer(validation_sentences, truncation=True, padding=True)

In [170]:
train_dataset = tf.data.Dataset.from_tensor_slices( (dict(train_encodings), training_labels) )

validation_dataset = tf.data.Dataset.from_tensor_slices( (dict(validation_encodings), validation_labels) )

In [171]:
model_2 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use i

In [172]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model_2.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model_2.fit(train_dataset.shuffle(100).batch(16),
          epochs=1,
          batch_size=16,
          validation_data=validation_dataset.shuffle(100).batch(16))





<keras.callbacks.History at 0x7fad7ef229d0>

## Predictor for Service

In [221]:
def make_service_prediction(text):
    text = get_service_sent(text)
    p = tokenizer.encode(text,
        truncation=True,
        padding=True,
        return_tensors="tf")

    tf_output = model_2.predict(p)[0]
    tf_prediction = tf.nn.softmax(tf_output, axis=1)
    labels = ['Negative','Positive']
    label = tf.argmax(tf_prediction, axis=1)
    label = label.numpy()
    return labels[label[0]]

In [310]:
test_sentence = "the staff was really nice but the food was terrible"

In [313]:
make_service_prediction(test_sentence)



'Positive'

# Accuracy for Service

In [187]:
test_encodings = tokenizer(dfs_test['service_sentences'].tolist(), truncation=True, padding=True)

In [188]:
test_dataset = tf.data.Dataset.from_tensor_slices( (dict(test_encodings), dfs_test['sentiment'].tolist()) )

In [192]:
pred = model.predict(test_dataset)



In [193]:
dfs_test['pred'] = tf.argmax(tf.nn.softmax(pred['logits'], axis=1), axis=1)

In [194]:
print(classification_report(dfs_test['pred'], dfs_test['sentiment']))


              precision    recall  f1-score   support

           0       0.93      0.75      0.83    103178
           1       0.68      0.90      0.78     61529

    accuracy                           0.81    164707
   macro avg       0.80      0.82      0.80    164707
weighted avg       0.83      0.81      0.81    164707



## Save the Model and Test

In [239]:
model_2.save_pretrained("service_model")

In [261]:
service_model = TFDistilBertForSequenceClassification.from_pretrained("service_model")

Some layers from the model checkpoint at service_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_59']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at service_model and are newly initialized: ['dropout_235']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [314]:
text = 'the staff was really nice but the food was terrible'
text = get_service_sent(text)
p = tokenizer.encode(text,
    truncation=True,
    padding=True,
    return_tensors="tf")

tf_output = service_model.predict(p)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive']
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
labels[label[0]]



'Positive'