In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from collections import Counter
from IPython.display import display_html
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
sns.set(style="whitegrid")

# supervised learning models
from sklearn import ensemble
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# deep neutral network
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# enable eager execution for Keras
tf.compat.v1.enable_eager_execution(
    config=None, device_policy=None, execution_mode=None
)

# text manipulation tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
import spacy
from wordcloud import WordCloud

In [2]:
df = pd.read_feather('yelp_reviews.feather')
df['target'] = df['stars'].apply(lambda x: 'positive' if x >= 4 else 'negative')

## Train-test split the data

In [3]:
# declare X & Y
Y = df['target']
X = df['lem_join']

# let's stratify the data so we get a fair balance
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y
)

## Prep BoW

In [4]:
def BoW_generator(text):
    
    # get the top 2000 words
    n_top_words = 2000
    count_vec = CountVectorizer(max_features=n_top_words)
    mask = count_vec.fit_transform(text)
    
    # create a dataframe
    word_counts = pd.DataFrame(
        mask.toarray().reshape(-1, n_top_words), 
        columns=count_vec.get_feature_names()
    )  
    
    return word_counts

In [5]:
# get the top 2000 words
n_top_words = 2000
count_vec = CountVectorizer(max_features=n_top_words)

# apply to X values
X_train_bow = count_vec.fit_transform(X_train)
X_test_bow = count_vec.transform(X_test)

## Prep Tf-Idf

In [6]:
tfidf_vec = TfidfVectorizer(
    max_df=0.5, 
    min_df=2,
    stop_words='english', 
    lowercase=True,
    use_idf=True,
    norm=u'l2',
    smooth_idf=True
)

In [7]:
# applying to X values
X_train_tfidf=tfidf_vec.fit_transform(X_train)
X_test_tfidf=tfidf_vec.transform(X_test)

# Phase 1: Sentiment Analysis of Yelp Reviews

In [8]:
def model_metrics(model, X, y):
    model_sum = {}
    
    # get cross validation score & mean of CV score
    cv_score = cross_val_score(model, X, y, cv=10, n_jobs=-1)
    
    model_sum['cv_scores'] = list(cv_score)
    model_sum['cv_mean'] = np.mean(cv_score)
    
    # get confusion matrix metrics
    y_pred = model.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    
    model_sum['true negatives'] = tn
    model_sum['false positives'] = fp
    model_sum['false negatives'] = fn
    model_sum['true positives'] = tp
    model_sum['accuracy'] = (tp+tn)/len(y_pred)
    model_sum['f1 score'] = 2*tp/(2*tp+fp+fn)
    model_sum['class_report'] = classification_report(y, y_pred)
    
    return model_sum

In [9]:
def format_metrics(model_sum):
    # print cross-validation scores
    print("Cross Validation Scores\n" + 23*"=" + "\n{}\n".format(model_sum['cv_scores']))
    print("Average CV = {}\n".format(model_sum['cv_mean']))
    
    # print classification report
    print("Classification Report\n" + 21*"=" + "\n" + model_sum['class_report'])
    
    # print confusion matrix results
    print("Confusion Matrix\n" + 16*"=" + '\nTrue Positives = {}\nTrue Negatives'\
          '= {}\nFalse Positives = {}\nFalse Negatives = {}'\
          .format(model_sum['true positives'], 
                  model_sum['true negatives'], 
                  model_sum['false positives'], 
                  model_sum['false negatives']))

## LSTM in Keras

In [10]:
print(tf.__version__)
print(tf.executing_eagerly())

1.13.2
True


In [11]:
def create_keras(X_train):
    input_dim = X_train_bow.shape[1]
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
                         optimizer='adam', 
                         metrics=['accuracy'], 
                  run_eagerly=True)
    
    print(model.summary())
    return model

In [12]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

keras_model1 = KerasClassifier(
    build_fn=create_keras(X_train_bow), 
    epochs=10, 
    batch_size=10, 
    verbose=0)

cross_val_score(keras_model1, X_train_bow, y_train.factorize()[0], cv=3)

Instructions for updating:
Colocations handled automatically by placer.


ValueError: When running a model in eager execution, the optimizer must be an instance of tf.train.Optimizer. Received: <tensorflow.python.keras.optimizers.Adam object at 0x0000025C8082EB88>