In [1]:
import pandas as pd
import numpy as np
import gc
import seaborn as sns
#from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
#from tqdm import tqdm as tqdmflat
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import torch

import spacy
from spacy.lang.en import English
import os

from sklearn.model_selection import train_test_split, StratifiedKFold

from simpletransformers.classification import ClassificationModel

import logging


In [2]:
PATH = 'data/'
biz=pd.read_csv(os.path.join(PATH,"yelp_business.csv"))
reviews=pd.read_csv(os.path.join(PATH,"yelp_review.csv"))

In [3]:

col = ['neighborhood', 'address', 'latitude', 'longitude', 'stars']
biz.drop(columns=col, inplace=True)    

biz.categories = biz.categories.apply(lambda x: x.split(";"))

biz = biz[
    (biz.city == 'Cleveland') & 
    (biz.state == 'OH') & 
    (biz.is_open == 1) & 
    (biz.categories.apply(
        lambda x: True if 'Restaurants' in x else False
    ))]



In [4]:
df = pd.merge(reviews, biz, how='inner', on='business_id')

In [5]:
df['target'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

In [6]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,name,city,state,postal_code,review_count,is_open,categories,target
0,OPZsR2jCG72uoDNjU71DQQ,qYbWTWH5leltA0bzWAOnmA,meXjqyhTNLFmknY39y2sMg,5,2014-09-11,Solid beers -- Christmas Ale defines my holida...,1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...",1
1,fxGwEiSYDtAen8BNuVGGxg,8Az_JgEpXqAii_5EDkw2tw,meXjqyhTNLFmknY39y2sMg,3,2013-10-13,Meh. It was OK. A bartender the night before...,0,1,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...",0
2,Gweb4pADeQ26WnaiKEZ7GQ,T9tEic49JZjN4nCUcDvrRQ,meXjqyhTNLFmknY39y2sMg,4,2014-01-15,"Oh Christmas Ale, oh Christmas Ale, how lovely...",1,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...",1
3,P1vhwPI56SeZEz10ywaS7w,W1p8_CFW5FISSihmQo5Qzw,meXjqyhTNLFmknY39y2sMg,3,2012-02-09,What is the big deal about this place? The foo...,2,1,1,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...",0
4,1kQvQlBX0V5_rGddBh9-rQ,Y_PP05RRdzbKRYfDCCfh8w,meXjqyhTNLFmknY39y2sMg,5,2017-04-30,Great Lakes Brewing Company is one of my favor...,0,0,0,"""Great Lakes Brewing Company""",Cleveland,OH,44113,751,1,"[Breweries, Restaurants, Event Planning & Serv...",1


In [7]:
# Declare X & Y
Y = df['target']
X = df['text']

# Let's stratify the data so we get a fair balance
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y, shuffle=True
)

X_trn, X_val, y_trn, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=2718, stratify=y_train, shuffle=True
)


In [86]:
# Set up parameters
params = {
    "fp16":False, 
    "use_early_stopping": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "mcc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 5,
    #"evaluate_during_training_steps": 1000, 
    #"max_seq_length": 30,
    "learning rate": 3e-5,
    #"train_batch_size": 8,
    #"eval_batch_size": 8, 
    #"silent": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 3
}

In [47]:
# Set up variables for kfold loop
n_splits = 10
kfold = StratifiedKFold(
    n_splits=n_splits, 
    shuffle=True, 
    random_state=42
    )

BERT_predict = np.zeros(X_train.shape[0])
BERT_proba = np.zeros(X_train.shape[0])
cv_score = []
fold = 0

In [None]:
%%time
# This step is memory intensive
gc.collect()

BERT_predict = np.zeros(X_train.shape[0])
BERT_proba = np.zeros(X_train.shape[0])
cv_score = []
fold = 0
fold = 0
# Begin kfold loop
for train_index, val_index in tqdm(kfold.split(X_train, y_train), total=n_splits):
  # Set up model
  BERT_model = ClassificationModel(
      #"distilbert", "distilbert-base-uncased", 
      "electra", "google/electra-small-discriminator",
      num_labels=2, args=params
      )

  # Create training set
  train_df = pd.merge(
      X_train.iloc[train_index], 
      y_train.iloc[train_index], 
      left_index=True, 
      right_index=True
      )

    
  # Create validation set
  val_df = pd.merge(
      X_train.iloc[val_index], 
      y_train.iloc[val_index], 
      left_index=True, 
      right_index=True
      )
  
  # Fit model
  BERT_model.train_model(train_df)
 
  # Calculate score
  result, model_outputs, wrong_predictions = BERT_model.eval_model(
      val_df, 
      acc=accuracy_score
      )
  
  # Calculate Probability
  pred, proba = BERT_model.predict(list(X_train))
  
  # Clear data to save memory
  del BERT_model
  gc.collect()

  # Add to Lists
  cv_score.append(result['acc'])
  BERT_predict += pred
  BERT_proba += proba[:,1]

  # Present Counter
  fold += 1
  print(f'\nFold {fold} out of {n_splits} complete!\n')
  
print('Evaluation Complete\n')

In [12]:
import torch
del BERT_model
gc.collect()
torch.cuda.empty_cache()

In [8]:
# Set up parameters
params = {
    "fp16":False, 
    "use_early_stopping": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "mcc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 5,
    #"evaluate_during_training_steps": 1000, 
    #"max_seq_length": 30,
    "learning rate": 3e-5,
    "train_batch_size": 50,
    "eval_batch_size": 50, 
    #"silent": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 5
}

In [9]:
gc.collect()

BERT_predict = np.zeros(X_train.shape[0])
BERT_proba = np.zeros(X_train.shape[0])
cv_score = []
fold = 0
fold = 0

BERT_model = ClassificationModel(
  "distilbert", "distilbert-base-uncased", 
  #"electra", "google/electra-small-discriminator",
  num_labels=2, args=params
  )

# Create training set
train_df = pd.merge(
  X_trn, 
  y_trn, 
  left_index=True, 
  right_index=True
  )


# Create validation set
val_df = pd.merge(
  X_val, 
  y_val, 
  left_index=True, 
  right_index=True
  )

test_df = pd.merge(
    X_test,
    y_test,
    left_index=True,
    right_index=True
)

train_df.columns = ['text', 'labels']
val_df.columns = ['text', 'labels']
test_df.columns = ['text', 'labels']

# Fit model
BERT_model.train_model(train_df, eval_df=val_df)

# Calculate score
result, model_outputs, wrong_predictions = BERT_model.eval_model(
  test_df, 
  acc=accuracy_score
  )

# Calculate Probability
pred, proba = BERT_model.predict(list(X_test))

# Clear data to save memory
#del BERT_model
gc.collect()
torch.cuda.empty_cache()

# Add to Lists
cv_score.append(result['acc'])
BERT_predict = pred
BERT_proba = proba[:,1]
print('done')

HBox(children=(FloatProgress(value=0.0, max=38413.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=4802.0, style=ProgressStyle(descr…

Running loss: 0.694625Running loss: 0.685928

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


Running loss: 0.645090



Running loss: 0.840242



KeyboardInterrupt: 

In [90]:
pred, proba = BERT_model.predict(X_test.values)

HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1779.0), HTML(value='')))




In [94]:
print(classification_report(pred, y_test))
BERT_predict = pred
BERT_proba = proba[:,1]

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      4423
           1       0.93      0.91      0.92      9805

    accuracy                           0.89     14228
   macro avg       0.87      0.88      0.88     14228
weighted avg       0.89      0.89      0.89     14228



In [18]:
import torch
del BERT_model

torch.cuda.empty_cache()
gc.collect()

39633

In [8]:
params = {
    "fp16":False, 
    "use_early_stopping": True,
    "early_stopping_delta": 0.05,
    #"early_stopping_metric": "mcc",
    #"early_stopping_metric_minimize": False,
    "early_stopping_patience": 50,
    #"max_seq_length": 30,
    "learning rate": 1e-3,
    "train_batch_size": 50,
    "eval_batch_size": 50, 
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 100,
    "evaluate_during_training_verbose": True,
    #"silent": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "do_lower_case": True
}

In [9]:
%%time
gc.collect()

BERT_predict = np.zeros(X_train.shape[0])
BERT_proba = np.zeros(X_train.shape[0])
cv_score = []
fold = 0
fold = 0

BERT_model = ClassificationModel(
  "distilbert", "distilbert-base-uncased", 
  #"electra", "google/electra-small-discriminator",
  num_labels=2, args=params
  )

# Create training set
train_df = pd.merge(
  X_trn, 
  y_trn, 
  left_index=True, 
  right_index=True
  )


# Create validation set
val_df = pd.merge(
  X_val, 
  y_val, 
  left_index=True, 
  right_index=True
  )

test_df = pd.merge(
    X_test,
    y_test,
    left_index=True,
    right_index=True
)

train_df.columns = ['text', 'labels']
val_df.columns = ['text', 'labels']
test_df.columns = ['text', 'labels']

# Fit model
BERT_model.train_model(train_df, eval_df=val_df)

# Calculate score
result, model_outputs, wrong_predictions = BERT_model.eval_model(
  test_df, 
  acc=accuracy_score
  )

# Calculate Probability
pred, proba = BERT_model.predict(list(X_test))

# Clear data to save memory
#del BERT_model
gc.collect()
torch.cuda.empty_cache()

# Add to Lists
cv_score.append(result['acc'])
BERT_predict = pred
BERT_proba = proba[:,1]
print('done')

HBox(children=(FloatProgress(value=0.0, max=38413.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=769.0, style=ProgressStyle(descri…

Running loss: 0.688588

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


Running loss: 0.397807



Running loss: 0.384651



Running loss: 0.210187


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=769.0, style=ProgressStyle(descri…

Running loss: 0.520770


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=769.0, style=ProgressStyle(descri…

Running loss: 0.003641



HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))


done


In [10]:
print(classification_report(pred, y_test))
BERT_predict = pred
BERT_proba = proba[:,1]

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4635
           1       0.93      0.92      0.93      9593

    accuracy                           0.90     14228
   macro avg       0.89      0.89      0.89     14228
weighted avg       0.90      0.90      0.90     14228



In [11]:
model = ClassificationModel('distilbert', 'outputs/best_model', args={})
pred, proba = model.predict(X_test.values)
print(classification_report(pred, y_test))

HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))


              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4411
           1       0.94      0.92      0.93      9817

    accuracy                           0.91     14228
   macro avg       0.89      0.90      0.89     14228
weighted avg       0.91      0.91      0.91     14228



In [13]:
# Declare X & Y
Ys = df['stars'] - 1
Xs = df['text']

# Let's stratify the data so we get a fair balance
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs, Ys, test_size=0.25, random_state=42, stratify=Y, shuffle=True
)

Xs_trn, Xs_val, ys_trn, ys_val = train_test_split(
    Xs_train, ys_train, test_size=0.1, random_state=2718, stratify=y_train, shuffle=True
)


In [14]:
params = {
    "fp16":False, 
    "use_early_stopping": True,
    "early_stopping_delta": 0.05,
    #"early_stopping_metric": "mcc",
    #"early_stopping_metric_minimize": False,
    "early_stopping_patience": 50,
    #"max_seq_length": 30,
    "learning rate": 1e-3,
    "train_batch_size": 50,
    "eval_batch_size": 50, 
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 100,
    "evaluate_during_training_verbose": True,
    #"silent": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "do_lower_case": True
}

In [15]:
%%time
gc.collect()

BERT_predict = np.zeros(Xs_train.shape[0])
BERT_proba = np.zeros(Xs_train.shape[0])
cv_score = []
fold = 0
fold = 0

BERT_model = ClassificationModel(
  "distilbert", "distilbert-base-uncased", 
  #"electra", "google/electra-small-discriminator",
  num_labels=5, args=params
  )

# Create training set
train_df = pd.merge(
  Xs_trn, 
  ys_trn, 
  left_index=True, 
  right_index=True
  )


# Create validation set
val_df = pd.merge(
  Xs_val, 
  ys_val, 
  left_index=True, 
  right_index=True
  )

test_df = pd.merge(
    Xs_test,
    ys_test,
    left_index=True,
    right_index=True
)

train_df.columns = ['text', 'labels']
val_df.columns = ['text', 'labels']
test_df.columns = ['text', 'labels']

# Fit model
BERT_model.train_model(train_df, eval_df=val_df)

# Calculate score
result, model_outputs, wrong_predictions = BERT_model.eval_model(
  test_df, 
  acc=accuracy_score
  )

# Calculate Probability
pred, proba = BERT_model.predict(list(Xs_test))

# Clear data to save memory
#del BERT_model
gc.collect()
torch.cuda.empty_cache()

# Add to Lists
cv_score.append(result['acc'])
BERT_predict = pred
BERT_proba = proba[:,1]
print('done')

HBox(children=(FloatProgress(value=0.0, max=38413.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=769.0, style=ProgressStyle(descri…

Running loss: 1.579074



KeyboardInterrupt: 

In [12]:
def format_metrics(model_sum):
    # Print cross-validation scores
    print("Cross Validation Scores\n" + 23*"=" + "\n{}\n".format(model_sum['cv_scores']))
    print("Average CV = {}\n".format(model_sum['cv_mean']))
    
    # Print classification report
    print("Classification Report\n" + 21*"=" + "\n" + model_sum['class_report'])
    
    # Print confusion matrix results
    print("Confusion Matrix\n" + 16*"=" + '\nTrue Positives = {}\nTrue Negatives'\
          '= {}\nFalse Positives = {}\nFalse Negatives = {}'\
          .format(model_sum['true positives'], 
                  model_sum['true negatives'], 
                  model_sum['false positives'], 
                  model_sum['false negatives']))

In [16]:
# Get average prediction of each model
#BERT_predict /= n_splits
#BERT_proba /= n_splits
    
# Create confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test.values, pred).ravel()
    
# Save metrics in a dictionary
BERT_results = {}
    
BERT_results['cv_scores'] = cv_score
BERT_results['cv_mean'] = np.mean(cv_score)
    
BERT_results['true negatives'] = tn
BERT_results['false positives'] = fp
BERT_results['false negatives'] = fn
BERT_results['true positives'] = tp
BERT_results['accuracy'] = (tp+tn)/len(BERT_predict)
BERT_results['f1 score'] = 2*tp/(2*tp+fp+fn)
BERT_results['class_report'] = classification_report(y_test, BERT_predict.round())

# Print results
print('\nELECTRA Classification Model')
format_metrics(BERT_results)


ELECTRA Classification Model
Cross Validation Scores
[0.8909193140286759]

Average CV = 0.8909193140286759

Classification Report
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      4672
           1       0.91      0.93      0.92      9556

    accuracy                           0.89     14228
   macro avg       0.88      0.87      0.88     14228
weighted avg       0.89      0.89      0.89     14228

Confusion Matrix
True Positives = 8842
True Negatives= 3834
False Positives = 838
False Negatives = 714
