In [1]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
!pip install sentencepiece



In [4]:
#!pip install tensorflow

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold

import tokenization

In [6]:
#Reading Data
am = pd.read_csv("/content/drive/My Drive/amazon_imdb_yelp/amazon_cells_labelled.txt",sep = '\t', header = None)
im = pd.read_csv("/content/drive/My Drive/amazon_imdb_yelp/imdb_labelled.txt",sep = '\t', header = None)
yp = pd.read_csv("/content/drive/My Drive/amazon_imdb_yelp/yelp_labelled.txt",sep = '\t', header = None)

In [7]:

column_names = ['Review', 'Sentiment']

In [8]:
am.columns = column_names
im.columns = column_names
yp.columns = column_names

In [9]:
data = am.append([im, yp], ignore_index=True)

In [10]:
data.shape

(2748, 2)

In [11]:
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [12]:
import unicodedata

In [13]:
#Removal of accented text
def remove_accented_chars(x):
  removed=[]
  x = unicodedata.normalize('NFKD',x).encode('ascii','ignore').decode('utf-8','ignore')
  removed.append(x)

  return " ".join(removed)

In [15]:
data['Review'] = data['Review'].apply(remove_accented_chars)

In [18]:
#!pip install contractions

In [19]:
import contractions

In [21]:
#Contractions to expansions
data['Review'] = data['Review'].apply(lambda x: ' '.join([contractions.fix(t) for t in x.split()]))

In [22]:
import re

In [23]:
#Removal of web links
def remove_links(text):
    text = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','',text)
    return text

In [24]:
data['Review'] = data['Review'].apply(lambda x:remove_links(x))

In [25]:
!pip install beautifulsoup4



In [26]:
from bs4 import BeautifulSoup

In [27]:
#Removal of html tags
data['Review'] = data['Review'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())

In [28]:
#removal of special characters, numbers,white spaces
data['Review'] = data['Review'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",str(x)).split()))

In [29]:
#!pip install spacy

In [30]:
import spacy

In [31]:
#!python -m spacy download en_core_web_lg

In [32]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [33]:
nlp = spacy.load('en_core_web_lg')

In [34]:
sent = nlp.create_pipe('sentencizer')

In [35]:
nlp.add_pipe(sent, before='parser')

In [36]:
import string

In [37]:
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [38]:
#Removal of stopwords,puctuations
def text_data_preprocess(sentence):
    doc = nlp(sentence)
    token = "  ".join(str(doc).split())
    cleaned_tokens = []
    if token not in stopwords and token not in punc:
        cleaned_tokens.append(token.lower())
    return "  ".join(cleaned_tokens)

In [39]:
data['head'] = data['Review'].apply(text_data_preprocess)

In [40]:
data.head()

Unnamed: 0,Review,Sentiment,head
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it ...
1,Good case Excellent value,1,good case excellent value
2,Great for the jawbone,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting...
4,The mic is great,1,the mic is great


In [41]:
data.to_csv('cleaned_amzn_ylp_imdb_data.csv',index=False)

In [42]:
#Bert Preprocessing

In [43]:
def bert_encode(texts):
    all_tokens = []
    all_masks = []
    all_segments = []
    max_len = 512
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [44]:
def build_model(bert_layer,max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [45]:

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [46]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [47]:
#using folds

skf = StratifiedKFold(n_splits=2)
for fold, (trn_idx, val_idx) in enumerate(skf.split(data['head'],data['Sentiment'])):
  print('\nFold {}\n'.format(fold))
  

  X_trn_encoded = bert_encode(data.loc[trn_idx, 'head'].str.lower())
  y_trn = data.loc[trn_idx, 'Sentiment']
  X_val_encoded = bert_encode(data.loc[val_idx, 'head'].str.lower())
  y_val = data.loc[val_idx, 'Sentiment']
        
            

  model = build_model(bert_layer,max_len=512)   
       
  model.fit(X_trn_encoded, y_trn, validation_data=(X_val_encoded, y_val), epochs=2,batch_size=2,verbose=1)
  



Fold 0

Epoch 1/2




Epoch 2/2

Fold 1

Epoch 1/2




Epoch 2/2


In [48]:
#saving a model
model.save('/content/drive/My Drive/amazon_imdb_yelp/review_bert_git.h5')