In [None]:
from google.colab import drive
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from sklearn import model_selection
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics import accuracy_score
nltk.download('punkt')
import codecs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Attach Google Drive: the dataset CSV and the output folder used further down
# both live under /content/drive.
drive.mount(mountpoint='/content/drive')

In [None]:
import os
import pprint  # pretty-prints the device list below

# This cell treats a missing COLAB_TPU_ADDR environment variable as
# "no TPU runtime attached" and only reports the problem; it does not raise.
if 'COLAB_TPU_ADDR' in os.environ:
    tpu_address = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
    print ('TPU address is', tpu_address)

    # Open a TF1-compat session against that address just long enough to
    # enumerate the devices it exposes.
    with tf.compat.v1.Session(tpu_address) as session:
        devices = session.list_devices()

    print('TPU devices:')
    pprint.pprint(devices)
else:
    print('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')

In [None]:
# Build a cluster resolver for the TPU; an empty string is passed as the TPU name.
# NOTE(review): on Colab the empty name presumably lets the resolver auto-detect
# the runtime's TPU - confirm against the TPUClusterResolver documentation.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# Connect this Python process to the cluster described by the resolver.
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
# Module-level `strategy` is read later by the GloveModels model builders,
# which create and compile their models inside `with strategy.scope():`.
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Download the GloVe Twitter embeddings archive.
# -c resumes a partial download and is a no-op when the file is already complete,
# so re-running this cell does not fetch a second copy (glove.twitter.27B.zip.1).
!wget -c http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
# Extract the embedding text files next to the notebook.
# -n never overwrites files that already exist, so a re-run skips them instead of
# stopping at unzip's interactive "replace?" prompt.
!unzip -n glove.twitter.27B.zip

In [None]:
class GloveModels:
    """GloVe-initialised text classifiers trained on one train/validation split.

    The CSV handed to the constructor must provide the columns ``text``
    (sentences), ``lbl_num`` (numeric class label) and ``kfold`` (fold id).
    Rows with ``kfold == 0`` form the validation split; every other row is
    used for training.

    Typical use: call :meth:`process` once, then any of the model builders
    (:meth:`CONV1D`, :meth:`BiRNN`, :meth:`CONV1D_BiRNN`, :meth:`BiGRU`,
    :meth:`CONV1D_BiGRU`).  The builders read ``self.data_glove`` (set by
    :meth:`process`) and the module-level ``strategy`` object, so both must
    exist before a builder is called.
    """

    def __init__(self,path):
        """Load the dataset at ``path`` and split it by the ``kfold`` column.

        Args:
            path: location of the CSV file, passed to ``pandas.read_csv``.
        """
        df = pd.read_csv(path)
        # Force every entry to ``str`` so non-string cells (e.g. NaN) can be tokenised.
        df['text'] = df['text'].apply(str)
        self.dfs_train = df[df['kfold']!=0]
        self.dfs_valid = df[df['kfold']==0]

    def process(self):
        """Load the GloVe vectors and prepare the padded train/validation tensors.

        Returns:
            tuple: ``(tokenizer, max_len)`` - the Keras ``Tokenizer`` fitted on
            the training texts and the sequence length used for padding.
        """
        self.read_glove()
        self.data_per_fold_glove()
        return self.glove_wt,self.lls

    def read_glove(self, path='glove.twitter.27B.100d.txt'):
        """Read a GloVe text file into ``self.glove_embeddings_index``.

        Each line is expected to hold a token followed by its vector
        components, separated by whitespace.

        Args:
            path: embeddings file to read; defaults to the 100-dimensional
                Twitter vectors in the current working directory.
        """
        embeddings_index = {}
        # ``with`` closes the handle even if a line fails to parse; the explicit
        # encoding keeps the result independent of the platform's locale default.
        with open(path, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

        print('Found %s word vectors.' % len(embeddings_index))
        self.glove_embeddings_index = embeddings_index

    def data_per_fold_glove(self):
        """Build ``self.data_glove`` for 100-dimensional embeddings."""
        embed_dim = 100  # must match the dimensionality of the file loaded by read_glove
        self.data_glove = self.create_glove_embedding_matrix(embed_dim)

    def create_glove_embedding_matrix(self,embed_dim):
        """Tokenise, pad and one-hot encode both splits and build the embedding matrix.

        Also stores the fitted tokenizer in ``self.glove_wt`` and the padding
        length in ``self.lls``.

        Args:
            embed_dim: dimensionality of the vectors in
                ``self.glove_embeddings_index``.

        Returns:
            tuple: ``((x_train, y_train), (x_valid, y_valid),
            (vocab_length, embedding_matrix, embed_dim, max_len))`` where the
            ``x`` tensors are int32 padded id sequences and the ``y`` tensors
            are float32 one-hot labels.
        """
        x_train = self.dfs_train.text.values
        y_train = self.dfs_train.lbl_num.values
        x_test = self.dfs_valid.text.values
        y_test = self.dfs_valid.lbl_num.values

        # The vocabulary is fitted on the training texts only; index 0 is left
        # for padding, hence the +1.
        word_tokenizer = Tokenizer()
        word_tokenizer.fit_on_texts(x_train)
        vocab_length = len(word_tokenizer.word_index) + 1
        x_train_emb = word_tokenizer.texts_to_sequences(x_train)
        x_test_emb = word_tokenizer.texts_to_sequences(x_test)

        # Padding length = largest nltk token count over both splits (each
        # sentence is tokenised once).
        # NOTE(review): the length is measured with nltk's word_tokenize while the
        # sequences come from the Keras Tokenizer; the two may count tokens
        # differently - confirm this is intended.
        length_long_sentence = max(
            len(word_tokenize(sentence))
            for sentence in np.concatenate((x_train,x_test),axis = 0))
        self.glove_wt = word_tokenizer
        self.lls = length_long_sentence
        x_train_padded = pad_sequences(x_train_emb, length_long_sentence, padding='post',truncating = 'post')
        x_test_padded = pad_sequences(x_test_emb, length_long_sentence, padding='post',truncating = 'post')

        # Use one shared class count for both splits so the two one-hot matrices
        # have the same width even if one split lacks the highest label.
        # presumably lbl_num holds integer class ids starting at 0 - verify against the dataset
        num_classes = int(np.concatenate((y_train, y_test)).max()) + 1
        dummy_y_tr = np_utils.to_categorical(y_train, num_classes=num_classes)
        dummy_y_ts = np_utils.to_categorical(y_test, num_classes=num_classes)

        # Row i holds the GloVe vector of the word with tokenizer index i; words
        # missing from GloVe keep an all-zero row.
        embedding_matrix = np.zeros((vocab_length, embed_dim))
        for word, index in word_tokenizer.word_index.items():
            embedding_vector = self.glove_embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

        train_split = (tf.convert_to_tensor(x_train_padded,np.int32),tf.convert_to_tensor(dummy_y_tr,np.float32))
        valid_split = (tf.convert_to_tensor(x_test_padded,np.int32),tf.convert_to_tensor(dummy_y_ts,np.float32))
        return (train_split,valid_split,(vocab_length,embedding_matrix,embed_dim,length_long_sentence))

    @staticmethod
    def _conv_stack():
        """Return three fresh ``Conv1D(32, 3, relu)`` + ``MaxPool1D`` pairs."""
        layers = []
        for _ in range(3):
            layers.append(tf.keras.layers.Conv1D(32,3,activation='relu'))
            layers.append(tf.keras.layers.MaxPool1D())
        return layers

    @staticmethod
    def _bidirectional(cell_cls):
        """Return a one-layer list with a bidirectional 16-unit ``cell_cls`` layer."""
        return [tf.keras.layers.Bidirectional(
            cell_cls(16,return_sequences=True, dropout=0.2, recurrent_dropout=0.3))]

    @staticmethod
    def _dense_head(v):
        """Return the shared classifier head: Flatten, four sigmoid Dense layers, softmax over ``v`` classes."""
        head = [tf.keras.layers.Flatten()]
        for units in (512, 256, 128, 64):
            head.append(tf.keras.layers.Dense(units, activation='sigmoid'))
        head.append(tf.keras.layers.Dense(v, activation='softmax'))
        return head

    def _train(self, label, v, make_feature_layers):
        """Build, compile and fit one model; shared by all public builders.

        Args:
            label: architecture tag; used in the embedding layer name
                (``GL_100D_<label>_embedding``) and, with underscores replaced
                by spaces, in the printed banner.
            v: number of output classes.
            make_feature_layers: zero-argument callable returning the layers
                placed between the embedding and the dense head.  It is invoked
                inside ``strategy.scope()`` so the layers are created there.

        Returns:
            tuple: ``(model, history)`` as returned by ``model.fit``.
        """
        x_tr,y_tr = self.data_glove[0]
        x_ts,y_ts = self.data_glove[1]
        vl,mat,dim,len_sen = self.data_glove[2]

        # `strategy` is the module-level distribution strategy defined in the
        # TPU initialisation cell.
        with strategy.scope():
            layers = [tf.keras.layers.Embedding(
                vl,dim,weights=[mat],input_length=len_sen,trainable=False,
                name="GL_100D_" + label + "_embedding")]
            layers += make_feature_layers()
            layers += self._dense_head(v)
            model = tf.keras.models.Sequential(layers)

            model.compile(loss='categorical_crossentropy',
                          optimizer=tf.keras.optimizers.Adam(),
                          metrics=['accuracy'])

        print("\n " + label.replace('_', ' ') + " \n")
        model.summary()
        # restore_best_weights=True: without it the returned model carries the
        # weights of the last epoch, i.e. `patience` epochs past the best
        # monitored value, rather than the best ones.
        early_stop = EarlyStopping(patience=10, restore_best_weights=True)
        his = model.fit(x = x_tr,y=y_tr,validation_data=(x_ts,y_ts),epochs=100,callbacks=[early_stop])
        return model,his

    def CONV1D(self,v):
        """Train the convolution-only classifier (conv stack + Dropout(0.5)).

        Args:
            v: number of output classes.

        Returns:
            tuple: ``(model, history)``.
        """
        return self._train(
            'CONV1D', v,
            lambda: self._conv_stack() + [tf.keras.layers.Dropout(0.5)])

    def BiRNN(self,v):
        """Train the bidirectional SimpleRNN classifier.

        Args:
            v: number of output classes.

        Returns:
            tuple: ``(model, history)``.
        """
        return self._train(
            'BiRNN', v,
            lambda: self._bidirectional(tf.keras.layers.SimpleRNN))

    def CONV1D_BiRNN(self,v):
        """Train the conv stack followed by a bidirectional SimpleRNN.

        Args:
            v: number of output classes.

        Returns:
            tuple: ``(model, history)``.
        """
        return self._train(
            'CONV1D_BiRNN', v,
            lambda: self._conv_stack() + self._bidirectional(tf.keras.layers.SimpleRNN))

    def BiGRU(self,v):
        """Train the bidirectional GRU classifier.

        Args:
            v: number of output classes.

        Returns:
            tuple: ``(model, history)``.
        """
        return self._train(
            'BiGRU', v,
            lambda: self._bidirectional(tf.keras.layers.GRU))

    def CONV1D_BiGRU(self,v):
        """Train the conv stack followed by a bidirectional GRU.

        Args:
            v: number of output classes.

        Returns:
            tuple: ``(model, history)``.
        """
        return self._train(
            'CONV1D_BiGRU', v,
            lambda: self._conv_stack() + self._bidirectional(tf.keras.layers.GRU))




In [None]:
import os
import pickle

# Folder on the mounted Drive that receives the tokenizer parameters and the trained models.
SENTIMENT_OUT_DIR = '/content/drive/MyDrive/GLOVE_100/sentiment'
# Width of every model's softmax layer.
# presumably the number of distinct lbl_num values in the dataset - confirm
NUM_SENTIMENT_CLASSES = 5

# Fail fast: make sure the output folder exists before any training starts,
# instead of discovering a missing directory after all models have been fitted.
os.makedirs(SENTIMENT_OUT_DIR, exist_ok=True)

sent_models = {}
sent_models_clf = GloveModels('/content/drive/MyDrive/datasets/tr_sen_fin.csv')
sent_param_glove = {}
sent_param_glove['Tokenizer'],sent_param_glove['long_sen_num'] = sent_models_clf.process()

# Persist the fitted tokenizer and padding length first; they are already final
# once process() has returned.
with open(os.path.join(SENTIMENT_OUT_DIR, 'sentiment_params.pickle'),'wb') as f:
  pickle.dump(sent_param_glove,f)

# Train the architectures in the original order and save each one as soon as it
# is fitted, so an interrupted session keeps the models finished so far.
# sent_models[name] is the (model, history) tuple returned by the builder.
for name in ('CONV1D', 'CONV1D_BiRNN', 'CONV1D_BiGRU', 'BiGRU', 'BiRNN'):
  sent_models[name] = getattr(sent_models_clf, name)(NUM_SENTIMENT_CLASSES)
  sent_models[name][0].save(os.path.join(SENTIMENT_OUT_DIR, 'sentiment_'+name+'.h5'))

Found 1193514 word vectors.

 BiLSTM 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GL_200_BiLSTM_embedding (Emb (None, 163, 200)          27423200  
_________________________________________________________________
bidirectional (Bidirectional (None, 163, 32)           27776     
_________________________________________________________________
flatten (Flatten)            (None, 5216)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               2671104   
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)  