In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertTokenizer, TFBertModel
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
import re
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.layers import Dropout, GlobalAveragePooling1D
from tensorflow.keras.layers import Subtract, Concatenate
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [15]:
# import os

# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
tweets = pd.read_csv('/tweets.csv', encoding='cp1252', header=None)
tweets.columns = ['target','id','date','flag','username','text'] #Change column names to things that make sense
tweets = tweets.drop(columns=['id','date','flag','username']) #Remove unneeded columns from memory
tweets = tweets.replace({'target':{0:0,4:1}}) #Dataset has only 0=negative sent, 4=positive sent, remappping to 0,1 respectivly
# print(tweets.shape)
tweets = tweets.groupby('target').sample(250000,random_state=None)
tweets.head()

Unnamed: 0,target,text
303056,0,Hockey will be ending soon. That makes me sad
199034,0,@Kenichan @etherjammer Yeah~ I have *terrible...
226310,0,Life is always COMPLICATED!!
598410,0,@lilandtedsmum well at least spuds and grapes ...
158535,0,"@fadedmoon owww ppl are voting for suarez, fra..."


##### Clean the dataset

In [7]:
def clean_tweet(tweet):
    # Remove any non-alphabetic characters except basic punctuation
    tweet = re.sub(r"[^a-zA-Z0-9.,'!? ]", '', tweet)
    # Remove any excess whitespace
    tweet = re.sub(r"\s+", ' ', tweet).strip()
    return tweet

tweets['text'] = tweets['text'].apply(clean_tweet)
tweets.head()

Unnamed: 0,target,text
303056,0,Hockey will be ending soon. That makes me sad
199034,0,Kenichan etherjammer Yeah I have terrible anxi...
226310,0,Life is always COMPLICATED!!
598410,0,lilandtedsmum well at least spuds and grapes a...
158535,0,"fadedmoon owww ppl are voting for suarez, fran..."


In [8]:
tweets.value_counts('target')

target
0    250000
1    250000
dtype: int64

In [9]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(tweets['text'], tweets['target'], test_size=0.2, random_state=265)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(400000,) (100000,) (400000,) (100000,)


In [10]:
#print the first item in X_train and y_train
print(X_train.iloc[0])
print(y_train.iloc[0])


valgirl omg I really hope thats not hte case!!
0


In [11]:
%%time
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

NUM_TOP_WORDS = None # use entire vocabulary!
MAX_ART_LEN = np.max([len(tweet) for tweet in tweets['text']]) # maximum number of words in a tweet

#tokenize the text
tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(X_train)
# save as sequences with integers replacing words
sequences = tokenizer.texts_to_sequences(X_train)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

X = pad_sequences(sequences, maxlen=MAX_ART_LEN)
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_ART_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_ART_LEN)


#y_ohe = keras.utils.to_categorical(y_train, num_classes=2)
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y_train.shape)
print(np.max(X))

Found 287386 unique tokens. Distilled to 287386 top words.
Shape of data tensor: (400000, 165)
Shape of label tensor: (400000,)
287386
CPU times: user 20.4 s, sys: 232 ms, total: 20.7 s
Wall time: 20.6 s


In [12]:
# Parameters for embedding layer
VOCAB_SIZE = len(word_index) + 1  # Add 1 for padding
EMBED_SIZE = 300  # You can change this size according to your preference

# Create model
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_SIZE, input_length=MAX_ART_LEN))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Compile model
#make learning rate in adam optimizer smaller
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-5)

#early stopping from val_loss
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer= 'adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print summary
model.summary()

# Train the model
model.fit(X, y_train, epochs=10, batch_size=256, validation_split=0.2, validation_data=(X_test, y_test), callbacks=early_stopping)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Once the model is trained, you can retrieve the learned embedding weights
embedding_weights = model.layers[0].get_weights()[0]
print("Embedding weights shape:", embedding_weights.shape)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 165, 300)          86216100  
                                                                 
 conv1d (Conv1D)             (None, 163, 64)           57664     
                                                                 
 max_pooling1d (MaxPooling1  (None, 32, 64)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 32, 64)            0         
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 1)                 2049      
                                                        

In [22]:
from keras.models import load_model
#ran this to save the model

# MODEL_PATH = './drive/My Drive/Colab/CNNEmbeddingTrainer.h5'

# # Now save model in drive
# model.save(MODEL_PATH)



In [24]:
# # Load Model
# model3 = load_model(MODEL_PATH)
# model3.summary()

# embedding_weights = model3.layers[0].get_weights()[0]
# print("Embedding weights shape:", embedding_weights.shape)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 165, 300)          86216100  
                                                                 
 conv1d (Conv1D)             (None, 163, 64)           57664     
                                                                 
 max_pooling1d (MaxPooling1  (None, 32, 64)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 32, 64)            0         
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 1)                 2049      
                                                        

In [29]:
#np.save('/content/drive/My Drive/Colab/embedding_weights.npy', embedding_weights)


In [19]:
EMBED_SIZE = 300
print('Found %s word vectors.' % len(embedding_weights))

# Now fill in the matrix using the ordering from the keras word tokenizer
found_words = 0
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embedding_weights[i]  # Use the learned embeddings
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        found_words += 1

print("Embedding Shape:", embedding_matrix.shape, "\n",
      "Total words found:", found_words, "\n",
      "Percentage:", 100 * found_words / embedding_matrix.shape[0])

Found 287387 word vectors.
Embedding Shape: (287387, 300) 
 Total words found: 287386 
 Percentage: 99.99965203714851


In [20]:
from tensorflow.keras.layers import Embedding

# save this embedding now
embedding_layer = Embedding(len(word_index) + 1,
    EMBED_SIZE,
    weights=[embedding_matrix], # here is the embedding getting saved
    input_length=MAX_ART_LEN,
    trainable=False)

In [21]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding, Input, Concatenate
from tensorflow.keras.layers import Subtract
from tensorflow.keras.utils import plot_model
import tensorflow as tf

# The transformer architecture
class TransformerBlock(Layer): # inherit from Keras Layer
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.2):
        super().__init__()
        # setup the model heads and feedforward network
        self.att = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embed_dim)

        # make a two layer network that processes the attention
        self.ffn = Sequential()
        self.ffn.add( Dense(ff_dim, activation='relu') )
        self.ffn.add( Dense(embed_dim) )

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        # apply the layers as needed (similar to PyTorch)

        # get the attention output from multi heads
        # Using same inpout here is self-attention
        # call inputs are (query, value, key)
        # if only two inputs given, value and key are assumed the same
        attn_output = self.att(inputs, inputs)

        # create residual output, with attention
        out1 = self.layernorm1(inputs + attn_output)

        # apply dropout if training
        out1 = self.dropout1(out1, training=training)

        # place through feed forward after layer norm
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)

        # apply dropout if training
        out2 = self.dropout2(out2, training=training)
        #return the residual from Dense layer
        return out2

class TokenAndPositionEmbedding(Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # create two embeddings
        # one for processing the tokens (words)
        self.token_emb = Embedding(input_dim=vocab_size,
                                   output_dim=embed_dim)
        # another embedding for processing the position
        self.pos_emb = Embedding(input_dim=maxlen,
                                 output_dim=embed_dim)

    def call(self, x):
        # create a static position measure (input)
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        # positions now goes from 0 to 500 (for IMdB) by 1
        positions = self.pos_emb(positions)# embed these positions
        x = self.token_emb(x) # embed the tokens
        return x + positions # add embeddngs to get final embedding

In [27]:
class CustomTokenAndPositionEmbedding(Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # create two embeddings
        # one for processing the tokens (words)
        self.token_emb = Embedding(len(word_index)+1,
                                   EMBED_SIZE,
                                   weights=[embedding_matrix],
                                   input_length=MAX_ART_LEN)
        # another embedding for processing the position
        self.pos_emb = Embedding(MAX_ART_LEN,
                                 EMBED_SIZE,
                                 input_length=MAX_ART_LEN,
                                )

    def call(self, x):
        # create a static position measure (input)
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        # positions now goes from 0 to 500 (for IMdB) by 1
        positions = self.pos_emb(positions)# embed these positions
        x = self.token_emb(x) # embed the tokens
        return x + positions # add embeddngs to get final embedding

### Custom Transformer using our previous project's CNN to train embedding of dataset
#### 2 Heads, 128 Neurons

In [None]:
# def transformer_custom

embed_dim = 300  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
top_words = 100
NUM_CLASSES =  1

adam_optimizer = Adam(learning_rate=1e-5)
inputs = Input(shape=(X_train.shape[1],))
x = CustomTokenAndPositionEmbedding(X_train.shape[1], top_words, embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)

x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(NUM_CLASSES, activation='sigmoid',
              kernel_initializer='glorot_uniform')(x)
xformer = Model(inputs=inputs, outputs=outputs)
model_xformer = Model(inputs=inputs, outputs=outputs)
print(model_xformer.summary())
xformer.compile(optimizer=adam_optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
history = xformer.fit(
    X_train, y_train, batch_size=128, epochs=30,
    validation_data=(X_test, y_test)
)




Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 162)]             0         
                                                                 
 custom_token_and_position_  (None, 162, 300)          86652900  
 embedding_4 (CustomTokenAn                                      
 dPositionEmbedding)                                             
                                                                 
 transformer_block_6 (Trans  (None, 162, 300)          742832    
 formerBlock)                                                    
                                                                 
 global_average_pooling1d_4  (None, 300)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_27 (Dropout)        (None, 300)               0   

### Custom Transformer using our previous project's CNN to train embedding of dataset
#### 3 Heads, 128 Neurons

In [28]:
# def transformer_custom

embed_dim = 300  # Embedding size for each token
num_heads = 3  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
top_words = 100
NUM_CLASSES =  1

adam_optimizer = Adam(learning_rate=1e-5)
inputs = Input(shape=(X_train.shape[1],))
x = CustomTokenAndPositionEmbedding(X_train.shape[1], top_words, embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)

x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(NUM_CLASSES, activation='sigmoid',
              kernel_initializer='glorot_uniform')(x)
xformer = Model(inputs=inputs, outputs=outputs)
model_xformer = Model(inputs=inputs, outputs=outputs)
print(model_xformer.summary())
xformer.compile(optimizer=adam_optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
history = xformer.fit(
    X_train, y_train, batch_size=128, epochs=10,
    validation_data=(X_test, y_test)
)


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 165)]             0         
                                                                 
 custom_token_and_position_  (None, 165, 300)          86265600  
 embedding (CustomTokenAndP                                      
 ositionEmbedding)                                               
                                                                 
 transformer_block (Transfo  (None, 165, 300)          1103732   
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 300)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_3 (Dropout)         (None, 300)               0   

### Transformer using Keras Embedding Layer
#### 2 heads 20 neurons

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
top_words = 100
NUM_CLASSES =  1

inputs = Input(shape=(X_train.shape[1],))
x = TokenAndPositionEmbedding(X_train.shape[1], top_words, embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)

x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(20, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(NUM_CLASSES, activation='sigmoid',
              kernel_initializer='glorot_uniform')(x)

model_xformer = Model(inputs=inputs, outputs=outputs)
print(model_xformer.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 162)]             0         
                                                                 
 token_and_position_embeddi  (None, 162, 32)           8384      
 ng_1 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_3 (Trans  (None, 162, 32)           10656     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d_1  (None, 32)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_17 (Dropout)        (None, 32)                0   

In [None]:
# optimizer = Adam(learning_rate=1e-5)
model_xformer.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

history = model_xformer.fit(
    X_train, y_train, batch_size=128, epochs=10,
    validation_data=(X_test, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#model.get_layer(*name u get from model.summary*)