In [1]:
from preprocess_data.make_dataset import *
from features.build_features import *
from models.model1 import *
from models.model2 import *

2023-07-15 21:08:43.591925: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


# 1. Processing raw data

In [2]:
# Build the processed dataset for each split: training first, then testing.
# Every raw path is paired with the processed path given to DatasetCreator.
for split_raw_path, split_processed_path in (
    (project.train_raw_path, project.train_processed_path),
    (project.test_raw_path, project.test_processed_path),
):
    dataset_creator = DatasetCreator(split_raw_path, split_processed_path)
    dataset_creator.make_dataset()

# 2. Model 1

## 2.1 Text processing
(for training our own embeddings)

In [3]:
# Process and tokenise the processed train and test tweets to build features.
# Both splits are preprocessed with the same option: remove_apostrophes=False.
model1_text_options = {'remove_apostrophes': False}
train_text_dict = preprocess_text(project.train_processed_path, **model1_text_options)
test_text_dict = preprocess_text(project.test_processed_path, **model1_text_options)

In [4]:
# Derive the embedding dimensions from the train and test tweets together,
# then report the two sizes that the model constructor needs.
vocab, vocab_size, max_tweet_size = find_vocab([train_text_dict, test_text_dict])
for stat_label, stat_value in (('Vocabulary size: ', vocab_size),
                               ('Max tweet size: ', max_tweet_size)):
    print(stat_label, stat_value)

Vocabulary size:  17021
Max tweet size:  43


In [5]:
# Index the tokenised dictionaries. Position 0 of the returned list is used as
# the train split and position 1 as the test split (the order they were passed).
indexed_dict_list, indexed_vocab = index_tweets([train_text_dict, test_text_dict])
train_indexed_dict, test_indexed_dict = indexed_dict_list[0], indexed_dict_list[1]

## 2.2 Model fitting and prediction

In [6]:
# Model 1 (embeddings trained from scratch): construct the wrapper from the
# embedding dimensions and the indexed train/test dictionaries, then build,
# fit and evaluate it -- the calls below must run in this order.
model1 = Model1_Creator(vocab_size, max_tweet_size, train_indexed_dict, test_indexed_dict)
model1.build_model()
model1.fit_model()
# model_predict() is unpacked as a (loss, accuracy) pair.
# NOTE(review): presumably measured on the test data -- confirm in models.model1.
loss, accuracy = model1.model_predict()

2023-07-15 21:09:10.408297: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 43, 100)           1702300   
                                                                 
 flatten (Flatten)           (None, 4300)              0         
                                                                 
 dense (Dense)               (None, 4)                 17204     
                                                                 
Total params: 1,719,504
Trainable params: 1,719,504
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Ep

## 2.3 Results exploration

# 3. Model 2

## 3.1 Text processing
(using GloVe embeddings)

In [7]:
# Process and tokenise the processed train and test tweets again for the
# GloVe-based model, this time with remove_apostrophes=True on both splits.
model2_text_options = {'remove_apostrophes': True}
train_text_dict_2 = preprocess_text(project.train_processed_path, **model2_text_options)
test_text_dict_2 = preprocess_text(project.test_processed_path, **model2_text_options)

In [8]:
# Derive the embedding dimensions for Model 2 from both splits together.
text_dicts_2 = [train_text_dict_2, test_text_dict_2]
vocab_2, vocab_size_2, max_tweet_size_2 = find_vocab(text_dicts_2)

In [9]:
# Index the Model 2 dictionaries. Position 0 of the returned list is used as
# the train split and position 1 as the test split (the order they were passed).
indexed_dict_list_2, indexed_vocab_2 = index_tweets([train_text_dict_2, test_text_dict_2])
train_indexed_dict_2, test_indexed_dict_2 = indexed_dict_list_2[0], indexed_dict_list_2[1]

## 3.2 Model fitting and prediction

In [10]:
# Model 2, variant A (frozen GloVe embeddings): construct the wrapper from the
# Model 2 vocabulary/dimensions and indexed dictionaries, load the GloVe
# vectors, prepare the embeddings, then build, fit and evaluate in that order.
model2 = Model2_Creator(vocab_size_2, indexed_vocab_2, max_tweet_size_2,
                        train_indexed_dict_2, test_indexed_dict_2)
model2.load_glove()
model2.prepare_embeddings()
model2.build_model(is_trainable=False) # do not further train our embedding layer
model2.fit_model()
# model_predict() is unpacked as a (loss, accuracy) pair.
# NOTE(review): presumably measured on the test data -- confirm in models.model2.
loss_2, accuracy_2 = model2.model_predict()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 43, 100)           1669100   
                                                                 
 flatten_1 (Flatten)         (None, 4300)              0         
                                                                 
 dense_1 (Dense)             (None, 4)                 17204     
                                                                 
Total params: 1,686,304
Trainable params: 17,204
Non-trainable params: 1,669,100
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 2

In [11]:
# Model 2, variant B (fine-tuned GloVe embeddings): same inputs and call
# sequence as the frozen variant, but build_model is given is_trainable=True.
# Results are kept under separate names (model21, loss_21, accuracy_21) so the
# frozen variant's results are not overwritten.
model21 = Model2_Creator(vocab_size_2, indexed_vocab_2, max_tweet_size_2,
                        train_indexed_dict_2, test_indexed_dict_2)
model21.load_glove()
model21.prepare_embeddings()
model21.build_model(is_trainable=True) # train pre-trained embedding layer
model21.fit_model()
# model_predict() is unpacked as a (loss, accuracy) pair.
# NOTE(review): presumably measured on the test data -- confirm in models.model2.
loss_21, accuracy_21 = model21.model_predict()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 43, 100)           1669100   
                                                                 
 flatten_2 (Flatten)         (None, 4300)              0         
                                                                 
 dense_2 (Dense)             (None, 4)                 17204     
                                                                 
Total params: 1,686,304
Trainable params: 1,686,304
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
