<a href="https://colab.research.google.com/github/itsdawei/david-lee-twitter/blob/main/CAIS_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/CAIS++/Project/

DATASET_DIR = './dataset/data.csv'
EMBEDDINGS_DIR = './glove.6B.50d.txt'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/CAIS++/Project


In [14]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

EMBEDDING_DIM = 50

def load_valence_data(DATASET_PATH, embeddings_dir):
	# Load tweets, labels
  print("1 -- Loading tweets and labels")
  data = pd.read_csv(DATASET_PATH)
  valences = data['valence'].values
  authors = data['author'].values
  tweets = data['tweet'].values

	# Tokenize the tweets (convert sentence to sequence of words)
  print("2 -- Tokenizing the tweets: converting sentences to sequence of words")
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(tweets)
  sequences = tokenizer.texts_to_sequences(tweets)
  word_index = tokenizer.word_index

	# Pad sequences to ensure samples are the same size
  print("3 -- Padding sequences to ensure samples are the same size")
  training_data = pad_sequences(sequences)
  
  print("4 -- Loading pre-trained word embeddings. This may take a few minutes.")
  embeddings_index = {}
  f = open(embeddings_dir,'rb')
  for line in f:
      values = line.split()
      word = values[0].decode('UTF-8')
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs
  f.close()

  print("5 -- Finding word embeddings for words in our tweets.")
  # prepare word embedding matrix
  num_words = len(word_index)+1
  embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
  for word, i in word_index.items():
      if i >= num_words:
          continue
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
          embedding_matrix[i] = embedding_vector

  return tweets, training_data, valences, word_index, embedding_matrix


In [15]:
tweets, tweets_preprocessed, labels, word_index, embedding_matrix = load_valence_data(DATASET_DIR, EMBEDDINGS_DIR)

1 -- Loading tweets and labels
2 -- Tokenizing the tweets: converting sentences to sequence of words
3 -- Padding sequences to ensure samples are the same size
4 -- Loading pre-trained word embeddings. This may take a few minutes.
5 -- Finding word embeddings for words in our tweets.


In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(tweets_preprocessed,labels, train_size=0.8,random_state=0)

960000 640000 960000 640000
Epoch 1/10


ValueError: ignored

In [29]:
from keras.models import Sequential
from keras.layers import Embedding, Input
from keras.layers.merge import Concatenate
from keras.layers.core import Dense, Activation, Flatten
from keras.layers import Dropout, concatenate
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import metrics
from keras.models import Model

model = Sequential()

# Add pre-trained embedding layer 
# converts word indices to GloVe word embedding vectors as they're fed in
model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=tweets_preprocessed.shape[1],
                    trainable=False))

# At this point, each individual training sample is now a sequence of word embedding vectors

# First LSTM layer (return sequence so that we can feed the output into the 2nd LSTM layer)
model.add(LSTM(64, return_sequences = True, activation='relu'))
model.add(Dropout(.2))

# Second LSTM layer 
# Don't return sequence this time, because we're feeding into a fully-connected layer
model.add(LSTM(64, activation='relu'))
model.add(Dropout(.2))

# Dense 1
model.add(Dense(32, activation='relu'))
model.add(Dropout(.2))

# Dense 2 (final vote)
model.add(Dense(1, activation = 'sigmoid'))

######################################

print(model.summary())

LOSS = 'binary_crossentropy' # because we're classifying between 0 and 1
OPTIMIZER = 'RMSprop' # RMSprop tends to work well for recurrent models

model.compile(loss = LOSS, optimizer = OPTIMIZER, metrics = [metrics.binary_accuracy])

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 118, 50)           34548050  
                                                                 
 lstm_8 (LSTM)               (None, 118, 64)           29440     
                                                                 
 dropout_3 (Dropout)         (None, 118, 64)           0         
                                                                 
 lstm_9 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 32)                2080      
                                                                 
 dropout_5 (Dropout)         (None, 32)               

In [None]:
TEST_SIZE = 0.5

EPOCHS = 10
BATCH_SIZE = 128

model.fit(x_train, y_train, 
          epochs = EPOCHS, 
          batch_size = BATCH_SIZE, 
          validation_split = TEST_SIZE)

Epoch 1/10
 273/3750 [=>............................] - ETA: 20:59 - loss: nan - binary_accuracy: 0.4807