In [21]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D, LSTM, Conv1D, Flatten
from tensorflow.keras.optimizers import Adam


from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [22]:
# load and pad data
def load_csv_and_embedding(filename, embedding_path, emb_dim, num_words=10000, maxlen=140,
                           partial=True, partial_fraction=0.005):
    """Load labelled tweets, tokenize/pad them and build a pretrained embedding matrix.

    Parameters
    ----------
    filename : str
        CSV file with a "tweet" column (text) and a "label" column.
    embedding_path : str
        Text file with one word per line followed by its whitespace-separated
        embedding components (GloVe text layout).
    emb_dim : int
        Number of leading embedding components kept for each word.
    num_words : int
        Vocabulary cap handed to the Keras ``Tokenizer``.
    maxlen : int
        Length every token sequence is post-padded / truncated to.
    partial : bool
        When truthy, keep only a random ``partial_fraction`` of the rows
        (quick experiments).
    partial_fraction : float
        Fraction of rows kept when ``partial`` is truthy (default 0.005).

    Returns
    -------
    x_train, y_train, x_val, y_val, emb_mat
        Padded integer sequences and labels for an 80/20 train/validation
        split, plus a ``(len(word_index) + 1, emb_dim)`` matrix whose row ``i``
        holds the pretrained vector of the word with tokenizer index ``i``
        (rows stay zero for words missing from the embedding file).
    """
    train_df = pd.read_csv(filename)

    # pandas parses empty CSV fields as NaN, not "", so the placeholder has to be
    # applied after filling the missing values; otherwise np.array() would turn
    # the NaN into the literal word "nan".
    raw_tweets = train_df["tweet"].fillna("").astype(str).values
    # temporary work around for empty tweet (emptied because of some cleaning)
    tweets = np.array([tweet if tweet.strip() else "I" for tweet in raw_tweets])
    labels = train_df["label"].values

    np.random.seed(1)  # Reproducibility!
    # take partial data for testing purpose
    if partial:
        shuffled_indices = np.random.permutation(len(tweets))
        partial_idx = int(partial_fraction * len(tweets))

        partial_indices = shuffled_indices[:partial_idx]

        tweets = tweets[partial_indices]
        labels = labels[partial_indices]

    # split dataset into training and validation
    shuffled_indices = np.random.permutation(len(tweets))
    split_idx = int(0.8 * len(tweets))

    train_indices = shuffled_indices[:split_idx]
    val_indices = shuffled_indices[split_idx:]

    x_train = tweets[train_indices]
    x_val = tweets[val_indices]

    y_train = labels[train_indices]
    y_val = labels[val_indices]

    print("x_train shape", x_train.shape)
    print("y_train shape", y_train.shape)
    print("x_val shape", x_val.shape)
    print("y_val shape", y_val.shape)

    # NOTE(review): the vocabulary is fitted on the training AND validation tweets
    # together; fit on x_train only if the validation split must stay fully unseen.
    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(tweets)

    x_train = tokenizer.texts_to_sequences(x_train)
    x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
    x_val = tokenizer.texts_to_sequences(x_val)
    x_val = pad_sequences(x_val, padding='post', maxlen=maxlen)

    # load embedding
    # Row 0 is left for the padding index; every row starts as zeros and is
    # overwritten only for words that appear in the embedding file.
    vocab = len(tokenizer.word_index) + 1
    emb_mat = np.zeros((vocab, emb_dim))
    # Explicit UTF-8: the platform default encoding may fail on non-ASCII tokens.
    with open(embedding_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # blank line: nothing to unpack
                continue
            word, emb = parts[0], parts[1:]
            ind = tokenizer.word_index.get(word)
            if ind is not None:
                emb_mat[ind] = np.array(emb[:emb_dim], dtype="float32")

    print("embeddding mat shape", emb_mat.shape)

    return x_train, y_train, x_val, y_val, emb_mat


In [23]:
# Input locations; test_datapath is defined here but not read in this cell.
train_datapath = "data/raw_train.csv"
test_datapath = "data/test.csv"
embedding_path = "embedding/glove.twitter.27B.100d.txt"
emb_dim = 100  # number of embedding components kept per word

# Tokenized/padded train and validation splits plus the pretrained embedding matrix.
x_train, y_train, x_val, y_val, emb_mat = load_csv_and_embedding(
    filename=train_datapath,
    embedding_path=embedding_path,
    emb_dim=emb_dim,
)

x_train shape (10000,)
y_train shape (10000,)
x_val shape (2500,)
y_val shape (2500,)
embeddding mat shape (19424, 100)


In [24]:
# Quick look at the padded token-id matrix and the matching labels.
print(x_train, y_train, sep="\n")

[[1246   34    3 ...    0    0    0]
 [  22   17  376 ...    0    0    0]
 [2565    5   62 ...    0    0    0]
 ...
 [  24  190   10 ...    0    0    0]
 [ 174  157    4 ...    0    0    0]
 [   9 1200   14 ...    0    0    0]]
[1 0 1 ... 1 1 1]


In [25]:
# create the LSTM model
# Vocabulary size and embedding width are taken from the pretrained matrix.
vocab, emb_dim = emb_mat.shape
maxlen = 140

model = Sequential([
    # Frozen embedding layer initialised with the pretrained vectors in emb_mat.
    Embedding(
        input_dim=vocab,
        output_dim=emb_dim,
        weights=[emb_mat],
        input_length=maxlen,
        trainable=False,
    ),
    MaxPool1D(),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=False),
    Dense(16, activation="relu"),
    # Single sigmoid unit: binary classification output.
    Dense(1, activation="sigmoid"),
])

optimizer = Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [26]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the LSTM model.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 140, 100)          1942400   
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 70, 100)          0         
 1D)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense_8 (Dense)             (None, 16)                1616      
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2,024,433
Trainable params: 82,033
Non-trainable params: 1,942,400
_______________________________________

In [27]:
# Train the LSTM model. Passing the validation split makes Keras report
# val_loss / val_accuracy after every epoch, so over-fitting (and a sensible
# point to stop) is visible during the run instead of only after evaluate().
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=100,
    batch_size=128,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

KeyboardInterrupt: 

In [None]:
# Score the LSTM model on the validation split produced by load_csv_and_embedding.
test_score = model.evaluate(x=x_val, y=y_val)

In [None]:
# CNN model
# Vocabulary size and embedding width are taken from the pretrained matrix's shape.
vocab, emb_dim = emb_mat.shape
maxlen = 140

cnn_model = Sequential([
    # NOTE(review): this embedding is not initialised from emb_mat (no weights
    # argument), unlike the LSTM model - confirm that is intended.
    Embedding(input_dim=vocab, output_dim=emb_dim, input_length=maxlen),
    Conv1D(64, 5, activation="relu"),
    MaxPool1D(5),
    Conv1D(128, 5, activation="relu"),
    MaxPool1D(5),
    # NOTE(review): Dense comes before Flatten here, so it acts on the last axis
    # of the pooled feature map - confirm this ordering is intended.
    Dense(16, activation="relu"),
    Flatten(),
    # Single sigmoid unit: binary classification output.
    Dense(1, activation="sigmoid"),
])

cnn_model.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the CNN model.
cnn_model.summary()

In [None]:
# Train the CNN model. Passing the validation split makes Keras report
# val_loss / val_accuracy after every epoch, so over-fitting is visible
# while the 100-epoch run is in progress.
# Note: this rebinds `history`, replacing the LSTM training history.
history = cnn_model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=100,
    batch_size=128,
    verbose=1,
)

In [None]:
# Score the CNN model on the validation split; this rebinds `test_score`.
test_score = cnn_model.evaluate(x=x_val, y=y_val)