In [2]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [127]:
import json
import tensorflow as tf
import csv
import random
import numpy as np
import pandas as pd
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

In [128]:
DATASET_NAME = "training_cleaned.csv"
BASE_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com"
path_pwd = !pwd
BASE_PATH = os.path.join(path_pwd[0], 'datasets')

In [129]:
!mkdir -p $BASE_PATH
print(BASE_PATH)

/home/laura/data/nlp_in_tensorflow/week_3_sequence_models/datasets


In [130]:
full_url = os.path.join(BASE_URL, DATASET_NAME)
dataset_path = os.path.join(BASE_PATH, DATASET_NAME)

In [131]:
''' 
Dataset:  https://www.kaggle.com/kazanova/sentiment140
Cleaned the Stanford dataset to remove LATIN1 encoding to make it easier for Python CSV reader
You can do that yourself with:
iconv -f LATIN1 -t UTF8 training.1600000.processed.noemoticon.csv -o training_cleaned.csv
'''

# Download dataset
!wget --no-check-certificate $full_url -O $dataset_path

--2019-12-09 22:06:40--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.193.128, 2a00:1450:400b:c01::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.193.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 238942690 (228M) [application/octet-stream]
Saving to: ‘/home/laura/data/nlp_in_tensorflow/week_3_sequence_models/datasets/training_cleaned.csv’


2019-12-09 22:06:50 (23.9 MB/s) - ‘/home/laura/data/nlp_in_tensorflow/week_3_sequence_models/datasets/training_cleaned.csv’ saved [238942690/238942690]



In [132]:
# Read the csv as a dataframe 
column_names = ['target', 'id', 'date', 'flag','user', 'text']
tweets_data = pd.read_csv(dataset_path, header=None, names=column_names, index_col=False)    

# Checkout how our content looks like 
tweets_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [97]:
tweets = tweets_data['text']
labels = tweets_data['target']

corpus = [[tweet, label] if label == 0 else [tweet, 1] for tweet, label in zip(tweets,labels) ]

num_sentences = len(corpus)


In [98]:
bla = tweets_data['target'] == 4
print(bla[1599995])
# Test that data transformation has been successful :) Labels with 4 will be set to 1
corpus[1599995]

True


['Just woke up. Having no school is the best feeling ever ', 1]

In [139]:
# Define parameters 
embedding_dim = 100
max_length = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
VOCAB_SIZE = 5000
# Your dataset size here. Experiment using smaller values (i.e. 16000), 
# but don't forget to train on at least 160000 to see the best effects
training_size= 160000
test_portion=.1

In [140]:
print(num_sentences)
print(len(corpus))
print(corpus[1])

# Expected Output:
# 1600000
# 1600000
# ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]

1600000
1600000
["@ascrivner I did my own adaptation of King's work once called &quot;Shawshank Radio Show&quot; and it wasn't good at all. ", 0]


In [141]:
# This will shuffle in place
random.shuffle(corpus)

sentences=[]
labels=[]

for text, label in corpus[:training_size]:
    sentences.append(text)
    labels.append(label)

In [142]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size=len(word_index)

In [143]:
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences=sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

In [144]:
split = int(test_portion * training_size)
train_split = training_size - split

In [145]:
training_sequences,test_sequences = padded[:train_split], padded[train_split:]
training_labels, test_labels = labels[:train_split], labels[train_split:]

In [146]:
print(vocab_size)
print(word_index['i'])
# Expected Output
# 138858
# 1

138623
2


In [133]:
glove_dataset = "glove.6B.100d.txt"
glove_path = os.path.join(BASE_PATH, glove_dataset)
glove_full_url = os.path.join(BASE_URL, glove_dataset)

In [134]:
# Note this is the 100 dimension version of GloVe from Stanford
# File is unzipped and hosted it on: https://storage.googleapis.com/laurencemoroney-blog.appspot.com
!wget --no-check-certificate $glove_full_url -O $glove_path

--2019-12-09 22:07:22--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.193.128, 2a00:1450:400b:c01::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.193.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/home/laura/data/nlp_in_tensorflow/week_3_sequence_models/datasets/glove.6B.100d.txt’


2019-12-09 22:07:48 (12.9 MB/s) - ‘/home/laura/data/nlp_in_tensorflow/week_3_sequence_models/datasets/glove.6B.100d.txt’ saved [347116733/347116733]



In [135]:
'''
Create an embedding index by reading the line 
- word is values[0]
- weights is vaues[1:]
'''
embeddings_index = {};
with open(glove_path) as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

In [147]:
# Create an embedding matrix from the words from the word_index 
# and the embedding index dictionary
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

In [148]:
print(len(embeddings_matrix))
# Expected Output
# 138859

138624


In [153]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, 
                              input_length=max_length, 
                              weights=[embeddings_matrix], 
                              trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

ValueError: Input 0 of layer conv1d_2 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 64]

In [None]:
model.summary()

In [0]:
model.compile(# YOUR CODE HERE)


num_epochs = 50
history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=2)

print("Training Complete")

In [0]:
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc=history.history['acc']
val_acc=history.history['val_acc']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # Get number of epochs

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r')
plt.plot(epochs, val_acc, 'b')
plt.title('Training and validation accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Accuracy", "Validation Accuracy"])

plt.figure()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.title('Training and validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Loss", "Validation Loss"])

plt.figure()


# Expected Output
# A chart where the validation loss does not increase sharply!