In [32]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import  train_test_split
from textblob import TextBlob
import numpy as np

In [6]:
# Open the Twitter sentiment data.
# Sentiment column: 1 = positive, 0 = negative.
# Malformed rows (wrong number of fields) are skipped with a warning.
TWITTER_CSV_PATH = "../data/raw/Twitter_Sentiment_Analysis/Sentiment Analysis Dataset.csv"
try:
    # pandas >= 1.3: `on_bad_lines="warn"` skips bad rows and reports them,
    # matching the old `error_bad_lines=False` (+ default warn_bad_lines=True).
    twitter_data_df = pd.read_csv(TWITTER_CSV_PATH, on_bad_lines="warn")
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
    twitter_data_df = pd.read_csv(TWITTER_CSV_PATH, error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [28]:
# Preview how TextBlob splits a raw tweet into tokens.
# `blob` was never defined in the notebook (leftover kernel state); row 3 is
# the tweet whose text matches the recorded output of this cell.
blob = TextBlob(twitter_data_df.SentimentText[3])
blob.tokenize()

WordList(['..', 'Omgaga', '.', 'Im', 'sooo', 'im', 'gunna', 'CRy', '.', 'I', "'ve", 'been', 'at', 'this', 'dentist', 'since', '11..', 'I', 'was', 'suposed', '2', 'just', 'get', 'a', 'crown', 'put', 'on', '(', '30mins', ')', '...'])

# Load Data

In [7]:
twitter_data_df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [11]:
X_all = twitter_data_df.SentimentText

In [12]:
y_all = twitter_data_df.Sentiment

In [9]:
twitter_data_df.shape

(1578612, 4)

In [14]:
# Hold out 25% of the tweets for testing. The seed is fixed so that
# Restart Kernel -> Run All reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=.25, random_state=42
)

In [15]:
X_train.shape

(1183959,)

In [16]:
y_train.shape

(1183959,)

### Once this is done, we need to convert each sentence into a one-hot tensor of shape [sentence_length x word_length x alphabet_size].
- https://charlesashby.github.io/2017/06/05/sentiment-analysis-with-char-lstm/

In [41]:
# Characters that get their own one-hot slot. Anything else (including
# uppercase letters and whitespace) is encoded as an all-zero vector.
# NOTE: '-' appears twice in this string, so the one-hot width is 69 while
# only 68 distinct characters are mapped ('-' resolves to its later index).
EMB_ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789-,;' \
               '.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
MAX_WORD_LENGTH = 16 # number of characters in a word
ALPHABET_SIZE = len(EMB_ALPHABET)
# we associate every character in our alphabet to a number:
# e.g. b => 1 d => 3 etc.
ALPHABET_DICT = {ch: ix for ix, ch in enumerate(EMB_ALPHABET)}

In [66]:
def encode_one_hot( sentence, emb_alphabet, max_word_length, alphabet_size, alphabet_dict):
    """Convert a sentence into a character-level one-hot tensor.

    Adapted from
    https://charlesashby.github.io/2017/06/05/sentiment-analysis-with-char-lstm/

    The sentence is split into tokens with ``TextBlob(sentence).tokenize()``;
    each token becomes a ``[max_word_length x alphabet_size]`` matrix whose
    row ``i`` is the one-hot vector of the token's ``i``-th character.

    Parameters
    ----------
    sentence : str
        Raw text to encode.
    emb_alphabet : str
        Alphabet string. Kept for interface compatibility; the character
        lookup is driven entirely by ``alphabet_dict``.
    max_word_length : int
        Number of character rows per token. Characters beyond this position
        are dropped.
    alphabet_size : int
        Width of each one-hot vector.
    alphabet_dict : dict
        Maps a character to its one-hot index.

    Returns
    -------
    (numpy.ndarray, int)
        Array of shape ``(n_tokens, max_word_length, alphabet_size)`` and the
        number of tokens. The token count is returned so callers can pad a
        minibatch to its longest sentence. An empty sentence yields an array
        of shape ``(0, max_word_length, alphabet_size)``.
    """
    # "My name is Charles" => ["My", "name", "is", "Charles"]
    words = TextBlob(sentence).tokenize()

    encoded = np.zeros(shape=(len(words), max_word_length, alphabet_size))

    for word_ix, word in enumerate(words):
        # Only the first max_word_length characters fit in the word matrix.
        for char_ix, char in enumerate(word[:max_word_length]):
            one_hot_ix = alphabet_dict.get(char)
            # Characters outside the alphabet (e.g. uppercase letters when the
            # alphabet is lowercase-only) are left as an all-zero row.
            if one_hot_ix is None or not 0 <= one_hot_ix < alphabet_size:
                continue
            encoded[word_ix, char_ix, one_hot_ix] = 1

    return encoded, len(words)

In [67]:
array,length = encode_one_hot(X_all[1], EMB_ALPHABET , MAX_WORD_LENGTH, ALPHABET_SIZE, ALPHABET_DICT)
# array is number of words in sentence X Max word length  X one hot encoding for alphabet

# Pad tensors to all have the same size 
- [batch_size x maximum_sentence_length x maximum_word_length x alphabet_size]

In [68]:
y_all[0]

0

In [72]:
X_all[0][2:-1]

'                   is so sad for my APL friend............'

In [91]:
def make_minibatch( sentences, sentiment_y,  max_word_length, alphabet_size):
    """Create a zero-padded minibatch of one-hot encoded sentences and labels.

    Each sentence is encoded with ``encode_one_hot`` (using the notebook-level
    ``EMB_ALPHABET`` and ``ALPHABET_DICT``) and then padded with all-zero
    words up to the longest sentence in the batch.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences.
    sentiment_y : iterable
        Sentiment per sentence, paired with ``sentences`` via ``zip``
        (0 = negative, anything else is treated as positive).
    max_word_length : int
        Number of character rows per word.
    alphabet_size : int
        Width of each character one-hot vector.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``minibatch_x``: float32 array of shape
        ``(batch_size, max_sentence_length, max_word_length, alphabet_size)``.
        ``minibatch_y``: array of shape ``(batch_size, 2)`` holding ``[0, 1]``
        for sentiment 0 (negative) and ``[1, 0]`` otherwise (positive).
        An empty batch yields shapes ``(0, 0, max_word_length, alphabet_size)``
        and ``(0, 2)``.
    """
    encoded_sentences = []
    labels = []

    for sentence, sentiment in zip(sentences, sentiment_y):
        # One-hot label: index 0 = positive, index 1 = negative.
        labels.append(np.array([0, 1]) if sentiment == 0 else np.array([1, 0]))

        # Encode with the sizes passed to this function, not the globals,
        # so the padded output always agrees with the encoded words.
        one_hot, _ = encode_one_hot(sentence, EMB_ALPHABET, max_word_length,
                                    alphabet_size, ALPHABET_DICT)
        one_hot = np.asarray(one_hot)
        if one_hot.size == 0:
            # A sentence without tokens: give it a well-formed empty shape so
            # it pads to all zeros instead of breaking the batch.
            one_hot = np.zeros((0, max_word_length, alphabet_size))
        encoded_sentences.append(one_hot)

    # Pad every sentence with zero-words up to the longest one in the batch.
    longest = max((len(encoded) for encoded in encoded_sentences), default=0)
    minibatch_x = np.zeros(
        shape=(len(encoded_sentences), longest, max_word_length, alphabet_size),
        dtype='float32')
    for row, encoded in enumerate(encoded_sentences):
        minibatch_x[row, :len(encoded)] = encoded

    minibatch_y = np.array(labels) if labels else np.zeros((0, 2), dtype=int)

    return minibatch_x, minibatch_y

In [99]:
lens = np.array([10,5,10,9,8])

In [102]:
mask = np.arange(16) <lens[:,None]

In [104]:
mask.shape

(5, 16)

In [108]:
z = np.zeros(shape=(mask.shape + (16, 69)))

In [109]:
z.shape

(5, 16, 16, 69)

In [94]:
minibatch_x, minibatch_y = make_minibatch(X_all[:5],y_all[:5], MAX_WORD_LENGTH, ALPHABET_SIZE)

In [95]:
minibatch_x.shape

(5, 31, 16, 69)

In [93]:
minibatch_x.shape

(10, 31, 16, 69)

In [86]:
x_1 , l = encode_one_hot(X_all[1],EMB_ALPHABET, MAX_WORD_LENGTH, ALPHABET_SIZE, ALPHABET_DICT )

In [87]:
x_2 , l = encode_one_hot(X_all[2],EMB_ALPHABET, MAX_WORD_LENGTH, ALPHABET_SIZE, ALPHABET_DICT )

In [88]:
x_1.shape

(7, 16, 69)

In [89]:
x_2.shape

(6, 16, 69)

# Goal is to train two models - an LSTM and a CNN - for sentiment analysis
- Using tweets here because the primary source of text from founders will be Tweets