Organizing our Data, Twitter Sentiment Analysis Dataset

In [None]:
import csv

import numpy as np

# Labelled tweets: column 1 holds the sentiment label, column 3 the tweet text.
TRAINING_CSV = '/content/SentimentAnalysisDataset2.csv'

# Parse with the csv module instead of np.genfromtxt. genfromtxt treats '#'
# as the start of a comment and has no notion of quoted fields, so tweets
# containing hashtags or commas were silently cut short.
training = []
with open(TRAINING_CSV, newline='', encoding='utf-8', errors='replace') as csv_file:
  reader = csv.reader(csv_file)
  next(reader, None)  # Skip the header row
  for row in reader:
    if len(row) > 3:  # Ignore blank or short rows that have no text column
      training.append((int(row[1]), row[3]))

# Create our Training Data (the tweet text)
train_x = [text for _label, text in training]

# Index All the Sentiment Labels
train_y = np.asarray([label for label, _text in training])

  after removing the cwd from sys.path.


Use Keras to Preprocess the Data and Make it Machine-Friendly

In [None]:
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer

# Vocabulary Cap: the Tokenizer Only Looks at the 3000 Most Popular Words.
# Low Frequency Words Add Little Input (Many are Typos), and a Smaller
# Vocabulary Saves Time on a Data Set This Large.
max_words = 3000

# Build the Tokenizer and Fit it on the Training Texts
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)

# The Tokenizer's word_index Maps Words to their IDs
dictionary = tokenizer.word_index

# Persist the Mapping as dictionary.json for Later Use
with open('dictionary.json', 'w') as out_file:
  out_file.write(json.dumps(dictionary))

def convert_text_to_index_array(text):
  """Convert a text into the list of dictionary IDs of its words.

  The text is split into words with kpt.text_to_word_sequence and every
  word is looked up in the module-level `dictionary`. Words that are not
  in the dictionary are skipped instead of raising KeyError. No padding
  is applied: the result has one entry per known word.

  Args:
    text: The raw text to convert.

  Returns:
    A list with the `dictionary` value of each known word, in text order.
  """
  return [dictionary[word]
          for word in kpt.text_to_word_sequence(text)
          if word in dictionary]
        
# Change Each Token to its ID in the Tokenizer's word_index
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

# The ID lists have different lengths, so they are kept as a plain list of
# lists. Casting this ragged structure with np.asarray raises ValueError on
# recent NumPy releases, and sequences_to_matrix accepts the list directly.

# Create One-Hot Matrices Out of Indexed Tweets
# NOTE: train_x and train_y are overwritten here, so re-running this cell
# without re-running the data loading cell first will not work.
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode = 'binary')

# Labels as Categories (2 classes)
train_y = keras.utils.to_categorical(train_y, 2)

Making the Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Stack of Dense layers over the max_words-wide input, with Dropout after
# each hidden layer and a 2-unit softmax output.
model = Sequential([
  Dense(512, input_shape=(max_words,), activation='relu'),
  Dropout(0.5),
  Dense(256, activation='sigmoid'),
  Dropout(0.5),
  Dense(2, activation='softmax'),
])

Compile the Network

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# and accuracy as the reported metric.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

Training the Neural Network


In [None]:
# Train for 5 epochs with shuffled batches of 32 samples,
# holding out 10% of the data for validation.
model.fit(
  x=train_x,
  y=train_y,
  epochs=5,
  batch_size=32,
  validation_split=0.1,
  shuffle=True,
  verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7d8d9a4d30>

Saving the Model

In [None]:
# Store the weights in model.h5
model.save_weights('model.h5')

# Store the model structure as JSON in model.json
model_json = model.to_json()
with open('model.json', 'w') as structure_file:
  structure_file.write(model_json)

Using the Model

In [None]:
import json
import numpy as np
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json

# Tokenizer Limited to 3000 Words (used below to build the input matrix)
tokenizer = Tokenizer(num_words=3000)

# Names of the Two Classes, Used when Printing the Prediction
labels = ['Negative', 'Positive']

# Load the Word -> ID Dictionary from dictionary.json
with open('dictionary.json', 'r') as saved_dictionary:
  dictionary = json.loads(saved_dictionary.read())

# Only Words Present in the Loaded Dictionary are Converted;
# Anything Else is Silently Ignored Before Building the Matrix
def convert_text_to_index_array(text):
  """Return the dictionary IDs of the words in `text`, skipping unknown words.

  The text is split with kpt.text_to_word_sequence; each word found in the
  module-level `dictionary` contributes its ID, in text order.
  """
  known_words = (w for w in kpt.text_to_word_sequence(text) if w in dictionary)
  return [dictionary[w] for w in known_words]

# Read our Saved Model Structure (the with-block closes the file for us)
with open('model.json', 'r') as json_file:
  loaded_model_json = json_file.read()

# Rebuild the Model from the Saved Structure
model = model_from_json(loaded_model_json)
# Load the Saved Weights into the Model
model.load_weights('model.h5')

# THIS IS WHERE YOU INPUT THE FILE OR SENTENCE
# The text is stored as input_text (not str) and the matrix as input_matrix
# (not input) so the builtins of those names stay usable in this kernel.
with open('/content/RedditTest3.json', 'r') as input_file:
  input_text = input_file.read()
# To score a single sentence instead, use e.g.: input_text = 'You are amazing'

testArr = convert_text_to_index_array(input_text)
input_matrix = tokenizer.sequences_to_matrix([testArr], mode = 'binary')

# Predict if Positive or Negative
pred = model.predict(input_matrix)

# Index of the highest score in the single prediction row
best = np.argmax(pred[0])
print("%s Sentiment and %f%% Confidence" % (labels[best], pred[0][best] * 100))

Positive Sentiment and 65.915877% Confidence
