In [None]:
#Author:WitsOnTweets
    #This notebook trains and saves the model that is used to predict the sentiment of a sentence.
#Dataset:Sentiment140 Dataset


In [1]:
#Usage:Libraries
    #keras:Keras is a high-level neural network API written in Python. It runs on top of Theano, CNTK or TensorFlow. Here we chose
    #TensorFlow as the backend for processing
    #Tokenizer:Helps to break the sentence in Tokens(Words)
    #numpy:python library for array manipulation
    #nltk:Natural Language Toolkit for basic text processing. Here it supplies the English stopwords (words that don't convey any meaning)
    #pandas:library providing high-performance, easy-to-use data structures and data analysis tools
    #EarlyStopping:Stops the Training Process after the validation accuracy stops increasing thus preventing overfitting of our model
    

import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pd


Using TensorFlow backend.


In [2]:
#Read only the first two columns of the CSV; the file is decoded as latin-1
training = pd.read_csv(
    '2lakh1.csv',
    usecols=(0, 1),
    encoding="latin-1",
)

#English stopword list from nltk (loaded here; not referenced again in the cells shown)
stop = set(stopwords.words('english'))

#iloc[:,0] selects every row of the first column  -> used as the labels
#iloc[:,1] selects every row of the second column -> used as the tweet text, cast to str
train_y = training.iloc[:, 0]
train_x = training.iloc[:, 1].astype(str)

In [3]:
#Vocabulary size used when vectorising the tweets (handed to the tokenizer as num_words)
max_words = 3000

#Build a Keras tokenizer and fit it on the tweet texts
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)

#word_index is the word -> integer index map the tokenizer built while fitting
dictionary = tokenizer.word_index

#Persist the map as JSON so the same word indices can be reused to
#process tweets when the model is tested
with open('dictionary.json', 'w') as dictionary_file:
    dictionary_file.write(json.dumps(dictionary))

def convert_text_to_index_array(text):
    """Convert one tweet into the list of dictionary indices of its words.

    The text is split into word tokens with ``kpt.text_to_word_sequence``
    (this only tokenises; it does not pad or truncate the tweet), and each
    token is looked up in the module-level ``dictionary``.

    Words that are not present in ``dictionary`` are skipped rather than
    raising ``KeyError``, so the function can also be used on tweets that
    were not part of the text the tokenizer was fitted on.

    Args:
        text: the tweet as a string.

    Returns:
        A list of integer word indices, one per known word, in tweet order.
    """
    return [
        dictionary[word]
        for word in kpt.text_to_word_sequence(text)
        if word in dictionary
    ]



In [4]:
#Convert every tweet to its list of word indices using the dictionary.
#The result is kept as a plain Python list of lists: tweets have different
#lengths, and np.asarray on such ragged input raises ValueError on
#NumPy >= 1.24, while sequences_to_matrix accepts a list of sequences directly.
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

#Build one row per tweet; in 'binary' mode a column is 1 if the word with
#that index occurs in the tweet and 0 otherwise (the tokenizer was created
#with num_words=max_words, i.e. 3000)
#NOTE: train_x and train_y are rebound below, so re-running this cell
#requires re-running the data-loading cell first
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

#One-hot encode the labels into 2 classes
train_y = keras.utils.to_categorical(train_y, 2)

In [23]:
#NOW THE MODEL BEGINS
#Type of model: Sequential Model
#HyperParameters:
    #No of layers: 4
    #Input layer nodes:3000 (max_words)
    #Hidden Layer1: 512 (relu)
    #Hidden Layer2: 256 (sigmoid)
    #Output Layer: 2 (softmax)
    #DropOut: 0.5 after each hidden layer
    #BatchSize: 28
    #Epochs: 6 (upper bound; early stopping may end training sooner)
    #Validation split: 0.1

from keras.callbacks import EarlyStopping

#Create the empty model first, then stack the layers on it one by one
#(the layers cannot be added inside the Sequential(...) call because
#`model` does not exist yet at that point)
model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))


model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

#Stop training once the monitored validation accuracy stops improving
#NOTE(review): 'val_acc' is the metric key used by older Keras releases;
#newer releases report 'val_accuracy' - confirm against the installed version
early_stopping=EarlyStopping(monitor='val_acc',mode='max')

#Start training with 10% of the samples held out for validation
#NOTE(review): per the Keras docs the validation samples are taken from the
#end of train_x/train_y before shuffling, so shuffle=True presumably only
#affects the training part - confirm the CSV is not ordered by label
model.fit(train_x, train_y,
    batch_size=28,
    epochs=6,
    verbose=1,
    validation_split=0.1,
    shuffle=True,callbacks=[early_stopping])

#Saving the model architecture as JSON and the weights as HDF5
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')

Train on 177592 samples, validate on 19733 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
saved model!
