# Sentiment Analytics on the Sem-Eval Dataset Using a DCNN
#### Author: Jonathan Esquivel
>* Note: Sections that contain tunable parameters are marked with a " * " in their heading.
## Imports:

In [None]:
import tensorflow as tf
import numpy as np
import utils as util
import pandas as pd

## Params:
> To change the model simply change these parameters

In [None]:
# --- Pre-processing parameters ---
percent_training = 0.80  # fraction of the data handed to grab_data as the training share
val_percent = 0.1        # fraction of the whole dataset used for validation (taken from the training split)

numAugs = 1              # number of augmentation passes requested from augmentData
probKeepPositive = 1     # forwarded to augmentData(probKeepPositive=...); presumably 1 keeps every positive sample - confirm in utils
probKeepNeutral = 1      # NOTE(review): not referenced anywhere else in this notebook
combClasses = True       # merge the objective, neutral and objective-OR-neutral classes
maxTokens = 28           # tokens kept per tweet (NOTE: other values were never tested and may cause errors)

# --- Training parameters ---
numEpochs = 50
batch_size = 256
drop_rate = 0.65         # fed to the `keep_prob` placeholder during training, i.e. the probability of KEEPING a unit
lrate = 0.001            # learning rate fed to the optimizer

# Running values overwritten by the training loop.
cost = 0                 # cost from the most recent evaluation
eps = 1                  # absolute change in cost between evaluations

#Restore path for further training (see the bottom of the notebook; used because of limited memory on the machine)

#qSave = True
#savePath = "./tmp/" + "model_v_XXX" 
#restore = False #Load a old model
#restore_path = "./tmp/" + "model_v_XXX" #load at this path


## Pre-Processing

### * Load data:
>* Note: We make certain that our data sets have some of each class. <br>
>* Note: We are loading our augmented dataset

> Param percent_training: fraction of the corpus used for training and validation combined <br>
> Param embedding dims: default is 25 dimensions

In [None]:
#PARAM
# The cleaned corpus is read from "Data/proc_SemEvalClean.txt".
e_Dims = 25  # embedding dimensionality; also used for the placeholder shapes further down

# Pass e_Dims instead of repeating the literal 25 so the data loader and the
# graph cannot drift apart when the embedding size is changed.
dataSet = util.semEvalData("Data/proc_SemEvalClean.txt", n_dims=e_Dims)

qVal = True  # always carve out a validation split
train_df, test_df, val_df = dataSet.grab_data(
    percent_training,
    validation=qVal,
    percentValidation=val_percent,
)

if combClasses:
    # Apply the same class merge to every split (train, test, validation order).
    train_df, test_df, val_df = (
        dataSet.combClasses(split) for split in (train_df, test_df, val_df)
    )

In [None]:
# Report the size of each split before augmentation.
print("BEFORE AUGMENTATION")
total = len(train_df)  + len(test_df)  + len(val_df)
print("Data total:",total)

split_sizes = (
    ("Count Train:", len(train_df)),
    ("Count Validation:", len(val_df)),
    ("Count Test:", len(test_df)),
)
for split_label, split_size in split_sizes:
    print(split_label, split_size, " Percent:", round(split_size / total * 100, 4))

print("Train Example: (ndx will be truncated later)")
# Last expression of the cell: class distribution of the training split.
train_df["Sentiment"].value_counts()

## Augment training data:
> We do this to increase our datasize:

In [None]:
# Augment only the training split; test and validation are left untouched.
# NOTE(review): probKeepNeutral is defined in the params cell but is not passed here.
rep, train_df = dataSet.augmentData(
    train_df,
    numAugs=numAugs,
    probKeepPositive=probKeepPositive,
)
numDatapoints = len(train_df)

print("AFTER AUGMENTATION")
print("Count Train:", numDatapoints)

# Class distribution per split.
train_counts = train_df["Sentiment"].value_counts()
test_counts = test_df["Sentiment"].value_counts()
val_counts = val_df["Sentiment"].value_counts()
print("Training:\n", train_counts, "\n")
print("Testing:\n", test_counts, "\n")
print("Validation:\n", val_counts)

### Load embeddings:
>We note that we will ONLY load the embeddings for words in our corpus. Since the vocab size of the embeddings is MUCH larger than the vocab size of our corpus we do this to save memory

In [None]:
# Forward slashes are used on purpose: the previous "Data\GloVe_Embeddings\glove..."
# literal relied on the invalid escape sequences "\G" and "\g" (a warning today,
# an error in future Python versions) and only resolved on Windows.
# Forward slashes work on Windows and POSIX alike.
embeddings_dict, totalCount, missingCount = dataSet.load_embeddings(
    "Data/GloVe_Embeddings/glove.twitter.27B.25d.txt"
)

# Number of embeddings actually kept in memory.
print("Corpus vocabulary size:", len(embeddings_dict))

# Second value returned by load_embeddings.
print("SemEval vocabulary Size:",totalCount)

# Third value returned by load_embeddings (corpus specific).
print("Vocab in our corpus missing in SemEval:",missingCount)

## *Tokenize data:
> Here we will:
>1. Tokenize the tweet 
>2. Replace word tokens with their ids relative to our vocab from the previous step
>3. Pad or truncate to a max tokensize of 28

In [None]:

splits = (train_df, test_df, val_df)

# Raw tweet text per split.
x_trainR, x_testR, x_valR = (split["Tweet"] for split in splits)

# One-hot labels per split.
# NOTE(review): get_dummies runs on each split independently, so the columns
# only line up if every split contains every class - confirm grab_data guarantees it.
y_train, y_test, y_val = (
    pd.get_dummies(split["Sentiment"]).values for split in splits
)

# Tokenized inputs, padded or truncated to maxTokens entries each.
x_train, x_test, x_val = (
    dataSet.tokenize_data(raw_tweets, padLength=maxTokens)
    for raw_tweets in (x_trainR, x_testR, x_valR)
)

# Last expression of the cell: peek at the first ten label rows.
y_train[:10]

In [None]:
# Shapes with the batch axis shown as None, the way the placeholders declare them.
x_shape = [None] + list(x_train.shape[1:])
numClasses = [None] + list(y_train.shape[1:])  # label shape; index 1 is the class count
print("Shape of X:", x_shape)
print("Shape of Y:", numClasses)

# Look up the embeddings of the first training tweet to show the embedded input shape.
arr = np.array([embeddings_dict[token_id] for token_id in x_train[0]])
print("Shape of X_in:", [None] + list(arr.shape))
print("Emebeddings Shape:", [len(embeddings_dict), e_Dims])

In [None]:
#BREAK NOTEBOOK

### Load Data into tf:

In [None]:
# Stack the embedding vectors into one array, in the dict's iteration order.
# NOTE(review): rows are later selected by id via tf.nn.embedding_lookup, so this
# presumably relies on the dict order matching the word ids - confirm in utils.
embVals = np.array(list(embeddings_dict.values()))

In [None]:
# Label shape with a free batch axis; numClasses[1] is the number of classes.
numClasses = [None] + list(y_train.shape[1:])

# Only the training split is wrapped in tf.data. x_test / y_test and
# x_val / y_val stay as arrays because they are fed whole through feed_dict.
x_train, y_train = (
    tf.data.Dataset.from_tensor_slices(split_array)
    for split_array in (x_train, y_train)
)

# Pair every input row with its label row.
train_data = tf.data.Dataset.zip((x_train, y_train))

In [None]:
#Delete unneeded variables
del(x_train)
del(y_train)
del(embeddings_dict)

## Tensorflow Setup Model:

### Placeholders:
>X_ids: input sentence tensor:
>* 1st-D: This will be None. (Batch size) 
>* 2nd-D: This will have width equal to the number of maxTokens (including the paddings) 
>* TYPE: This will have type int32 (since it is a tensor of id's)
>* x_train.shape -> [None, 28]

>X: input sentence embedding tensor:
>* 1st-D: This will be None. (Batch size) 
>* 2nd-D: This will have length equal to the number of maxTokens (including the paddings)
>* 3rd-D: This will have width equal to the embedding dimension size
>* TYPE: This will have type float32 (since it is the embeddings of the ids)
>* Example:
>* X_in.shape -> [None, 28, 25]

>Y: output tensor:
>* 1st-D: This will be None. (Batch size) 
>* 2nd-D: This will have the number of possible classes
>* TYPE: this will have float32
>* Example: y_train.shape -> [None, 5]

>embeddings: the embedding lookup table:
>* 1st-D: This will be number of words in our lookup table
>* 2nd-D: This will be the size of the embeddings
>* Example: embeddings.shape -> [15303, 25]

>phase: a boolean for batch normalization

>* LearningRate: the learning rate for the model

In [None]:
# Token ids of a batch of tweets: [batch, maxTokens].
X_ids = tf.placeholder(dtype=tf.int32, shape=[None, maxTokens], name="X_Ids")

# (An earlier variant fed the embedded input directly:)
#X = tf.placeholder(tf.float32,[None,maxTokens,e_Dims],name="X_input")

# One-hot labels: [batch, number of classes].
Y = tf.placeholder(dtype=tf.float32, shape=numClasses, name="Output_Class")

# Embedding lookup table, fed with embVals on every run: [vocabulary size, e_Dims].
embeddings = tf.placeholder(
    dtype=tf.float32, shape=[len(embVals), e_Dims], name="Embeddings"
)

# True while training, False while evaluating; consumed by batch normalization.
phase = tf.placeholder(dtype=tf.bool, name='phase')

# Step size handed to the optimizer.
learningRate = tf.placeholder(dtype=tf.float32, name='Learning_rate')

### word2id Lookup:
> Lookup: This operation converts each batch of token ids (X_ids) into their embedding vectors (X) using the embeddings table <br>

In [None]:
# Disabled prototype of an in-graph string -> id table. The graph receives
# token ids through X_ids, so it is not needed; kept for reference.
#word2id,_ = dataSet.getMapping()
#mapping_strings = np.array([value for wordId,value in word2id.items()])
#Xin2Id_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings,default_value=0)

# Replace every id in X_ids by its row of the embedding table.
X = tf.nn.embedding_lookup(params=embeddings, ids=X_ids, name="Lookup")
X

## Model:
> Our Model will consist of 4 parallel convolutional layers, each followed by its own pooling layer <br>
>* Note: our convolutional layers will have width equal to the input width so they will always output [?,1]
>* Currently we will assume a maxTokens=28,eDims=25

### Convolution 1A:
>* Input: [batch_size,maxTokens,embeddingDims]
>* Filter: [3,embeddingDims]
>* Stride: [1,1] 
>* Pad: VALID
>* Feature Maps: 7
>* Output: [26,1,7]
>* Activation function: Relu

In [None]:
# Branch A: filters spanning 3 consecutive tokens, 7 feature maps.
conv1A_filter_shape = [3, e_Dims, 7]  # [filter width, input channels, output channels]
W_conv1A = tf.Variable(
    tf.truncated_normal(conv1A_filter_shape, stddev=0.1),
    name="CONV1A_Weights",
)
b_conv1A = tf.Variable(
    tf.constant(0.1, shape=[7]),  # one bias per feature map
    name="CONV1A_Bias",
)

# Stride 1 with VALID padding: output length is maxTokens - 3 + 1.
conv1A = tf.nn.conv1d(X, W_conv1A, stride=1, padding='VALID', name="CONV1A") + b_conv1A
h_conv1A = tf.nn.relu(conv1A, name="reluA")
h_conv1A

### Convolution 1B:
>* Input: [batch_size,maxTokens,embeddingDims] 
>* Filter: [5,embeddingDims]
>* Stride: [1,1] 
>* Pad: VALID
>* Feature Maps: 7
>* Output: [24,1,7]
>* Activation function: Relu

In [None]:
# Branch B: filters spanning 5 consecutive tokens, 7 feature maps.
conv1B_filter_shape = [5, e_Dims, 7]  # [filter width, input channels, output channels]
W_conv1B = tf.Variable(
    tf.truncated_normal(conv1B_filter_shape, stddev=0.1),
    name="CONV1B_Weights",
)
b_conv1B = tf.Variable(
    tf.constant(0.1, shape=[7]),  # one bias per feature map
    name="CONV1B_Bias",
)

# Stride 1 with VALID padding: output length is maxTokens - 5 + 1.
conv1B = tf.nn.conv1d(X, W_conv1B, stride=1, padding='VALID', name="CONV1B") + b_conv1B
h_conv1B = tf.nn.relu(conv1B, name="reluB")
h_conv1B

### Convolution 1C:
>* Input: [batch_size,maxTokens,embeddingDims]
>* Filter: [7,embeddingDims]
>* Stride: [1,1] 
>* Pad: VALID
>* Feature Maps: 7
>* Output: [22,1,7]
>* Activation function: Relu

In [None]:
# Branch C: filters spanning 7 consecutive tokens, 7 feature maps.
conv1C_filter_shape = [7, e_Dims, 7]  # [filter width, input channels, output channels]
W_conv1C = tf.Variable(
    tf.truncated_normal(conv1C_filter_shape, stddev=0.1),
    name="CONV1C_Weights",
)
b_conv1C = tf.Variable(
    tf.constant(0.1, shape=[7]),  # one bias per feature map
    name="CONV1C_Bias",
)

# Stride 1 with VALID padding: output length is maxTokens - 7 + 1.
conv1C = tf.nn.conv1d(X, W_conv1C, stride=1, padding='VALID', name="CONV1C") + b_conv1C
h_conv1C = tf.nn.relu(conv1C, name="reluC")
h_conv1C

### Convolution 1D:
>* Input: [batch_size,maxTokens,embeddingDims]
>* Filter: [2,embeddingDims]
>* Stride: [1,1] 
>* Pad: VALID
>* Feature Maps: 7
>* Output: [27,1,7]
>* Activation function: Relu

In [None]:
# Branch D: filters spanning 2 consecutive tokens, 7 feature maps.
conv1D_filter_shape = [2, e_Dims, 7]  # [filter width, input channels, output channels]
W_conv1D = tf.Variable(
    tf.truncated_normal(conv1D_filter_shape, stddev=0.1),
    name="CONV1D_Weights",
)
b_conv1D = tf.Variable(
    tf.constant(0.1, shape=[7]),  # one bias per feature map
    name="CONV1D_Bias",
)

# Stride 1 with VALID padding: output length is maxTokens - 2 + 1.
conv1D = tf.nn.conv1d(X, W_conv1D, stride=1, padding='VALID', name="CONV1D") + b_conv1D
h_conv1D = tf.nn.relu(conv1D, name="reluD")
h_conv1D

### Pool 1A:
>* Input: [batch_size,26,7]
>* Ksize: 2
>* Stride: 1
>* Pad: VALID
>* Output: [batch_size,25,7]

In [None]:
# Max-pool branch A along the token axis: window 2, stride 1, VALID padding.
# With maxTokens = 28 the length goes from 26 to 25; the 7 channels are kept.
pool1A = tf.layers.max_pooling1d(
    inputs=h_conv1A, pool_size=2, strides=1, padding='VALID', name="1A_POOL"
)
pool1A

### Pool 1B:
>* Input: [batch_size,24,7]
>* Ksize: 2
>* Stride: 1
>* Pad: VALID
>* Output: [batch_size,23,7]

In [None]:
# Max-pool branch B along the token axis: window 2, stride 1, VALID padding.
# With maxTokens = 28 the length goes from 24 to 23; the 7 channels are kept.
pool1B = tf.layers.max_pooling1d(
    inputs=h_conv1B, pool_size=2, strides=1, padding='VALID', name="1B_POOL"
)
pool1B

### Pool 1C:
>* Input: [batch_size,22,7]
>* Ksize: 2
>* Stride: 1
>* Pad: VALID
>* Output: [batch_size,21,7]

In [None]:
# Max-pool branch C along the token axis: window 2, stride 1, VALID padding.
# With maxTokens = 28 the length goes from 22 to 21; the 7 channels are kept.
pool1C = tf.layers.max_pooling1d(
    inputs=h_conv1C, pool_size=2, strides=1, padding='VALID', name="1C_POOL"
)
pool1C

### Pool 1D:
>* Input: [batch_size,27,7]
>* Ksize: 2
>* Stride: 1
>* Pad: VALID
>* Output: [batch_size,26,7]

In [None]:
# Max-pool branch D along the token axis: window 2, stride 1, VALID padding.
# With maxTokens = 28 the length goes from 27 to 26; the 7 channels are kept.
pool1D = tf.layers.max_pooling1d(
    inputs=h_conv1D, pool_size=2, strides=1, padding='VALID', name="1D_POOL"
)
pool1D

### Concat parallel layers to a Matrix:
>* Concat Input(s): 
>* pool1A: [25]*7
>* pool1B: [23]*7
>* pool1C: [21]*7
>* pool1D: [26]*7
>* Output: [95,7]

In [None]:
# Join the four pooled branches end to end along axis 1 (the length axis);
# the channel axis is shared by all of them.
pooled_branches = [pool1A, pool1B, pool1C, pool1D]
cat_layer = tf.concat(values=pooled_branches, axis=1, name="Concat_Layers")
# Three-branch variant, kept for reference:
#cat_layer = tf.concat([pool1A,pool1B,pool1C],1,name="Concat_Layers")
cat_layer

In [None]:
# Flatten [batch, length, channels] to [batch, length * channels].
# The width is read from cat_layer's static shape instead of a hand-computed
# literal, so changing a filter width, a stride or maxTokens cannot silently
# desynchronise the reshape. For the current architecture this is 95 * 7.
_, cat_length, cat_channels = cat_layer.get_shape().as_list()
fc_layer = tf.reshape(cat_layer, [-1, cat_length * cat_channels])
fc_layer

### Batch normalization?

In [None]:
# Batch-normalize the flattened features. `phase` is fed as True while training
# and False while evaluating. The moving-average updates are registered in
# tf.GraphKeys.UPDATE_OPS, which the optimizer cell makes the train op depend on.
norm_fc_layer = tf.contrib.layers.batch_norm(
    fc_layer,
    center=True,
    scale=True,
    is_training=phase,
    scope='bn',
)

### Fully Connected Layer 1:

In [None]:
#Doesnt seem to learn much here...

#W_fc1 = tf.Variable(tf.truncated_normal([567, 128], stddev=0.1),name="FULL1_Weights") #273 neurons
#b_fc1 = tf.Variable(tf.constant(0.1, shape=[128]),name="FULL1_Bias") # 5 possibilities

#fc1=tf.matmul(norm_fc_layer, W_fc1) + b_fc1
#fc1

### Dropout Layer:

In [None]:
# Probability of keeping each unit, fed at run time: `drop_rate` while
# training and 1 while evaluating.
keep_prob = tf.placeholder(dtype=tf.float32)
layer_drop = tf.nn.dropout(
    norm_fc_layer,
    keep_prob,
    name="Dropout",
)
layer_drop

### Fully connected layer:

In [None]:
# Output layer: maps the flattened, normalized features to one logit per class.
# The input width is read from layer_drop's static shape rather than a
# hard-coded literal, so it follows the convolution/pooling configuration
# (665 for the current architecture).
fc2_in_features = layer_drop.get_shape().as_list()[1]
fc2_out_features = numClasses[1]  # number of sentiment classes

W_fc2 = tf.Variable(
    tf.truncated_normal([fc2_in_features, fc2_out_features], stddev=0.1),
    name="FULL2_Weights",
)
b_fc2 = tf.Variable(
    tf.constant(0.1, shape=[fc2_out_features]),  # one bias per class
    name="FULL2_Bias",
)

fc2=tf.matmul(layer_drop, W_fc2) + b_fc2
fc2

### Final Activation:

In [None]:
# Turn the logits into class probabilities.
y_CNN = tf.nn.softmax(logits=fc2)
y_CNN

## Loss:
>We will use l2 loss

In [None]:

# Squared-error cost between predicted probabilities and one-hot labels.
# tf.nn.l2_loss returns sum(t ** 2) / 2 over the whole batch (not a mean),
# so the value scales with the batch size.
prediction_error = y_CNN - Y
cost_OP = tf.nn.l2_loss(prediction_error, name="squared_error_cost")

# Alternatives that were tried and left disabled:
##cost_OP = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y,logits=fc2,name="cross_entropy")#WRONG?
#cost_OP = -tf.reduce_sum(Y * tf.log(y_CNN + 1e-5)) #Not great....
cost_OP

## *Optimization:

In [None]:
#PARAM: the step size is fed at run time through the `learningRate` placeholder.

# Disabled alternatives (a fixed rate, and an exponential decay schedule):
#learningRate = 0.001
#learningRate = tf.train.exponential_decay(learning_rate=0.008,
#                                         global_step= 1,
#                                         decay_steps=numDatapoints, #this is the number of datapoints...
#                                         decay_rate= 0.95,
#                                         staircase=True)

# Adam optimizer. The train op is created under a control dependency on
# UPDATE_OPS so that every training step also runs the ops collected there
# (e.g. the batch-norm moving-average updates).
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer = tf.train.AdamOptimizer(learningRate)
    training_OP = optimizer.minimize(cost_OP)

## Additional Ops:

In [None]:
# Initializer for every variable in the graph.
init_op = tf.initializers.global_variables()

# A prediction is correct when the most probable class equals the labelled class.
predicted_class = tf.argmax(y_CNN, axis=1)
labelled_class = tf.argmax(Y, axis=1)
correct_predictions_OP = tf.equal(predicted_class, labelled_class)

# Casting correct -> 1.0 and wrong -> 0.0 makes the mean equal to the accuracy.
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, tf.float32))

# Summaries: histogram of the class probabilities, scalar accuracy, scalar cost.
activation_summary_OP = tf.summary.histogram("output", y_CNN)
accuracy_summary_OP = tf.summary.scalar("accuracy", accuracy_OP)
cost_summary_OP = tf.summary.scalar("cost", cost_OP)

# *Running the Model:

### Inputs: MODIFY THIS TO SAVE OR LOAD MODEL

In [None]:
# The training parameters are defined in the Params cell near the top of the
# notebook. Uncomment and edit the copies below only when a loaded model
# needs different settings:
#   numEpochs = 50
#   batch_size = 256
#   drop_rate = 0.66
#   cost = 0
#   eps = 1
#   lrate = 0.001 #Learning rate

# Reset these before re-running if you want to train again.
model_dir = "./tmp/"
qSave = False                                  # write a checkpoint to savePath after training
savePath = model_dir + "model_v_XXXXXXXXX"
restore = False                                # load an old model instead of initialising fresh variables
restore_path = model_dir + "model_v_XXXXXXX"   # checkpoint to load when restore is True

### Training step:

In [None]:
# Train the model, report validation accuracy after every epoch and test
# accuracy at the end, and optionally save a checkpoint.

# --- Input pipeline (built once) -------------------------------------------
# Wrapping `train_data` in a new shuffle() on every epoch would stack one more
# shuffle buffer per epoch and add new iterator ops to the graph on each pass.
# Instead, a single initializable iterator is re-initialised at the start of
# every epoch; shuffle() produces a new order on each initialisation by default.
batched_dataset = train_data.shuffle(buffer_size=10000).batch(batch_size)
train_Iterator = batched_dataset.make_initializable_iterator()
next_training_batch = train_Iterator.get_next()

# --- Summaries --------------------------------------------------------------
# Histograms that show how the weights and biases change during training.
CONV1A_weightSummary = tf.summary.histogram("CONV1A_weights", W_conv1A)
CONV1A_biasSummary = tf.summary.histogram("CONV1A_biases", b_conv1A)

CONV1B_weightSummary = tf.summary.histogram("CONV1B_weights", W_conv1B)
CONV1B_biasSummary = tf.summary.histogram("CONV1B_biases", b_conv1B)

CONV1C_weightSummary = tf.summary.histogram("CONV1C_weights", W_conv1C)
CONV1C_biasSummary = tf.summary.histogram("CONV1C_biases", b_conv1C)

CONV1D_weightSummary = tf.summary.histogram("CONV1D_weights", W_conv1D)
CONV1D_biasSummary = tf.summary.histogram("CONV1D_biases", b_conv1D)

FULL2_weightSummary = tf.summary.histogram("FULL2_weights", W_fc2)
FULL2_biasSummary = tf.summary.histogram("FULL2_biases", b_fc2)

# Merge every summary registered in the graph into one op.
merged = tf.summary.merge_all()


def _eval_feed(x_ids, y_onehot):
    """Return a feed_dict for evaluation: no dropout, batch norm in inference mode."""
    return {X_ids: x_ids,
            Y: y_onehot,
            embeddings: embVals,
            keep_prob: 1.0,
            phase: False,
            learningRate: lrate}


with tf.Session() as sess:
    saver = tf.train.Saver()
    if restore:
        saver.restore(sess, restore_path)
    else:
        sess.run(init_op)

    writer = tf.summary.FileWriter("Logged_Summaries2", sess.graph)
    try:
        s = 0      # global step used to index the summaries
        epoch = 0  # keeps the final report valid even when numEpochs is 0
        for epoch in range(1,numEpochs+1):
            batch_cnt = 0
            # Restart (and reshuffle) the training data for this epoch.
            sess.run(train_Iterator.initializer)

            while True:
                # Only the fetch can signal the end of the epoch, so only the
                # fetch is guarded.
                try:
                    x_batch, y_batch = sess.run(next_training_batch)
                except tf.errors.OutOfRangeError:
                    print("Finished epoch:",epoch)
                    break

                # Count a step only once a batch was actually fetched.
                s += 1

                # One optimisation step; dropout on, batch norm in training mode.
                summ,step = sess.run([merged,training_OP],
                                     feed_dict={X_ids: x_batch,
                                                Y: y_batch,
                                                embeddings: embVals,
                                                keep_prob: drop_rate,
                                                phase: True,
                                                learningRate: lrate})
                writer.add_summary(summ, global_step=s)

                # Every 15th batch: re-evaluate the batch without dropout.
                if batch_cnt % 15 == 0:
                    train_accuracy, newCost = sess.run(
                        [accuracy_OP, cost_OP],
                        feed_dict=_eval_feed(x_batch, y_batch))

                    # Track the change in cost since the previous evaluation.
                    eps = abs(newCost - cost)
                    cost = newCost

                    # Print on every 45th batch. The previous test
                    # `if batch_cnt % 45:` was inverted: it printed on every
                    # evaluation EXCEPT multiples of 45.
                    if batch_cnt % 45 == 0:
                        print("batch %d, training accuracy %g, cost %g, change in cost %g, lrate %g"
                              %(batch_cnt, train_accuracy, newCost, eps,lrate))

                batch_cnt+=1

            # End of epoch: accuracy on the whole validation split.
            val_acc = sess.run(accuracy_OP, feed_dict=_eval_feed(x_val, y_val))
            print("Accuracy on Validation:",val_acc,"on completion of epoch",epoch)

        # After the last epoch: accuracy on the whole test split.
        test_acc = sess.run(accuracy_OP, feed_dict=_eval_feed(x_test, y_test))
        print("Final accuracy on Test Set:",test_acc,"on completion of epoch",epoch)

        if qSave:
            save_path = saver.save(sess, savePath)
            print("Model saved in path: %s" % save_path)
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.flush()
        writer.close()

In [None]:
# Completion marker printed when this cell is executed.
print("done!")

In [None]:
##TODO: Predict Tweets on the fly
#def predictTweet(tweet):  