In [1]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import string

# import some modules
from numpy import linalg as LA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.corpus import words
from __future__ import division



In [2]:
# English stop-word list loaded from NLTK's `stopwords` corpus; clean_text()
# drops every whitespace-separated token whose lowercase form appears in it.
STOP_WORDS = stopwords.words('english')

def remove_apostrophe(inp):
    """Return ``inp`` with every ASCII apostrophe (') swapped for one space.

    The string is not otherwise altered, so "don't" becomes "don t".
    """
    return inp.replace("'", " ")

def clean_text(inp, stop_words=None):
    """Drop stop words from ``inp`` and return the remaining words.

    The text is split on whitespace; a word is discarded when its lowercase
    form is a stop word. Surviving words keep their original casing and are
    re-joined with single spaces (so runs of whitespace are collapsed).
    No punctuation stripping is done: a token such as "the," is kept.

    Parameters
    ----------
    inp : str
        Text to filter.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, the module-level
        ``STOP_WORDS`` list is used, which is the original behaviour.

    Returns
    -------
    str
        The filtered text; an empty string if every word was removed.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    # Hash lookup instead of scanning the whole list once per word.
    stop_set = frozenset(stop_words)
    return ' '.join(word for word in inp.split()
                    if word.lower() not in stop_set)

def clean_matrix(df):
    """Clean the ``tweet`` column of ``df`` in place and return ``df``.

    Every tweet is first passed through ``remove_apostrophe`` and the result
    is then passed through ``clean_text``. The frame handed in is modified
    directly; the returned object is that same frame, not a copy.
    """
    # Order matters: apostrophes are turned into spaces before stop-word
    # filtering, so "don't" is filtered as the two tokens "don" and "t".
    for cleaner in (remove_apostrophe, clean_text):
        df['tweet'] = df['tweet'].apply(cleaner)
    return df

def transform_train_and_test_matrix(train_dataset_path, test_dataset_path,
                                    positive_handle='Hillary'):
    """Load the train/test tweet CSVs and build TF-IDF features and labels.

    Both files are read with latin1 encoding and must contain a ``tweet``
    column (the text) and a ``handle`` column (the author). Tweets are cleaned
    with ``clean_matrix``. The vocabulary and the IDF weights are fitted on
    the training tweets only and then applied unchanged to the test tweets,
    so both feature matrices have the same number of columns.

    Parameters
    ----------
    train_dataset_path, test_dataset_path : str
        Paths of the training and test CSV files.
    positive_handle : str, optional
        Value of ``handle`` that is encoded as label 1; every other value is
        encoded as 0. The same rule is applied to BOTH splits.

    Returns
    -------
    tuple of four numpy arrays
        ``(X_train, Y_train, X_test, Y_test)``. The X arrays are dense TF-IDF
        matrices of shape (n_samples, n_vocabulary); the Y arrays are integer
        0/1 column vectors of shape (n_samples, 1).
    """
    df_train = pd.read_csv(train_dataset_path, encoding='latin1')
    df_test = pd.read_csv(test_dataset_path, encoding='latin1')

    clean_dataframe_train = clean_matrix(df_train)
    clean_dataframe_test = clean_matrix(df_test)

    def _labels(df):
        # One shared encoding for train and test. The previous code used
        # 1 == 'Hillary' for train but 1 == 'Trump' for test, which inverted
        # the test labels relative to what the model was trained on.
        is_positive = df['handle'] == positive_handle
        return is_positive.astype(int).values.reshape(-1, 1)

    labels_train = _labels(df_train)
    labels_test = _labels(df_test)

    # Bag-of-words counts: fit the vocabulary on train, reuse it for test.
    transformer = CountVectorizer(decode_error='ignore')
    messages_transform_train = transformer.fit_transform(clean_dataframe_train['tweet'])
    messages_transform_test = transformer.transform(clean_dataframe_test['tweet'])

    # TF-IDF re-weighting with L2-normalised rows; IDF is learnt from train only.
    tf_idf_trans = TfidfTransformer(norm='l2', smooth_idf=False)
    messages_tfidf_train = tf_idf_trans.fit_transform(messages_transform_train)
    messages_tfidf_test = tf_idf_trans.transform(messages_transform_test)

    # Densify the sparse matrices for the downstream code.
    return (messages_tfidf_train.toarray(), labels_train,
            messages_tfidf_test.toarray(), labels_test)

In [3]:
# Build the TF-IDF feature matrices and the 0/1 label columns for both splits.
X_train, Y_train, X_test, Y_test = transform_train_and_test_matrix(
    'HillaryOrTrump_train.csv', 'HillaryOrTrump_test.csv')

# Sanity check of the array shapes. print(...) with format() emits exactly the
# same text under Python 2 and Python 3, unlike the Python-2-only statement form.
print('X_train shape {}'.format(X_train.shape))
print('Y_train shape {}'.format(Y_train.shape))
print('X_test shape {}'.format(X_test.shape))
print('Y_test shape {}'.format(Y_test.shape))

X_train shape (5000, 11408)
Y_train shape (5000, 1)
X_test shape (1444, 11408)
Y_test shape (1444, 1)


In [17]:
#################################---------------------------LSTM------------------###############################
import tensorflow as tf
from tensorflow.contrib import rnn

# Training Parameters# Train 
learning_rate = 0.001
training_steps = 10000
batch_size = X_train.shape[0]
display_step = 200

# Network Parameters
num_input = X_train.shape[0] # input of 
timesteps = X_train.shape[0] # timesteps
num_hidden =X_train.shape[0] # hidden layer num of features 
num_classes = 2 # Trump or Hillary

# tf Graph input
X = tf.placeholder("float", [num_input,num_hidden])
Y = tf.placeholder("float", [num_input,num_classes])

# Define weights# i guess this is where the weights and the biases are initialized
weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
biases = {'out': tf.Variable(tf.random_normal([num_classes]))}

def  RNN(x, weights, biases):

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, timesteps, n_input)
    # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

    # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)

    x = tf.unstack(x, timesteps, 1)
    
    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0,reuse=tf.AUTO_REUSE)

    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

logits = RNN(X, weights, biases)
prediction = tf.nn.sigmoid(logits)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, training_steps+1):
#         batch_x, batch_y = mnist.train.next_batch(batch_size) #this is where teh k-folds go?
        # Reshape data to get 28 seq of 28 elements
        X_train = X_train.reshape((batch_size, timesteps, num_input))
        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: X_train, Y: Y_train})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: X_train,
                                                                 Y: Y_train})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

ValueError: Input 0 of layer basic_lstm_cell_9 is incompatible with the layer: expected ndim=2, found ndim=1. Full shape received: [5000]