In [1]:
import numpy as np 
import pandas as pd 
import re
from random import randint
from sklearn.model_selection import train_test_split

maxSeqLength = 200
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 100000
numStories = 43000

In [None]:
fake_df = pd.read_csv('fake.csv')
print fake_df.columns
fake_text = fake_df['text'].values

In [None]:
real_df = pd.read_json('signalmedia-1m.jsonl',lines=True)
print real_df.columns
real_text = real_df['content'].values

In [None]:
# choose 50,000 random stories from the real dataset
real_i = np.random.choice(len(real_text), size=30000, replace=False)
real_text = real_text[real_i]

In [2]:
def loadGloveModel():
    print "Loading Glove Model"
    f = open('glove.6B.100d.txt','r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print "Done.",len(model)," words loaded!"
    return model

In [3]:
# glove word-vec dictionary
glove = loadGloveModel()

Loading Glove Model
Done. 400000  words loaded!


In [None]:
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [None]:
# stories x the first 200 words
train_matrix = np.empty((numStories, maxSeqLength+1), dtype='object')
story_index = 0
for story in real_text:
    words = np.empty((maxSeqLength+1), dtype='object')
    indexCounter = 0
    cleanedLine = cleanSentences(story)
    split = cleanedLine.split()
    if len(split)< maxSeqLength:
        indexCounter = maxSeqLength-len(split)
    else:
        split = split[:maxSeqLength]
    for word in split:
        words[indexCounter] = word
        indexCounter = indexCounter + 1
    words[maxSeqLength]="REAL"
    train_matrix[story_index] = words
    story_index+=1

for story in fake_text:
    words = np.empty((maxSeqLength+1), dtype='object')
    indexCounter = 0
    if type(story) is float:
        story = str(story)
    cleanedLine = cleanSentences(story)
    split = cleanedLine.split()
    if len(split)< maxSeqLength:
        indexCounter = maxSeqLength-len(split)
    else:
        split = split[:maxSeqLength]
    for word in split:
        words[indexCounter] = word
        indexCounter = indexCounter + 1
    words[maxSeqLength]="Fake"
    train_matrix[story_index] = words
    story_index+=1

np.save('my_train_matrix', train_matrix)

In [None]:
print train_matrix.shape
#matrix_data = np.load('train_matrix.npy')
np.save('my_train_matrix', train_matrix)

In [57]:
matrix_data = np.load('my_train_matrix.npy')
np.random.shuffle(matrix_data)
labels = matrix_data[:-1,-1]
data = matrix_data[:-1,:-1]
print data.shape
print labels.shape

(42999, 200)
(42999,)


In [5]:
glove_vecs = pd.Series(glove)
glove_vecs = np.array(list(glove_vecs.values))
print glove_vecs.shape 

(1917494, 300)


In [20]:
glove_keys=np.array(glove.keys(), dtype=object)
print glove_keys.shape

['ketyung' 'coolwaremax' 'pre-increment' 'talbe' 'seniro' 'talba'
 'tuberoses' 'pudz' 'gainsbourg' 'shuncheng' 'sebokeng' 'groooovy'
 'toiket' 'imagesgallery' 'airball' 'write/read' 'hiranuma' 'achla'
 'urbanbaby' 'blasse' 'microdesign' 'newstandard' 'sidebars' 'buy-here'
 'bertholet' 'flyingfungal'
 '-----------------------------------------------------------------------------------------------------------------------------------------'
 'compettive' 'accpetance' 'benedikt' '11.05.2008' 'deloreans' 'xoda'
 'gelbert' 'pre-vietnam' 'storiesin' 'sylves' 'sylver' '11.05.2007'
 '11.05.2006' 'sylven' 'downloadmoneyworld' 'delphyne' 'expeditionary'
 'gelberg' '\xc3\xbaltimamente' 'majorsjob' 'shoppingfood' 'sites.a'
 'water-course' 'tumtube.com' 'eyehole' 'sites.i' 'sidebar.' 'kletke'
 'again-off' 'essix' 'ozment' 'hans-erik' 'opravdu' 'fluorite' 'jidaigeki'
 'edgeways' 'embarressing' 'kleffman' 'sunrays' 'basolo' 'missesdressy'
 'agencyfaqs' 'avengedsevenfold' '230k-dot' '.6705' 'italso' 't

In [58]:
def word2vec(word):
    try:
        return glove[word]
    except KeyError:
        return np.zeros((100))

vec_data = [[word2vec(w) for w in story] for story in data]
vec_data = np.array(vec_data)
print vec_data.shape

(42999, 200, 100)


In [59]:
def to_binary(X):
    if X == "REAL":
        return 1
    else:
        return 0

def one_hot_encode(labels):
    labels = map(to_binary, labels)
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), [int(l) for l in labels]] = 1
    return one_hot_encode

In [61]:
binary_labels = one_hot_encode(labels)

In [62]:
x_train, x_val, y_train, y_val = train_test_split(vec_data, binary_labels, test_size=0.20, random_state=42)

In [63]:
import tensorflow as tf
batch_size = 1000
time_size = 5
class RNN_cell(object):

    """
    RNN cell object which takes 3 arguments for initialization.
    input_size = Input Vector size
    hidden_layer_size = Hidden layer size
    target_size = Output vector size

    """

    def __init__(self, input_size, hidden_layer_size, target_size):

        #Initialization of given values
        self.input_size = input_size
        self.hidden_layer_size = hidden_layer_size
        self.target_size = target_size
        
        # Weights for input and hidden tensor
        self.Wx = tf.Variable(tf.zeros([self.input_size,self.hidden_layer_size]))
        self.Wr = tf.Variable(tf.zeros([self.input_size,self.hidden_layer_size]))
        self.Wz = tf.Variable(tf.zeros([self.input_size,self.hidden_layer_size]))
        
        self.br = tf.Variable(tf.truncated_normal([self.hidden_layer_size],mean=1))
        self.bz = tf.Variable(tf.truncated_normal([self.hidden_layer_size],mean=1))
        
        self.Wh = tf.Variable(tf.zeros([self.hidden_layer_size,self.hidden_layer_size]))

        
        #Weights for output layer
        self.Wo = tf.Variable(tf.truncated_normal([self.hidden_layer_size,self.target_size],mean=1,stddev=.01))
        self.bo = tf.Variable(tf.truncated_normal([self.target_size],mean=1,stddev=.01))
        # Placeholder for input vector with shape[batch, seq, embeddings]
        self._inputs = tf.placeholder(tf.float32,
                                      shape=[None, None, self.input_size],
                                      name='inputs')

        # Processing inputs to work with scan function
        self.processed_input = process_batch_input_for_RNN(self._inputs)

        '''
        Initial hidden state's shape is [1,self.hidden_layer_size]
        In First time stamp, we are doing dot product with weights to
        get the shape of [batch_size, self.hidden_layer_size].
        For this dot product tensorflow use broadcasting. But during
        Back propagation a low level error occurs.
        So to solve the problem it was needed to initialize initial
        hiddden state of size [batch_size, self.hidden_layer_size].
        So here is a little hack !!!! Getting the same shaped
        initial hidden state of zeros.
        '''

        self.initial_hidden = self._inputs[:, 0, :]
        self.initial_hidden = tf.matmul(
            self.initial_hidden, tf.zeros([input_size, hidden_layer_size]))
        
        
    #Function for GRU cell
    def Gru(self, previous_hidden_state, x):
        """
        GRU Equations
        """
        z= tf.sigmoid(tf.matmul(x,self.Wz)+ self.bz)
        r= tf.sigmoid(tf.matmul(x,self.Wr)+ self.br)
        
        h_= tf.tanh(tf.matmul(x,self.Wx) + tf.matmul(previous_hidden_state,self.Wh)*r)
                    
        
        current_hidden_state = tf.multiply((1-z),h_) + tf.multiply(previous_hidden_state,z)
        
        return current_hidden_state     
    
    # Function for getting all hidden state.
    def get_states(self):
        """
        Iterates through time/ sequence to get all hidden state
        """

        # Getting all hidden state throuh time
        all_hidden_states = tf.scan(self.Gru,
                                    self.processed_input,
                                    initializer=self.initial_hidden,
                                    name='states')

        return all_hidden_states

    # Function to get output from a hidden layer
    def get_output(self, hidden_state):
        """
        This function takes hidden state and returns output
        """
        output = tf.nn.relu(tf.matmul(hidden_state, self.Wo) + self.bo)

        return output

    # Function for getting all output layers
    def get_outputs(self):
        """
        Iterating through hidden states to get outputs for all timestamp
        """
        all_hidden_states = self.get_states()

        all_outputs = tf.map_fn(self.get_output, all_hidden_states)

        return all_outputs


# Function to convert batch input data to use scan ops of tensorflow.
def process_batch_input_for_RNN(batch_input):
    """
    Process tensor of size [5,3,2] to [3,5,2]
    """
    batch_input_ = tf.transpose(batch_input, perm=[2, 0, 1])
    X = tf.transpose(batch_input_)

    return X

In [64]:
hidden_layer_size = 30
input_size = 100
target_size = 2

y = tf.placeholder(tf.float32, shape=[None, target_size],name='inputs')
#Initializing rnn object
rnn=RNN_cell( input_size, hidden_layer_size, target_size)

#Getting all outputs from rnn
outputs = rnn.get_outputs()

#Getting final output through indexing after reversing
last_output = outputs[-1]

#As rnn model output the final layer through Relu activation softmax is used for final output.
output=tf.nn.softmax(last_output)

#Computing the Cross Entropy loss 
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=y))

# Trainning with Adadelta Optimizer
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)


#Calculatio of correct prediction and accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(output,1))
accuracy = (tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))*100

In [65]:
training_epochs = 500
display_step = 1

sess=tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

for epoch in range(training_epochs):
    new_cost = 0.0
    total_batch = int(len(x_train) / batch_size)
    x_batches = np.array_split(x_train, total_batch)
    y_batches = np.array_split(y_train, total_batch)
    for i in range(total_batch):
        batch_x, batch_y = x_batches[i], y_batches[i]
        _, c = sess.run([train_step, cross_entropy], 
                        feed_dict={rnn._inputs:batch_x, y:batch_y})
        new_cost = c
    if epoch % display_step == 0:
        print("Epoch:", '%04d' % (epoch+1), "cost=", \
            "{:.9f}".format(new_cost))
print("Optimization Finished!")


('Epoch:', '0001', 'cost=', '0.622020185')
('Epoch:', '0002', 'cost=', '0.612218618')
('Epoch:', '0003', 'cost=', '0.598558247')
('Epoch:', '0004', 'cost=', '0.506338596')
('Epoch:', '0005', 'cost=', '0.491447955')
('Epoch:', '0006', 'cost=', '0.484116822')
('Epoch:', '0007', 'cost=', '0.481873631')
('Epoch:', '0008', 'cost=', '0.475472540')
('Epoch:', '0009', 'cost=', '0.471095741')
('Epoch:', '0010', 'cost=', '0.466704607')
('Epoch:', '0011', 'cost=', '0.463764578')
('Epoch:', '0012', 'cost=', '0.460626662')
('Epoch:', '0013', 'cost=', '0.457104266')
('Epoch:', '0014', 'cost=', '0.449717164')
('Epoch:', '0015', 'cost=', '0.461616725')
('Epoch:', '0016', 'cost=', '0.446820378')
('Epoch:', '0017', 'cost=', '0.441348851')
('Epoch:', '0018', 'cost=', '0.438674033')
('Epoch:', '0019', 'cost=', '0.436278164')
('Epoch:', '0020', 'cost=', '0.437655389')
('Epoch:', '0021', 'cost=', '0.436002493')
('Epoch:', '0022', 'cost=', '0.435992122')
('Epoch:', '0023', 'cost=', '0.435400903')
('Epoch:', 

KeyboardInterrupt: 

In [36]:
"""with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    print("Start session.")
    
    for epoch in range(training_epochs):
        new_cost = 0.0
        total_batch = int(len(x_train) / batch_size)
        x_batches = np.array_split(x_train, total_batch)
        y_batches = np.array_split(y_train, total_batch)
        for i in range(total_batch):
            batch_x, batch_y = x_batches[i], y_batches[i]
            _, c = sess.run([optimizer, cost], 
                            feed_dict={
                                x: batch_x, 
                                y: batch_y
                            })
            new_cost = c
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", \
                "{:.9f}".format(new_cost))
    print("Optimization Finished!")
    
    # Test model
    pred = tf.nn.softmax(logits)  # Apply softmax to logits
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: X_val, y: y_val}))
    
    #Now, save the graph
    predict = tf.argmax(pred, 1)
    test_pred = predict.eval({x: test})
    f = open('results.txt','a')
    
    print(test_pred.shape)
    for prediction in test_pred:
        f.write(str(prediction) + '\n')
    f.close()
    
    df = pd.DataFrame(test_pred)
    df.to_csv("results.csv")"""

NameError: name 'graph' is not defined