In [4]:
# Import Packages

import re

import numpy as np
import pandas as pd
import tensorflow as tf

# For Data Preparation
import nltk.data
# Fetch the Punkt sentence-tokenizer data package (a no-op when it is already present locally).
nltk.download('punkt')
# English Punkt sentence tokenizer; it is handed to data_to_sentences when the corpus is built.
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

# For Word Embedding
# NOTE(review): `gensim`, `Phrases` and `logging` are not referenced in the visible cells — confirm they are still needed.
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
import logging

# For LSTM Model
# NOTE(review): `rnn` is not referenced in the visible cells (they spell out tf.contrib.rnn) — confirm it is still needed.
from tensorflow.contrib import rnn
import pprint

# Pretty-printer; the only visible use is inside the disabled LSTM shape demo.
pp = pprint.PrettyPrinter(indent=4)
# Notebook-wide interactive session; the disabled LSTM shape demo calls sess.run(...) and outputs.eval() against it.
sess = tf.InteractiveSession()
# Graph-level seed so variable initialisation is repeatable between runs.
tf.set_random_seed(777)  # reproducibility

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eunbeejang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




# Data Preparation

In [None]:
# Data Processing

# Alternative: read the raw file as plain text (kept for reference)
#   with open('FILE NAME', 'r') as f:
#       bias_data = f.read()

# Names for the columns (Header).
# Every name needs its own comma: adjacent string literals are silently
# concatenated by Python ('gender' 'age' == 'genderage'), which would leave
# only five names for six CSV columns and shift every column by one.
cols = ['sentence', 'bias', 'count', 'gender', 'age', 'affiliation']

# Load the data from a CSV (returns a pandas DataFrame).
# header=None: the file has no header row, so `cols` supplies the names.
bias_data = pd.read_csv('FILE NAME', sep=',', names=cols, header=None) # encoding='latin-1'
print("Data loaded")

# Accessing the data (kept for reference)
#   bias_data.values
#   bias_data[rows:cols]
#
# Basic stats about the data (kept for reference)
#   bias_data.describe()
#   bias_data.count()

In [None]:
# Convert a sentence into a list of words

def sentence_to_wordlist(sentence, remove_stopwords=False):
    """Turn one sentence into a list of lowercase word tokens.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted; what remains is lower-cased and
    split on runs of whitespace.

    Args:
        sentence: The text to tokenize.
        remove_stopwords: Accepted for interface compatibility; the body
            does not currently use it.

    Returns:
        A list of lowercase word strings (empty for blank input).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    return without_punctuation.lower().split()

In [None]:
# List of sentences where each sentence is a list of words
def data_to_sentences(data, tokenizer, remove_stopwords=False):
    """Split a text into sentences, each represented as a list of words.

    Args:
        data: The raw text to split. Leading/trailing whitespace is stripped
            before tokenizing.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of sentence strings (the notebook passes the NLTK Punkt tokenizer).
        remove_stopwords: Forwarded unchanged to ``sentence_to_wordlist``.

    Returns:
        A list of sentences, each a list of word strings. An empty list is
        returned when ``data`` cannot be processed (for example when it is not
        a string), so callers can always extend a list with the result.
    """
    try:
        # 1. Use the tokenizer to split the text into sentences
        raw_sentences = tokenizer.tokenize(data.strip())
        # 2. Convert every non-empty sentence into its list of words
        return [
            sentence_to_wordlist(raw_sentence, remove_stopwords=remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0
        ]
    except Exception as err:
        # Best effort: report the problem (with its cause) and hand back an
        # empty result rather than an implicit None.
        print('Nope! ERROR in ...def... data_to_sentences:', repr(err))
        return []

In [None]:
# Raw text of every row's 'sentence' column
bias_sentences = bias_data['sentence'].tolist()

# Corpus: a flat list of sentences, each sentence being a list of words
sentences = []

for raw_text in bias_sentences:
    try:
        # Need to first change "./." to "." so that sentences parse correctly
        # raw_text = raw_text.replace("/.", '')
        # Tokenize this one entry (not the whole list) into word lists
        parsed = data_to_sentences(raw_text, tokenizer)
        # A failed parse yields nothing usable; skip it instead of
        # extending the corpus with a non-list value.
        if parsed:
            sentences += parsed
    except Exception as err:
        print('Nonono! ERROR in ... creating a list of sentences:', repr(err))

print("There are " + str(len(sentences)) + " sentences in our corpus of bias_data.")

# Accessing each sentence:
#   sentences[index]

In [None]:
# Values of the 'bias' column as a plain Python list, one entry per row of bias_data.
# NOTE(review): `sentences` is produced by sentence-tokenizing the text, so it presumably
# does not have one entry per row — confirm the two lists line up before pairing them.
labels = bias_data['bias'].tolist()

# Word Embedding

In [None]:
# Create Word Vectors

# Train the model; min_count=1 keeps every word, even those seen only once.
model = Word2Vec(sentences, min_count=1)

# init_sims(replace=True) keeps only the L2-normalised vectors, which saves
# memory. The model cannot be trained any further afterwards, so only do this
# once training is finished.
model.init_sims(replace=True)

# Summarize the trained model
print(model)

# Summarize vocabulary.
# gensim >= 4 exposes the vocabulary as `wv.key_to_index`; older releases use
# `wv.vocab`. Support both so the cell runs on either.
word_vectors = model.wv
if hasattr(word_vectors, 'key_to_index'):
    words = list(word_vectors.key_to_index)
else:
    words = list(word_vectors.vocab)

# Show the size and a short preview rather than dumping the whole vocabulary.
VOCAB_PREVIEW = 50
print("Vocabulary size:", len(words))
print(words[:VOCAB_PREVIEW])

# Saving / reloading the model (kept for reference)
#   model.save('model.bin')
#   new_model = Word2Vec.load('model.bin')
#   print(new_model)
#
# Access the vector for one word (kept for reference)
#   print(model.wv['sentence'])


# LSTM Model Test

In [2]:
# RNN/LSTM Model

# Hyperparameters for the character-level toy problem
num_classes = 5        # number of distinct characters the model can emit
learning_rate = 0.1
hidden_size = 5        # size of output from the LSTM
input_dim = 5          # length of each one-hot input vector
sequence_length = 6    # |ihello| == 6
batch_size = 1         # number of sequences fed per training step


# Data Creation
# idx2char: a character's position in this list is its integer id
idx2char = ['h', 'i', 'e', 'l', 'o']

# Teach hello: hihell -> ihello
# Inputs as character ids, derived from the text instead of typed by hand
x_data = [[idx2char.index(ch) for ch in 'hihell']]
# Same inputs as one-hot rows: a 1 at the character's id, 0 elsewhere
x_one_hot = [[[int(pos == char_id) for pos in range(input_dim)]
              for char_id in seq]
             for seq in x_data]
# Targets: the input shifted by one character
y_data = [[idx2char.index(ch) for ch in 'ihello']]

# Disabled shape demo: the triple-quoted block below is a bare string literal, so none of
# the code inside it is executed. It feeds a (3, 5, 5) one-hot array through an LSTM cell
# and prints the input and output shapes.
"""
# One hot Encoding
h = [1, 0, 0, 0, 0] # 0:h
i = [0, 1, 0, 0, 0] # 1:i
e = [0, 0, 1, 0, 0] # 2:e
l = [0, 0, 0, 1, 0] # 3:l
o = [0, 0, 0, 0, 1] # 4:0

# shape of x_data : (batch size, sequence length, one hot vocab vector size)
x_data = np.array([[h,e,l,l,o],[e,o,l,l,l],[l,l,e,e,l]], dtype=np.float32)
print("x_data.shape = ", x_data.shape)
pp.pprint(x_data)
print("\n")


cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True, reuse=tf.AUTO_REUSE)
outputs, _states = tf.nn.dynamic_rnn(cell, x_data, dtype=tf.float32)
sess.run(tf.global_variables_initializer())

# shape of output : (batch size, sequence length, hidden size)
print("output.shape = ", outputs.shape)
pp.pprint(outputs.eval())
print("\n")

"""


# Graph inputs. The leading None leaves the batch dimension open in the placeholders.
X = tf.placeholder(tf.float32, [None, sequence_length, input_dim]) # X one-hot, None: Batch size
Y = tf.placeholder(tf.int32, [None, sequence_length])  # Y label


# Basic LSTM
cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True, reuse=tf.AUTO_REUSE)
# The zero initial state is built for exactly `batch_size` sequences, so although X's batch
# dimension is open, the graph is tied to that batch size here and in the reshape below.
initial_state = cell.zero_state(batch_size, tf.float32) # Initial State is always ZERO
outputs, _states = tf.nn.dynamic_rnn(cell, X, initial_state=initial_state, dtype=tf.float32)


# FC layer
# Flatten batch and time into one axis so a single dense layer is applied to every time step.
X_for_fc = tf.reshape(outputs, [-1, hidden_size])
# Hand-written equivalent of the fully_connected call below, kept for reference:
# fc_w = tf.get_variable("fc_w", [hidden_size, num_classes])
# fc_b = tf.get_variable("fc_b", [num_classes])
# outputs = tf.matmul(X_for_fc, fc_w) + fc_b
# activation_fn=None keeps the layer linear, so `outputs` holds raw logits.
outputs = tf.contrib.layers.fully_connected(inputs=X_for_fc, num_outputs=num_classes, activation_fn=None)



# Calculate Sequence_Loss
# Restore the (batch, time, class) layout that sequence_loss expects for its logits.
outputs = tf.reshape(outputs, [batch_size, sequence_length, num_classes])
# All-ones weights: every time step contributes equally to the loss.
weights = tf.ones([batch_size, sequence_length])
sequence_loss = tf.contrib.seq2seq.sequence_loss(logits=outputs, targets=Y, weights=weights)
# NOTE(review): with its default averaging flags sequence_loss presumably already returns a
# scalar, which would make this reduce_mean a no-op — TODO confirm.
loss = tf.reduce_mean(sequence_loss)

train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
# Predicted class id per time step: argmax over the class axis (axis 2 after the reshape above).
prediction = tf.argmax(outputs, axis=2)


# Teach RNN 'ihello'
NUM_TRAIN_STEPS = 20  # number of optimisation steps on the single training sequence

# Use a dedicated name for the training session. Binding it to `sess` would
# replace the notebook-wide InteractiveSession created in the setup cell with
# a session that is already closed once this `with` block exits.
with tf.Session() as train_sess:
    train_sess.run(tf.global_variables_initializer())
    for step in range(NUM_TRAIN_STEPS):
        # One optimisation step; fetch the loss computed for this step
        loss_value, _ = train_sess.run([loss, train], feed_dict={X: x_one_hot, Y: y_data})
        # Separate run so the prediction reflects the weights after the update
        result = train_sess.run(prediction, feed_dict={X: x_one_hot})
        print(step, "loss:", loss_value, "\nprediction: ", result, "\ntrue Y: ", y_data)

        # Map predicted ids back to characters via idx2char
        result_str = [idx2char[c] for c in np.squeeze(result)]
        print("\nPrediction str: ", ''.join(result_str), "\n==============")










0 loss: 1.60788 
prediction:  [[3 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  llllll 
1 loss: 1.51026 
prediction:  [[3 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  llllll 
2 loss: 1.4327 
prediction:  [[3 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  llllll 
3 loss: 1.34895 
prediction:  [[3 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  llllll 
4 loss: 1.25513 
prediction:  [[1 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  illlll 
5 loss: 1.14044 
prediction:  [[1 3 3 3 3 3]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  illlll 
6 loss: 1.01676 
prediction:  [[1 3 2 3 3 4]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  ilello 
7 loss: 0.896927 
prediction:  [[1 3 2 3 3 4]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  ilello 
8 loss: 0.769525 
prediction:  [[1 0 2 3 3 4]] 
true Y:  [[1, 0, 2, 3, 3, 4]]

Prediction str:  ihello 
9 loss: 0.655007 
prediction:  [[1 0 2 3 3 4]] 
true Y:  [[1, 0, 2, 3, 3

# LSTM MODEL for Bias Analysis