***Dataset:*** https://www.kaggle.com/c/si650winter11/data

In [9]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
from sklearn.utils import shuffle
%matplotlib inline

### Load Data

In [2]:
labeled_data = pd.read_csv('datasets/kaggle/training.txt', sep='\t')
labeled_data.columns =['Class', 'Data']

In [5]:
unlabeled_data = pd.read_csv('datasets/kaggle/testing.txt', sep='\t')
unlabeled_data.columns = ['Data']

In [14]:
labeled_data = shuffle(labeled_data)
unlabeled_data = shuffle(unlabeled_data)

In [15]:
labeled_data.head()

Unnamed: 0,Class,Data
4127,0,"Besides, Da Vinci Code sucks..."
3685,1,Brokeback Mountain was so awesome.
3070,1,I love playing defensive positions and i love ...
1147,1,i like mission impossible.
5808,0,"Is it just me, or does Harry Potter suck?..."


In [16]:
unlabeled_data.head()

Unnamed: 0,Data
18937,I need to pay Geico and a host of other bills ...
12164,"I like Tom Cruise, as I've stated over and over."
5716,. I'm pleased to announce that Boston sucked...
1439,"If, however, your eyes can't handle the resolu..."
15957,Tom Cruise sucks.


In [17]:
labels = labeled_data.iloc[:, 0].values
reviews = labeled_data.iloc[:, 1].values
unlabeled_reviews = unlabeled_data.iloc[:,0].values

In [18]:
labels

array([0, 1, 1, ..., 1, 1, 0])

In [19]:
reviews

array(['Besides, Da Vinci Code sucks...',
       'Brokeback Mountain was so awesome.',
       'I love playing defensive positions and i love brokeback mountain..',
       ...,
       'i love being a sentry for mission impossible and a station for bonkers.',
       '* brokeback mountain is an awesome movie..',
       'I hate Harry Potter, that daniel wotshisface needs a fucking slap...'], dtype=object)

In [20]:
unlabeled_reviews

array([ 'I need to pay Geico and a host of other bills but that is neither here nor there.',
       "I like Tom Cruise, as I've stated over and over.",
       ". I'm pleased to announce that Boston sucked...", ...,
       'Boston can suck my fucking tits...',
       'Toyota is doing some amazing things with fuel economy.',
       'Stupid UCLA.'], dtype=object)

### Clean Data

In [21]:
reviews_processed = []
unlabeled_processed = [] 
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)
    
for review in unlabeled_reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    unlabeled_processed.append(review_cool_one)

In [24]:
reviews_processed[:5]

['Besides Da Vinci Code sucks',
 'Brokeback Mountain was so awesome',
 'I love playing defensive positions and i love brokeback mountain',
 'i like mission impossible',
 'Is it just me or does Harry Potter suck']

In [25]:
word_reviews = []
word_unlabeled = []
all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())

for review in unlabeled_processed:
    word_unlabeled.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())
    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

In [26]:
word_reviews[:5]

[['besides', 'da', 'vinci', 'code', 'sucks'],
 ['brokeback', 'mountain', 'was', 'so', 'awesome'],
 ['i',
  'love',
  'playing',
  'defensive',
  'positions',
  'and',
  'i',
  'love',
  'brokeback',
  'mountain'],
 ['i', 'like', 'mission', 'impossible'],
 ['is', 'it', 'just', 'me', 'or', 'does', 'harry', 'potter', 'suck']]

In [29]:
vocab[:10]

['i', 'the', 'and', 'love', 'is', 'a', 'to', 'my', 'of', 'that']

### Process Data

In [32]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}
print(vocab_to_int['awesome'])

17


In [34]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])
print(reviews_to_ints[1])

[43, 44, 14, 15, 17]


In [36]:
unlabeled_to_ints = []
for review in word_unlabeled:
    unlabeled_to_ints.append([vocab_to_int[word] for word in review])

In [37]:
reviews_lens = Counter([len(x) for x in reviews_to_ints])
print(reviews_lens)

Counter({4: 1111, 5: 770, 6: 716, 7: 591, 11: 533, 15: 446, 8: 399, 14: 286, 12: 278, 21: 253, 9: 212, 25: 170, 10: 157, 13: 130, 16: 115, 17: 104, 20: 97, 22: 94, 28: 88, 34: 84, 33: 83, 3: 35, 19: 33, 18: 30, 23: 19, 24: 14, 27: 13, 31: 11, 26: 10, 29: 7, 32: 6, 30: 5, 36: 4, 72: 2, 35: 2, 311: 1, 63: 1, 104: 1, 931: 1, 38: 1, 333: 1, 175: 1, 44: 1, 40: 1})


In [38]:
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens)))

Zero-length 0
Max review length 931


In [42]:
# Creating Word Vectors
seq_len = 250

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
print(len(reviews_to_ints))
print(features.shape)
print(features)

6917
(6917, 250)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [43]:
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]
print(features)

[[  0   0   0 ...,  41  45  20]
 [  0   0   0 ...,  14  15  17]
 [  0   0   0 ...,   4  43  44]
 ..., 
 [  0   0   0 ..., 675  21 945]
 [  0   0   0 ...,  76  17 102]
 [  0   0   0 ...,   6 153 788]]


In [49]:
features_test = np.zeros((len(unlabeled_to_ints), seq_len), dtype=int)
for i, review in enumerate(unlabeled_to_ints):
    features_test[i, -len(review):] = np.array(review)[:seq_len]
print(len(unlabeled_to_ints))
print(features_test.shape)
print(features_test)

28936
(28936, 250)
[[   0    0    0 ...,  114  682  113]
 [   0    0    0 ...,  129    3  129]
 [   0    0    0 ...,   10   35   81]
 ..., 
 [   0    0    0 ...,    8  153 1080]
 [   0    0    0 ...,   52  624  701]
 [   0    0    0 ...,    0   19   42]]


In [51]:
X_train = features[:6400]
y_train = labels[:6400]

X_test = features[6400:]
y_test = labels[6400:]

X_unlabeled = features_test

print('X_train shape {}'.format(X_train.shape))
print('X_test shape {}'.format(X_test.shape))
print('X_unlabeled shape {}'.format(X_unlabeled.shape))

X_train shape (6400, 250)
X_test shape (517, 250)
X_unlabeled shape (28936, 250)


### Build RNN Model

In [68]:
hidden_layer_size = 512
number_of_layers = 1 
batch_size = 100
learning_rate = 0.001
number_of_words = len(vocab_to_int) + 1
dropout_rate = 0.8 
embedding_size = 300 
epochs = 3

In [69]:
tf.reset_default_graph() 

In [70]:
# Placeholders
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')

In [71]:
# Embedding Layer
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embedding_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)

In [72]:
# Hidden Layer
hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
init_state = cell.zero_state(batch_size, tf.float32)

In [73]:
outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)

In [74]:
# Backpropagation
prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
cost = tf.losses.mean_squared_error(targets, prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [75]:
# Accuracy
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

### Training

In [76]:
session = tf.Session()
session.run(tf.global_variables_initializer())
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i + 1, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 1/3  | Current loss: 0.04499276727437973  | Training accuracy: 94.4375
Epoch: 2/3  | Current loss: 0.008344389498233795  | Training accuracy: 98.9531
Epoch: 3/3  | Current loss: 0.005247828084975481  | Training accuracy: 99.3906


### Testing

In [77]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))

Test accuracy is 98.8000%


In [None]:
predictions_unlabeled = []
ii = 0
while ii + batch_size <= len(X_unlabeled):
    if ii + batch_size > len(X_unlabeled):
        batch_size = len(X_unlabeled) - ii
    X_batch = X_unlabeled[ii:ii+batch_size]
    y_batch = X_unlabeled[ii:ii+batch_size].reshape(-1, 1)

    pred = session.run([prediction], feed_dict={inputs:X_batch, targets:y_batch})
    
    predictions_unlabeled.append(pred)
    ii += batch_size

In [84]:
pred_real  = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [85]:
np.savetxt('predictions.txt', pred_real)

In [87]:
new_dataframe = unlabeled_data[:len(pred_real)]

In [None]:
new_dataframe['Classes'] = pred_real

In [89]:
new_dataframe

Unnamed: 0,Data,Classes
18937,I need to pay Geico and a host of other bills ...,0
12164,"I like Tom Cruise, as I've stated over and over.",1
5716,. I'm pleased to announce that Boston sucked...,0
1439,"If, however, your eyes can't handle the resolu...",0
15957,Tom Cruise sucks.,0
23678,"By the time I left the Hospital, I had no time...",0
26653,Three days at Purdue with three awesome people.,1
24986,"I really, really hate TOM CRUISE.",0
5761,I really hate Tom Cruise.,0
20284,How terrible London is ar ~ ~ share a bit with...,0
