In [1]:
!pip install kaggle
from google.colab import files
uploaded = files.upload()
!mkdir .kaggle
!mv kaggle.json ./.kaggle/kaggle.json
!kaggle competitions download -c si650winter11
!mv ./.kaggle/competitions/si650winter11/* ./

Collecting kaggle
  Downloading https://files.pythonhosted.org/packages/74/9b/a6bdc22939c32e1cecc086a972b5e3f7c991b73d74dab0e5d82ff80c5968/kaggle-1.3.8.tar.gz
Building wheels for collected packages: kaggle
  Running setup.py bdist_wheel for kaggle ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/d6/3c/92/375decc0cf0a65b281db3cca9b8e21dc27b8ebc4d1ce29afca
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.3.8


Saving kaggle.json to kaggle.json
testdata.txt: Downloaded 2MB of 2MB to /content/.kaggle/competitions/si650winter11
training.txt: Downloaded 437KB of 437KB to /content/.kaggle/competitions/si650winter11


In [2]:
!ls

datalab  testdata.txt  training.txt


In [0]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
%matplotlib inline

In [0]:
sentiment_data = pd.read_csv('training.txt', sep='\t')
sentiment_data.columns =['Class', 'Data']

In [0]:
unlabeld_data = pd.read_csv('testdata.txt', sep='\t')
unlabeld_data.columns = ['Data']

In [7]:
sentiment_data.head()

Unnamed: 0,Class,Data
0,1,this was the first clive cussler i've ever rea...
1,1,i liked the Da Vinci Code a lot.
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...


In [8]:
unlabeld_data.head()

Unnamed: 0,Data
0,"harvard is dumb, i mean they really have to be..."
1,I'm loving Shanghai > > > ^ _ ^.
2,harvard is for dumb people.
3,"As i stepped out of my beautiful Toyota, i hea..."
4,"Bodies being dismembered, blown apart, and mut..."


In [0]:
# Shuffle data
from sklearn.utils import shuffle
sentiment_data = shuffle(sentiment_data)
unlabeld_data = shuffle(unlabeld_data)

In [10]:
sentiment_data.head()

Unnamed: 0,Class,Data
1188,1,Which is why i said silent hill turned into re...
4133,0,The Da Vinci Code sucked balls.
309,1,the people who are worth it know how much i lo...
2411,1,"I want to be here because I love Harry Potter,..."
2165,1,I love Harry Potter ( the books are much bette...


In [0]:
labels = sentiment_data.iloc[:, 0].values
reviews = sentiment_data.iloc[:, 1].values
unlabeled_reviews = unlabeld_data.iloc[:,0].values

In [0]:
# remove punctuation
reviews_processed = []
unlabeled_processed = [] 
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)
    
for review in unlabeled_reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    unlabeled_processed.append(review_cool_one)

In [0]:
word_reviews = []
word_unlabeled = []
all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())

for review in unlabeled_processed:
    word_unlabeled.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())
    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

In [0]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

In [0]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])

In [0]:
unlabeled_to_ints = []

for review in word_unlabeled:
    unlabeled_to_ints.append([vocab_to_int[word] for word in review])

In [0]:
# creating word vectors
seq_len = 250

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]
    
features_test = np.zeros((len(unlabeled_to_ints), seq_len), dtype=int)
for i, review in enumerate(unlabeled_to_ints):
    features_test[i, -len(review):] = np.array(review)[:seq_len]

In [18]:
X_train = features[:6400]
y_train = labels[:6400]

X_test = features[6400:]
y_test = labels[6400:]

X_unlabeled = features_test

print('X_trian shape {}'.format(X_train.shape))
print('X_unlabeled shape {}'.format(X_unlabeled.shape))

X_trian shape (6400, 250)
X_unlabeled shape (28936, 250)


In [0]:
hidden_layer_size = 512 
number_of_layers = 1
batch_size = 100 
learning_rate = 0.001 
number_of_words = len(vocab_to_int)
dropout_rate = 0.8 
embed_size = 300 
epochs = 6 

In [0]:
tf.reset_default_graph()

In [0]:
inputs = tf.placeholder(tf.int32, [None, 250], name='inputs')
targets = tf.placeholder(tf.int32, [None, 1], name='targets')

In [0]:
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)

In [0]:
hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer])
init_state = cell.zero_state(batch_size, tf.float32)

In [0]:
outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)

In [53]:
X_train.shape

(6400, 250)

In [0]:
prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(targets, tf.float32), logits=prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [0]:
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

In [0]:
session = tf.Session()

In [0]:
session.run(tf.global_variables_initializer())

In [59]:
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 0/6  | Current loss: 0.5148965716362  | Training accuracy: 93.6094
Epoch: 1/6  | Current loss: 0.4861615002155304  | Training accuracy: 98.0938
Epoch: 2/6  | Current loss: 0.4824259579181671  | Training accuracy: 98.9531
Epoch: 3/6  | Current loss: 0.4807281494140625  | Training accuracy: 99.2813
Epoch: 4/6  | Current loss: 0.47907477617263794  | Training accuracy: 99.6094
Epoch: 5/6  | Current loss: 0.4792642891407013  | Training accuracy: 99.6250


In [0]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size

In [0]:
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))

Test accuracy is 98.8000%


In [0]:
#testing phase
predictions_unlabeled = []
ii = 0
while ii + batch_size <= len(X_unlabeled):
    if ii + batch_size > len(X_unlabeled):
        batch_size = len(X_unlabeled) - ii
    X_batch = X_unlabeled[ii:ii+batch_size]
    y_batch = X_unlabeled[ii:ii+batch_size].reshape(-1, 1)

    pred = session.run([prediction], feed_dict={inputs:X_batch, targets:y_batch})
    
    predictions_unlabeled.append(pred)
    ii += batch_size

In [0]:
pred_real = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [0]:
np.savetxt('predictions.txt', pred_real)

In [0]:
new_dataframe = unlabeld_data[:len(pred_real)]

In [0]:
new_dataframe['Classes'] = pred_real

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [0]:
new_dataframe

Unnamed: 0,Data,Classes
6244,london sucks....,0
22537,I love the Toyota Prius.,1
28827,Great job ata soccer..........,1
23001,"AAA's "" Q "" is catchy and an ear worm, like ma...",1
6567,i love shanghai ~ ~ ~ ~ 外滩好像有一家专卖上海纪念品的小店 ~ ~ ~.,1
17371,+ + + Bruce Willis hat den PrÃ ¤ sident von Ko...,0
9954,"Since then, 25 automakers including Toyota Mot...",0
18476,And as stupid as San Francisco's road system i...,1
8848,"Today, when Monkee was backing out of the Milp...",0
6277,Then we had stupid trivia about San Francisco ...,0
