In [2]:
import os
import re
import csv
import sys
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors



In [3]:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences

In [4]:
from tensorflow.contrib import keras
import tensorflow as tf

In [4]:
# Load google pre-trained word2vec documents
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

In [13]:
def clean_sentence(sent, remove_stop = True, word_stem = True):
    sent = re.sub(r'[^\w]', ' ', sent).lower().split(' ')
    
    if remove_stop:
        stops = list(set(stopwords.words('english')))
        sent = [w for w in sent if w not in stops]
    
    if word_stem:
        stemmer = SnowballStemmer('english')
        stem_word = [stemmer.stem(w) for w in sent if w != '']
    sent = ' '.join(sent)
    return sent

In [6]:
question1_train = []
question2_train = []
labels_train = []
question1_test = []
question2_test = []

df_train = pd.read_csv('data/train.csv')

In [7]:
df_train = df_train[df_train.question2.notnull()]
df_train = df_train.drop(df_train.index[105780])
df_train.reset_index(inplace=True)

In [5]:
df_test = pd.read_csv('data/test.csv')

In [7]:
df_test.question1[df_test.question1.isnull()] = 'No Words'
df_test.question2[df_test.question2.isnull()] = 'No Words'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [8]:
for i in range(len(df_train)):
    sys.stdout.write('\rIndex:' + str(i))
    question1_train.append(clean_sentence(df_train.question1[i]))
    question2_train.append(clean_sentence(df_train.question2[i]))
    labels_train.append(df_train.is_duplicate[i])

Index:404286

In [None]:
for i in range(len(df_test)):
    sys.stdout.write('\rIndex:' + str(i))
    question1_test.append(clean_sentence(df_test.question1[i]))
    question2_test.append(clean_sentence(df_test.question2[i]))

In [9]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=250000)
tokenizer.fit_on_texts(question1_train + question2_train + question1_test + question2_test)

sequences_1 = tokenizer.texts_to_sequences(question1_train)
sequences_2 = tokenizer.texts_to_sequences(question2_train)
sequence_1_test = tokenizer.texts_to_sequences(question1_test)
sequence_2_test = tokenizer.texts_to_sequences(question2_test)

In [10]:
word_index = tokenizer.word_index

In [11]:
n_words = len(word_index) + 1

In [23]:
tf.reset_default_graph()

In [13]:
trainX_q1 = pad_sequences(sequences_1, maxlen=40, value=0.)
trainX_q2 = pad_sequences(sequences_2, maxlen=40, value=0.)

testX_q1 = pad_sequences(sequence_1_test, maxlen=40, value=0.)
testX_q2 = pad_sequences(sequence_2_test, maxlen=40, value=0.)

In [14]:
labels = np.array(labels_train, dtype=np.int_)

In [15]:
weight_matrix = np.zeros((n_words, 300))
for w, i in word_index.items():
    if w in word2vec.vocab:
        weight_matrix[i] = word2vec.word_vec(w)

In [16]:
trainY = to_categorical(labels_train, nb_classes=2)

In [17]:
weight_maxtrix = np.array(weight_matrix, dtype = np.float32)

In [24]:
net1 = tflearn.input_data([None, 40])
net1 = tflearn.embedding(net1, input_dim=n_words, output_dim=300, name='EmbeddingLayer')
net1 = tflearn.lstm(net1, 128, dropout=0.5)

net2 = tflearn.input_data([None, 40])
net2 = tflearn.embedding(net2, input_dim = n_words, output_dim=300, name='EmbeddingLayer')
net2 = tflearn.lstm(net2, 128, dropout=0.5)

network = tflearn.merge([net1, net2],'concat')
network = tflearn.fully_connected(network, 2, activation='softmax')
network = tflearn.regression(network, optimizer='adam', loss='categorical_crossentropy', learning_rate=0.001)

model = tflearn.DNN(network, tensorboard_verbose=0)
embeddingWeights = tflearn.get_layer_variables_by_name('EmbeddingLayer')[0]
model.set_weights(embeddingWeights, weight_matrix)

In [27]:
model.fit([trainX_q1,trainX_q2], trainY, validation_set=0.1, show_metric=True,
          batch_size=128)

Training Step: 855  | total loss: [1m[32m0.37707[0m[0m | time: 3.342s
[2K| Adam | epoch: 007 | loss: 0.37707 - acc: 0.8455 -- iter: 01152/18000


KeyboardInterrupt: 

In [None]:
# Save a model
model.save('my_model.tflearn')
# Load a model
model.load('my_model.tflearn')

In [None]:
tflearn.merge

In [None]:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)