# Quora question pairs.

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd

TensorFlow 2.x selected.


In [2]:
# Confirm which TensorFlow version the runtime actually loaded.
print(tf.__version__)

2.1.0


In [3]:
from google.colab import drive
# Mount Google Drive under /content/gdrive so the dataset stored there can be
# read; the call prompts interactively for an authorization code.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Location of the Quora question-pairs training CSV on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/gdrive/My Drive/Colab Notebooks/capstone2/quora_questions_similarity/train.csv'

# index_col=False tells pandas not to use the first CSV column as the index.
train = pd.read_csv(TRAIN_CSV_PATH, encoding='utf-8', index_col=False)

In [6]:
# Preview the first ten raw question pairs together with their ids and labels.
train.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [0]:
# The identifier columns carry no text signal; keep only the two questions
# and the is_duplicate label.
train = train.drop(columns=['id', 'qid1', 'qid2'])

In [0]:
# Discard every row that has a missing value in any of the remaining columns.
train = train.dropna()

In [10]:
# (rows, columns) after removing the id columns and the incomplete rows.
train.shape

(404287, 3)

# Clean dataset.

In [11]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the preprocessor below: the 'punkt'
# tokenizer models (needed by word_tokenize) and the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

# Built once instead of on every call: the translation table that deletes
# ASCII punctuation, and a lazily filled cache for the stop-word set.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    '''Return NLTK's English stop words as a frozenset, loading them only once.'''
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocessor(text):
    '''Normalise one question into a space-separated string of content words.

    Steps, applied in this order: tokenize with NLTK's word_tokenize,
    lower-case, delete ASCII punctuation characters from every token, keep
    only tokens that are purely alphabetic, drop English stop words, and join
    what is left with single spaces.

    Args:
        text: the raw question string.

    Returns:
        The cleaned string; it is empty when no token survives the filters.

    Note:
        Markup is not removed as such. Only its punctuation is stripped, so
        '[math]23^{24}[/math]' still contributes the word 'math' twice.
    '''
    # Set membership is O(1); the original rebuilt a list from the corpus
    # and scanned it linearly for every word of every question.
    stop_words = _english_stop_words()

    words = []
    for token in word_tokenize(text):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # isalpha() is False for '' as well, so tokens that consisted only of
        # punctuation vanish here together with numbers and mixed tokens.
        if word.isalpha() and word not in stop_words:
            words.append(word)

    return ' '.join(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Take the second question of the row labelled 3 as a sample input for the
# preprocessor.
test_string = train.loc[3, 'question2']

In [13]:
# Show the raw sample before cleaning.
test_string

'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

In [14]:
# Run the sample through the cleaning pipeline to eyeball its output.
preprocessor(test_string)

'find remainder math math divided'

In [0]:
# Clean every first question; the raw text in the column is overwritten.
train['question1'] = train['question1'].apply(preprocessor)

In [0]:
# Clean every second question the same way; the raw text is overwritten.
train['question2'] = train['question2'].apply(preprocessor)

In [18]:
# Preview the first ten question pairs after cleaning.
train.head(10)

Unnamed: 0,question1,question2,is_duplicate
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve,find remainder math math divided,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0
5,astrology capricorn sun cap moon cap rising say,triple capricorn sun moon ascendant capricorn say,1
6,buy tiago,keeps childern active far phone video games,0
7,good geologist,great geologist,1
8,use シ instead し,use instead,0
9,motorola company hack charter motorolla,hack motorola free internet,0


# Create a dataset.

In [0]:
# pop() returns the label column AND removes it from `train`, so re-running
# this cell without reloading the data raises a KeyError.
target = train.pop('is_duplicate')

In [0]:
# Build a dataset with one element per row of the cleaned frame; each element
# is a (question1, question2, label) triple.
train_valid_raw = tf.data.Dataset.from_tensor_slices(
    (train['question1'].values, train['question2'].values, target.values))

In [36]:
# Peek at the first three (question1, question2, label) triples.
for quest1, quest2, label in train_valid_raw.take(3):
    tf.print(quest1.numpy(), quest2.numpy(), label)

b'isis really want' b'isis want accomplish' 1
b'apps automatically installed phone get rid' b'phone gets switched automatically' 0
b'know actually nt like something' b'know like something' 0


In [35]:
# Display the dataset's element shapes and dtypes.
# NOTE(review): the saved output reports a ShuffleDataset, which suggests this
# cell was last run after the shuffle cell further down; re-run the notebook
# top to bottom to refresh it.
train_valid_raw

<ShuffleDataset shapes: ((), (), ()), types: (tf.string, tf.string, tf.int64)>

In [42]:
# NOTE(review): same inspection as the preceding cell; one of the two can go.
train_valid_raw

<ShuffleDataset shapes: ((), (), ()), types: (tf.string, tf.string, tf.int64)>

# Train / validation / test split. 

In [0]:
tf.random.set_seed(1)

# Shuffle buffer as large as the row count reported by train.shape, so the
# shuffle covers the entire dataset.
SHUFFLE_BUFFER_SIZE = 404287
# Number of examples assigned to training; everything after them is validation.
TRAIN_SIZE = 300000

# reshuffle_each_iteration=False keeps the shuffled order fixed between
# iterations, which is what lets take() and skip() below act as two
# complementary slices of the same sequence.
train_valid_raw = train_valid_raw.shuffle(
    buffer_size=SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=False)

train_raw = train_valid_raw.take(TRAIN_SIZE)
valid_raw = train_valid_raw.skip(TRAIN_SIZE)

# Find unique tokens.

In [34]:
from collections import Counter
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()
token_counts = Counter()

# Tally every token of both questions in every pair of train_valid_raw
# (i.e. training and validation examples alike).
for quest1, quest2, label in train_valid_raw:
    for question in (quest1, quest2):
        token_counts.update(tokenizer.tokenize(question.numpy()))

print('Vocab-size:', len(token_counts))

Vocab-size: 93835


# Encoding each unique token into integers.

In [0]:
# Build the integer encoder from the vocabulary; iterating the Counter yields
# each distinct token once, so the counts themselves are not used here.
encoder = tfds.features.text.TokenTextEncoder(token_counts)

In [38]:
# Sanity check: encode a short sample string into its integer token ids.
example_str = 'read watch although'
encoder.encode(example_str)

[416, 455, 5611]

# Define the functions for integer encoding.

In [0]:
def encode(quest1, quest2, label):
    '''Convert both questions of one pair into integer token ids.

    quest1 and quest2 must expose .numpy() (eager tensors holding the
    question text); each is passed through the module-level `encoder`.
    The label is handed back exactly as received.

    Returns:
        (encoded question 1, encoded question 2, label)
    '''
    # Materialise both texts first, then encode them in order.
    raw_quest1, raw_quest2 = (question.numpy() for question in (quest1, quest2))
    return encoder.encode(raw_quest1), encoder.encode(raw_quest2), label

def encode_map_fn(quest1, quest2, label):
    '''Adapter that lets Dataset.map call the Python-level `encode`.

    `encode` needs eager values (.numpy()), so it is run through
    tf.py_function; all three results are declared as int64 tensors.
    '''
    output_types = (tf.int64,) * 3
    return tf.py_function(func=encode, inp=[quest1, quest2, label],
                          Tout=output_types)

In [0]:
# Integer-encode both splits element by element. The names are rebound to the
# encoded datasets, so this cell should be run only once per split.
train_raw = train_raw.map(encode_map_fn)
valid_raw = valid_raw.map(encode_map_fn)

In [57]:
tf.random.set_seed(1)
# Print the encoded lengths of both questions for five shuffled training pairs.
for example in train_raw.shuffle(1000).take(5):
    quest1_shape, quest2_shape = example[0].shape, example[1].shape
    print('Sequences lengths are' + str(quest1_shape) + str(quest2_shape))

# Display the last pair drawn by the loop above.
example

Sequences lengths are(3,)(2,)
Sequences lengths are(13,)(10,)
Sequences lengths are(3,)(3,)
Sequences lengths are(8,)(4,)
Sequences lengths are(4,)(4,)


(<tf.Tensor: shape=(4,), dtype=int64, numpy=array([ 181, 3026, 3027, 3028])>,
 <tf.Tensor: shape=(4,), dtype=int64, numpy=array([2809, 3026, 3029,  743])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

# Batching the dataset.

In [0]:
# Number of question pairs per batch.
BATCH_SIZE = 32
# One entry per dataset component (question1, question2, label): [-1] pads a
# variable-length id sequence to the longest one in the batch, [] leaves the
# scalar label untouched. The two questions are padded independently.
PADDED_SHAPES = ([-1], [-1], [])

# The encoded splits produced above are called train_raw / valid_raw; the
# previous ds_train / ds_valid names were never defined in this notebook.
train_data = train_raw.padded_batch(BATCH_SIZE, padded_shapes=PADDED_SHAPES)
valid_data = valid_raw.padded_batch(BATCH_SIZE, padded_shapes=PADDED_SHAPES)

# NOTE(review): this cell used to build `test_data` from `ds_test` as well,
# but no test split is created anywhere above (only train_raw and valid_raw).
# Carve out and encode a held-out split first, then batch it with the same
# BATCH_SIZE / PADDED_SHAPES.

# RNN model.