# Text Similarity with Quora Question Pairs
## Preprocessing
### 1. Data 
#### 1.1 Data load

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

import re
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [87]:
# Folder holding the Quora CSV files (machine-specific absolute path) and the
# name of the labelled training file inside it.
DATA_PATH = "C:/python/NLP/Chap_5/"
TRAIN_FILE = "train.csv"

# Read the labelled question pairs and preview the first rows.
train_csv_path = DATA_PATH + TRAIN_FILE
train = pd.read_csv(train_csv_path, encoding='UTF-8')
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [12]:
# Summarise column dtypes and non-null counts of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


#### 1.2 Data sampling 
##### Mitigate class imbalance by undersampling the majority (non-duplicate) class

In [26]:
# Fixed seed so the undersampled negative set -- and everything derived from it
# (tokenizer vocabulary, saved arrays) -- is identical on every run.
RANDOM_SEED = 42

# Split the training pairs by label.
train_pos_data = train.loc[train['is_duplicate'] == 1]
train_neg_data = train.loc[train['is_duplicate'] == 0]

# Fraction of negative rows to keep so that the negative class shrinks to the
# size of the positive class.
class_difference = len(train_neg_data) - len(train_pos_data)
sample_frac = 1 - (class_difference / len(train_neg_data))

# Randomly drop the surplus negatives (reproducibly, thanks to random_state).
train_neg_data = train_neg_data.sample(frac=sample_frac, random_state=RANDOM_SEED)
print(sample_frac)

0.5852831268846044


In [25]:
# Report the row count of each class after undersampling.
print("Duplicated: {}".format(len(train_pos_data)))
print("Not duplicated: {}".format(len(train_neg_data)))

Duplicated: 149263
Not duplicated: 149263


In [28]:
# Stack the positive rows on top of the undersampled negative rows to form the
# balanced training frame. Original row labels are kept (no index reset) and the
# rows are not shuffled here.
balanced_parts = [train_pos_data, train_neg_data]
train = pd.concat(balanced_parts)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298526 entries, 5 to 197611
Data columns (total 6 columns):
id              298526 non-null int64
qid1            298526 non-null int64
qid2            298526 non-null int64
question1       298526 non-null object
question2       298524 non-null object
is_duplicate    298526 non-null int64
dtypes: int64(4), object(2)
memory usage: 15.9+ MB


### 2. Text normalization

In [42]:
# Every character that is not an ASCII letter or digit is replaced by a space.
FILTERS = '[^a-zA-Z0-9]'              # Including number
change_filter = re.compile(FILTERS)

# Missing questions (NaN) become empty strings. A plain str() would turn them
# into the literal word "nan", which would then be tokenized as a real word.
question1 = ['' if pd.isna(s) else str(s) for s in train['question1']]
question2 = ['' if pd.isna(s) else str(s) for s in train['question2']]

# Strip the filtered characters and lower-case each question.
re_question1 = [change_filter.sub(' ', q).lower() for q in question1]
re_question2 = [change_filter.sub(' ', q).lower() for q in question2]

### 3. Tokenizing

In [48]:
# Fit one tokenizer on the cleaned text of both question columns so that
# question1 and question2 share the same word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(re_question1 + re_question2)

In [49]:
# Encode each cleaned question as a list of word indices from the fitted tokenizer.
question1_sequence, question2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (re_question1, re_question2)
)

### 4. Padding

In [62]:
# Inspect the first encoded question (a list of integer word indices).
question1_sequence[0]

[2123, 5, 81, 7, 9343, 950, 4371, 778, 12, 4371, 5787, 2, 21, 29, 259, 49, 66]

In [63]:
# Fixed length of every encoded question; padding is added at the end ('post').
MAX_SEQUENCE_LENGTH = 31

# Shared padding options so both question columns are processed identically.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q1_data = pad_sequences(question1_sequence, **pad_options)
q2_data = pad_sequences(question2_sequence, **pad_options)

### 5. Save
##### For train set

In [75]:
# Duplicate / not-duplicate targets as an integer array, and the word index
# learned by the tokenizer.
labels = np.array(train['is_duplicate'], dtype=int)
word_vocab = tokenizer.word_index

# Check that inputs and labels line up, and report the vocabulary size.
for template, array in (
    ('Shape of question1_data: {}', q1_data),
    ('Shape of question2_data: {}', q2_data),
    ('Shape of Label: {}', labels),
):
    print(template.format(array.shape))
print('Word index: %d' %(len(word_vocab)))

Shape of question1_data: (298526, 31)
Shape of question2_data: (298526, 31)
Shape of Label: (298526,)
Word index: 69263


In [84]:
import os
import json

# Vocabulary metadata for the modelling step.
# NOTE(review): the +1 presumably reserves index 0 for padding -- confirm
# against the consumer of 'vocab_size'.
data_configs = {}
data_configs['vocab'] = word_vocab
data_configs['vocab_size'] = len(word_vocab) + 1

TRAIN_Q1 = 'q1_train.npy'
TRAIN_Q2 = 'q2_train.npy'
TRAIN_LABEL = 'label_train.npy'
# NOTE: this file holds JSON despite the .npy extension; the name is kept
# unchanged so existing consumers can still find it.
DATA_CONFIGS = 'data_configs.npy'

# Create the output folder if needed (no error when it already exists).
MODELING_DIR = DATA_PATH + "data_for_modeling"
os.makedirs(MODELING_DIR, exist_ok=True)

# Write each array through a context manager so the file handle is flushed and
# closed deterministically instead of being left to the garbage collector.
for file_name, array in (
    (TRAIN_Q1, q1_data),
    (TRAIN_Q2, q2_data),
    (TRAIN_LABEL, labels),
):
    with open(MODELING_DIR + "/" + file_name, 'wb') as out_file:
        np.save(out_file, array)

with open(MODELING_DIR + "/" + DATA_CONFIGS, 'w') as out_file:
    json.dump(data_configs, out_file)

##### For test set

In [91]:
# Read the unlabeled test pairs.
test_data = pd.read_csv(DATA_PATH + 'test.csv', encoding='utf-8')

# Keep only rows whose test_id is a plain Python int, then drop exact duplicate rows.
valid_ids = [type(value) is int for value in test_data['test_id']]
test_data = test_data.loc[valid_ids].drop_duplicates()

In [93]:
# Every character that is not an ASCII letter or digit is replaced by a space.
FILTERS = '[^a-zA-Z0-9]'              # Including number
change_filter = re.compile(FILTERS)

# Missing questions (NaN) become empty strings. A plain str() would turn them
# into the literal word "nan", which would then be tokenized as a real word.
test_question1 = ['' if pd.isna(s) else str(s) for s in test_data['question1']]
test_question2 = ['' if pd.isna(s) else str(s) for s in test_data['question2']]

# Strip the filtered characters and lower-case each question.
test_re_question1 = [change_filter.sub(' ', q).lower() for q in test_question1]
test_re_question2 = [change_filter.sub(' ', q).lower() for q in test_question2]

In [94]:
# Same fixed length as used for the training questions.
MAX_SEQUENCE_LENGTH = 31

# Encode the cleaned test questions with the tokenizer fitted on the training text.
test_question1_sequence, test_question2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (test_re_question1, test_re_question2)
)

# Pad at the end ('post') up to MAX_SEQUENCE_LENGTH.
test_pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_q1_data = pad_sequences(test_question1_sequence, **test_pad_options)
test_q2_data = pad_sequences(test_question2_sequence, **test_pad_options)

In [95]:
# Identifiers of the test rows, in the same order as the padded question arrays.
test_id = np.array(test_data['test_id'])

# Confirm that both inputs and the ids have the same number of rows.
for template, array in (
    ('Shape of test_question1_data: {}', test_q1_data),
    ('Shape of test_question2_data: {}', test_q2_data),
    ('Shape of test_id: {}', test_id),
):
    print(template.format(array.shape))

Shape of test_question1_data: (2345796, 31)
Shape of test_question2_data: (2345796, 31)
Shape of test_id: (2345796,)


In [96]:
TEST_Q1 = 'test_q1.npy'
TEST_Q2 = 'test_q2.npy'
TEST_ID = 'test_id.npy'

# NOTE(review): these files are written directly under DATA_PATH, not under the
# "data_for_modeling" sub-folder used for the training arrays. The location is
# kept unchanged here -- confirm where the modelling step expects them.
# Each file is written through a context manager so its handle is flushed and
# closed deterministically.
for file_name, array in (
    (TEST_Q1, test_q1_data),
    (TEST_Q2, test_q2_data),
    (TEST_ID, test_id),
):
    with open(DATA_PATH + file_name, 'wb') as out_file:
        np.save(out_file, array)

##### Note

* MAX LENGTH

EDA과정에서 도출된 의사결정. 로그 스케일 말고 다른 값으로 보자

* 정규화

숫자 포함 or 제외?