<a href="https://colab.research.google.com/github/jakartaresearch/quora-question-pairs/blob/master/notebook/R1_data_cleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Covers

- Remove rows with NaN values
- Remove punctuation
- Check for hyperlinks (none are found in this dataset)
- Remove HTML tags
- Remove non-ASCII characters
- Remove stopwords
- Remove single characters
- Lowercase all text

In [0]:
# Mount Google Drive so files stored there are reachable from this runtime.
from google.colab import drive # Drive helper shipped in the google.colab package

ROOT = "/content/drive/"     # mount point for the drive; passed to drive.mount below
drive.mount(ROOT)           # mounts Google Drive at ROOT (/content/drive/)

import pandas as pd
import numpy as np
import re
import string
import unicodedata
import nltk

from tqdm import tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm
# Register `progress_apply` on pandas objects through the classmethod.
# Calling the deprecated `tqdm_notebook()` factory just to reach `.pandas()`
# emits a deprecation warning and leaves an empty progress bar in the output.
notebook_tqdm.pandas()
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Raw question-pairs file (tab-separated); the path is relative to the working directory.
data_path = 'drive/My Drive/Colab Notebooks/quora-question-pairs/data/quora_duplicate_questions.tsv'
data = pd.read_csv(data_path, delimiter='\t')

In [0]:
# Preview the first rows to check how the TSV was parsed into columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Remove NaN values

In [0]:
# Count missing values per question column ("Jumlah NaN di" means "number of NaN in").
print('Jumlah NaN di question1 :', data['question1'].isna().sum())
print('Jumlah NaN di question2 :', data['question2'].isna().sum())

Jumlah NaN di question1 : 1
Jumlah NaN di question2 : 2


In [0]:
# Display the rows whose question1 is missing.
data[data['question1'].isna()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [0]:
# Display the rows whose question2 is missing.
data[data['question2'].isna()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [0]:
# Collect the index labels of every row in which either question is missing.
# A boolean mask replaces the hard-coded `.index[0]` / `.index[1]` lookups, so
# this neither raises IndexError when a column has no NaN nor misses rows when
# a column has more NaNs than the fixed positions covered.
missing_question = data[['question1', 'question2']].isna().any(axis=1)
index_drop = data.index[missing_question].tolist()

In [0]:
# Drop the rows in which question1 or question2 is NaN, then renumber the index.
# Filtering on the values themselves keeps this cell idempotent: a label-based
# drop would, after reset_index, find the same labels attached to different
# (valid) rows and delete those when the cell is run a second time.
data = data.dropna(subset=['question1', 'question2']).reset_index(drop=True)

## Find hyperlinks

In [0]:
def find_hyperlink(string):
    """Return every http:// or https:// URL found in `string`.

    Args:
        string: Text to scan. The parameter name shadows the `string` module
            inside this function only.

    Returns:
        list[str]: The matched URLs in order of appearance; an empty list when
        the text contains none.
    """
    # Raw literal: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in an ordinary string literal and trigger a warning on current
    # Python versions. The compiled pattern itself is unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.findall(url_pattern, string)

In [0]:
# Test code
# The sample is stored in `sample_text` rather than `string`: assigning to
# `string` at cell level would replace the imported `string` module for every
# cell that runs afterwards.
sample_text = 'My Profile: https://auth.geeksforgeeks.org / user / Chinmoy % 20Lenka / articles in the portal of http://www.geeksforgeeks.org/'
print("Urls: ", find_hyperlink(sample_text))

Urls:  ['https://auth.geeksforgeeks.org', 'http://www.geeksforgeeks.org/']


In [0]:
# Scan 'question1' and then 'question2' for URLs, printing each hit and the
# number of questions that contained at least one.
total_hyperlink = 0

for column in ('question1', 'question2'):
  for question in data[column]:
    urls = find_hyperlink(question)
    if urls:
      total_hyperlink += 1
      print('Get Hyperlink :', urls)

print('Total Hyperlink :', total_hyperlink)

Total Hyperlink : 0


# Remove punctuation, HTML tags, non-ASCII characters, and stopwords

In [0]:
# Imported under an alias so this cell does not depend on whatever the bare
# name `string` happens to be bound to in the notebook namespace.
import string as _string

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(_string.punctuation, ' ' * len(_string.punctuation))

def remove_punctuation(text):
    """Replace each ASCII punctuation character in `text` with a single space.

    Args:
        text: The string to process.

    Returns:
        str: A string of the same length with punctuation turned into spaces;
        runs of spaces are not collapsed.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)

In [0]:
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

In [0]:
def remove_non_ascii(text):
    """Drop non-ASCII characters from a string, word by word.

    `text` is split on whitespace. Each word is NFKD-normalised, so accented
    letters decompose into a base letter plus combining marks, and every
    character that cannot be encoded as ASCII is then discarded. The words are
    joined back with single spaces; a word that becomes empty still occupies
    its slot, so two adjacent spaces can appear in the result.
    """
    def _ascii_only(word):
        decomposed = unicodedata.normalize('NFKD', word)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return ' '.join(_ascii_only(word) for word in text.split())

In [0]:
# Words excluded from the stopword list, i.e. they are kept in the text.
exception_words = ['few', 'more', 'most', 'because', 'now', 'before', 'after',
                   'what', 'when', 'where', 'how', 'should', 'which', 'whose']
stop_words = [w for w in stopwords.words('english') if w not in exception_words]
# Set copy used for constant-time membership tests inside remove_stopwords;
# `stop_words` itself stays a list for any other cell that reads it.
_stop_word_set = frozenset(stop_words)

def remove_stopwords(text):
    """Tokenise `text` with NLTK and drop every token found in `stop_words`.

    The comparison is an exact, case-sensitive match against the stopword
    entries.

    Args:
        text: The string to filter.

    Returns:
        str: The remaining tokens joined with single spaces.
    """
    kept_tokens = [w for w in word_tokenize(text) if w not in _stop_word_set]
    return ' '.join(kept_tokens)

In [0]:
def clean_text(text):
    """Clean one question string.

    Steps, in order: lowercase, strip HTML tags, replace punctuation with
    spaces, drop non-ASCII characters, remove stopwords, remove
    single-character tokens, collapse whitespace.

    Args:
        text: The raw question string.

    Returns:
        str: The cleaned text with tokens separated by single spaces.
    """
    # HTML stripping has to run before punctuation removal: once `<` and `>`
    # have been turned into spaces there is no tag left for the parser to find.
    text = strip_html(text.lower())
    text = remove_punctuation(text)
    text = remove_non_ascii(text)
    text = remove_stopwords(text)
    # str.replace matches its argument literally, so the pattern r"\b\w\b" was
    # never found; re.sub applies it as a regex and removes lone characters.
    text = re.sub(r"\b\w\b", " ", text)
    # Collapse the gaps left behind by the removed characters.
    return ' '.join(text.split())

In [0]:
## Cleansing text
# Run clean_text over each question column (with a progress bar) and store
# the result in the matching clean_* column.
cleaned_columns = {'clean_question1': 'question1', 'clean_question2': 'question2'}
for cleaned_name, raw_name in cleaned_columns.items():
    data[cleaned_name] = data[raw_name].progress_apply(clean_text)

HBox(children=(FloatProgress(value=0.0, max=404287.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=404287.0), HTML(value='')))




In [0]:
# Preview the frame again to inspect the clean_question1 / clean_question2 columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,clean_question1,clean_question2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what step step guide invest share market india,what step step guide invest share market
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what story kohinoor koh noor diamond,what would happen indian government stole kohi...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how increase speed internet connection using vpn,how internet speed increased hacking dns
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely how solve,find remainder when math 23 24 math divided 24 23
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,which one dissolve water quikly sugar salt met...,which fish would survive salt water


In [0]:
# Write the frame to CSV; index=False leaves the row index out of the file.
clean_data_path = 'drive/My Drive/Colab Notebooks/quora-question-pairs/data/clean_quora_duplicate_questions.csv'
data.to_csv(clean_data_path, index=False)