In [1]:
import sys
# Add the parent directory to the module search path -- presumably so the
# `common` package imported below resolves; confirm against the repo layout.
sys.path.append('..')

from common.download_utils import download_week1_resources

# Make sure the week-1 data files exist; the recorded output shows it reports
# each file under data/ that is already downloaded.
download_week1_resources()

File data/train.tsv is already downloaded.
File data/validation.tsv is already downloaded.
File data/test.tsv is already downloaded.
File data/text_prepare_tests.tsv is already downloaded.


In [2]:
from grader import Grader

In [3]:
# Grader instance; a later cell submits answers through grader.submit_tag(...).
grader = Grader()

In [4]:
import nltk
# Fetch the NLTK stopwords corpus; stopwords.words('english') is used later
# to build the stopword-removal regex.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [15]:
def read_data(file_path):
    """
        Load a tab-separated file that has a 'tags' column.

        file_path: path of the TSV file to read

        return: DataFrame in which 'tags' holds the values parsed with
                literal_eval and '_original_tags' keeps the raw strings
                as they were read from the file
    """
    frame = pd.read_csv(file_path, sep='\t')
    raw_tags = frame['tags']
    # Keep an untouched copy of the raw strings before parsing them.
    frame['_original_tags'] = raw_tags.copy(deep=True)
    parsed_tags = raw_tags.apply(literal_eval)
    frame['tags'] = parsed_tags
    return frame

In [16]:
# Train and validation files carry a 'tags' column, so they go through
# read_data; the test file is read directly with pandas.
train_set = read_data('data/train.tsv')
validation_set = read_data('data/validation.tsv')
test_set = pd.read_csv('data/test.tsv', sep='\t')

In [17]:
# Preview the first rows to check that 'tags' was parsed into lists.
train_set.head()

Unnamed: 0,title,tags,_original_tags
0,How to draw a stacked dotplot in R?,[r],['r']
1,mysql select all records where a datetime fiel...,"[php, mysql]","['php', 'mysql']"
2,How to terminate windows phone 8.1 app,[c#],['c#']
3,get current time in a specific country via jquery,"[javascript, jquery]","['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,[java],['java']


In [19]:
# Inputs are the question titles; targets are the tag lists parsed by
# read_data. The targets previously read the 'title' column as well, which
# made y_* identical to X_*.
X_train, y_train = train_set['title'].values, train_set['tags'].values
X_val, y_val = validation_set['title'].values, validation_set['tags'].values
# The test set is read without tag parsing, so only titles are taken here.
X_test = test_set['title'].values

In [74]:
import re

# Punctuation replaced by a single space so neighbouring words do not fuse.
# Raw string: '\[', '\]' and '\|' are regex escapes, not string escapes
# (a non-raw literal triggers an invalid-escape warning on recent Pythons).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than digits, lowercase letters, space, '#', '+', '_'
# is deleted outright.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# remove stopwords
# Match whole whitespace-delimited tokens only. A plain \b boundary treats the
# kept symbols '#' and '+' as word breaks, so it would strip the stopword
# letter out of tokens such as 'i++' or 'a#'. Words are escaped and sorted so
# the compiled pattern is well-formed and deterministic.
STOPWORDS_RE = re.compile(
    r'(?<!\S)(?:{0})(?!\S)'.format(
        '|'.join(re.escape(word) for word in sorted(set(stopwords.words('english'))))
    )
)
# compress space
COMPRESS_SPACES_RE = re.compile('( +)')

def preprocess_text(text):
    """
        Normalize a raw title for downstream feature extraction.

        Steps, in order: lowercase; replace separator punctuation with a
        space; delete every remaining disallowed symbol; remove stopwords;
        collapse runs of spaces; trim whitespace at both ends.

        text: a string

        return: modified initial string, with no leading or trailing space
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = STOPWORDS_RE.sub('', text)
    text = COMPRESS_SPACES_RE.sub(' ', text)
    # Removing a stopword leaves its neighbouring space behind, so the result
    # can start OR end with a space; trim both sides, not only the left.
    return text.strip()

def test_preprocess_text():
    """
        Run preprocess_text on two sample titles and compare with the
        expected normalized strings.

        return: a message naming the first failing input, or a success
                message when every sample matches
    """
    cases = (
        ("SQL Server - any equivalent of Excel's CHOOSE function?",
         "sql server equivalent excels choose function"),
        ("How to free c++ memory vector<int> * arr?",
         "free c++ memory vectorint arr"),
    )
    for question, expected in cases:
        if preprocess_text(question) != expected:
            return "Wrong answer for the case: '%s'" % question
    return 'Basic tests are passed.'

In [75]:
# Run the basic checks; the returned status string is displayed as cell output.
test_preprocess_text()

'Basic tests are passed.'

In [76]:
# Preprocess each line of the grader's test file. The context manager closes
# the file handle, which the bare open() call in the loop header never did.
with open('data/text_prepare_tests.tsv', encoding='utf-8') as tests_file:
    prepared_questions = [preprocess_text(line.strip()) for line in tests_file]
# The grader receives one preprocessed question per line.
text_prepare_results = '\n'.join(prepared_questions)

grader.submit_tag('TextPrepare', text_prepare_results)

Current answer for task TextPrepare is:
 sqlite php readonly
creating multiple textboxes dynamically
self one prefer javascript
save php date...


In [77]:
# Normalize the titles of every split; each name is rebound to a plain list
# of preprocessed strings.
X_train = list(map(preprocess_text, X_train))
X_val = list(map(preprocess_text, X_val))
X_test = list(map(preprocess_text, X_test))

In [78]:
# Show the first three preprocessed training titles as a spot check.
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']