In [1]:
import numpy as np
import pandas as pd
import pickle
import re

In [2]:
# Import spacy corpus, glove embeddings.
import spacy
import en_core_web_md



# !spacy download en_core_web_sm
# !pip install textacy
# import textacy
# import 'en_core_web_md'
print('Loading SpaCy `en_core_web_md` corpus...')
nlp = en_core_web_md.load()

Loading SpaCy `en_core_web_md` corpus...


In [3]:

# q1_train_simple, q2_train_simple, y_train = pickle.load(open('./lystdo_kernel/train_text_without_process.pkl','rb'))
# q1_test_simple, q2_test_simple, y_test = pickle.load(open('./lystdo_kernel/test_text_without_process.pkl','rb'))

df_train = pd.read_csv('../data/processed/train.csv')
df_test = pd.read_csv('../data/processed/test.csv')
q1_train_original = df_train['question1']
q2_train_original = df_train['question2']
q1_test_original = df_test['question1']
q2_test_original = df_test['question2']

del df_train
del df_test

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:

'''
Helper functions
'''

def get_function_name(f):
    return str(f).split(' ')[1]

def get_regex_number(regex, text):
    m = re.findall(regex, text)
    if m==None:
        return 0
    return len(m)

'''
Features
'''

def get_non_ascii(text):
    return get_regex_number('[^\x00-\x7F]', text)
    
def get_number(text):
    return get_regex_number('[0-9]+[\.\,]*[0-9]*', text)
    
def get_puncts(text):
    return get_regex_number('[\!\?！？\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\\\(\)\[\]\{\}\<\>\'\"’`“…é\$\%\&]', text)

def get_brackets(text):
    return get_regex_number('[\(\)\[\]\{\}\<\>\'\"]', text)

def get_dashes(text):
    return get_regex_number('\-', text)

def get_dots(text):
    return get_regex_number('\.', text)

def get_end_of_sent(text):
    return get_regex_number('[\.\!\?！？]', text)

def get_commas(text):
    return get_regex_number('\,', text)

def get_spaces(text):
    return get_regex_number('[\s\t\n]+', text)

def get_entity_count(text):
    doc = nlp(text)
    count = 0
    for token in doc:
        if token.ent_type_!='':
            count+=1
    return count

def get_OOB(text):
    doc = nlp(text)
    count = 0
    for token in doc:
        if not token.has_vector:
            count+=1
    return count

In [5]:
from joblib import Parallel, delayed

features_before_clean = [
    get_non_ascii,
    get_number,
    get_puncts,
    get_dashes,
    get_dots,
    get_end_of_sent,
    get_commas,
    get_spaces,
    get_entity_count,
    get_OOB,
    get_brackets,
]

features_after_clean = [
    get_non_ascii,
    get_number,
    get_puncts,
    get_dashes,
    get_dots,
    get_end_of_sent,
    get_commas,
    get_entity_count,
    get_OOB,
    get_brackets,
]

def extract_features(texts, functions, run_name, q):
    
    fields = [run_name+'_'+q+'_'+get_function_name(f) for f in functions]
    
    sample_len = len(texts)
    print('At '+run_name, sample_len, ' samples')
    
    ret = []
    for i,text in enumerate(texts):
        if type(text)!=str:
            text = ''
        features = [func(text) for func in functions]
        ret.append(features)
        
        if i%100000==0:
            print(i,'/',sample_len)
        
    return pd.DataFrame(data=ret, columns=fields)

In [6]:
run_name = 'word_corrected'

q1 = extract_features(q1_train_corrected, features_after_clean, run_name, q='q1')
q2 = extract_features(q2_train_corrected, features_after_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

q1 = extract_features(q1_test_corrected, features_after_clean, run_name, q='q1')
q2 = extract_features(q2_test_corrected, features_after_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

#######################

run_name = 'simple_tokenizer'

q1 = extract_features(q1_train_simple, features_after_clean, run_name, q='q1')
q2 = extract_features(q2_train_simple, features_after_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

q1 = extract_features(q1_test_simple, features_after_clean, run_name, q='q1')
q2 = extract_features(q2_test_simple, features_after_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

#######################

run_name = 'raw'

q1 = extract_features(q1_train_original, features_before_clean, run_name, q='q1')
q2 = extract_features(q2_train_original, features_before_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

q1 = extract_features(q1_test_original, features_before_clean, run_name, q='q1')
q2 = extract_features(q2_test_original, features_before_clean, run_name, q='q2')
c = pd.concat([q1,q2], axis=1)
c.to_csv('../data/interim/Key_features_'+run_name+'.csv', index=False)

NameError: name 'q1_train_corrected' is not defined