# Embedding Setup

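This notebook turns the ICNALE text samples into padded integer sequences with one-hot level labels and builds a GloVe embedding matrix for their vocabulary; all three are pickled under `../data/`.

Citation: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html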

In [1]:
import os
import re
import numpy as np
import pickle as pkl
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [7]:
# Directory whose files sample_gen() lists and reads, one sample per line.
ICNALE_PATH = '../data/ICNALE'
# NOTE(review): not referenced by any cell visible in this notebook --
# presumably a train/validation split fraction consumed elsewhere; confirm.
TEST_VAL_SPLIT = 0.8
# Number of columns in embedding_matrix; must equal the dimensionality of the
# GloVe vectors that are copied into it.
EMBEDDING_DIM = 50

## Utility Functions

In [3]:
def sample_gen(data_dir=None):
    """Yield ``(text, level)`` for every non-empty line of every corpus file.

    Files are visited in sorted name order. The class label comes from the
    file name: the underscore-separated fields from the fourth onward are
    joined with '-' (``W_CHN_SMK_B1_1`` -> ``B1-1``) and looked up in a fixed
    map. Any level that is not in the map is assigned one shared extra class.

    Args:
        data_dir: Directory to read. Defaults to the module-level
            ``ICNALE_PATH``, resolved when iteration starts.

    Yields:
        tuple: ``(text, level)`` where ``text`` is one decoded line with its
        line ending removed and ``level`` is an integer class id (0-4).
    """
    class_map = {
        'A2-0': 0,
        'B1-1': 1,
        'B1-2': 2,
        'B2-0': 3
    }
    if data_dir is None:
        data_dir = ICNALE_PATH

    for path in sorted(os.listdir(data_dir)):
        # splitext copes with names that contain no dot or several dots;
        # unpacking path.split('.') into two names raised ValueError for those.
        file_name = os.path.splitext(path)[0]
        # This file is excluded explicitly; the reason is not recorded here
        # (TODO confirm and document).
        if file_name == 'W_CHN_SMK_B1_1':
            continue
        level = '-'.join(file_name.split('_')[3:])
        # Unknown levels share the id one past the last mapped class.
        level = class_map.get(level, len(class_map))
        # Read bytes and decode explicitly so the code behaves the same on
        # Python 2 and Python 3; 'utf-8-sig' drops a leading byte-order mark.
        with open(os.path.join(data_dir, path), 'rb') as fd:
            for raw in fd:
                sample = raw.decode('utf-8-sig').strip('\n').strip('\r')
                if sample == '':
                    continue
                yield sample, level
                
def to_sentences(text):
    """Split ``text`` into sentences, keeping end punctuation attached.

    The text is cut at every '.', '?' or '!'. Each stop character is glued
    onto the sentence before it, so runs such as '...' stay with that
    sentence. Surrounding whitespace is trimmed and empty pieces are dropped.

    Args:
        text: The string to split.

    Returns:
        list: Sentence strings in order of appearance. Stop characters that
        appear before any other text form a leading entry of their own.
    """
    stop = re.compile(r'([\.?!])')
    sentences = []
    for piece in stop.split(text):
        if stop.match(piece):
            if sentences:
                sentences[-1] += piece
            else:
                # Text that begins with a stop character has no previous
                # sentence to attach to; indexing [-1] here used to raise.
                sentences.append(piece)
            continue
        piece = piece.strip()
        # Skip the empty / whitespace-only pieces the split produces between
        # adjacent stops and after a trailing stop.
        if piece:
            sentences.append(piece)
    return sentences

def tokenize(sentence):
    """Split ``sentence`` into tokens, with punctuation as separate tokens.

    A space is inserted in front of every character that is not an ASCII
    letter, a digit or whitespace, and the result is passed to Keras'
    ``text_to_word_sequence`` with character filtering disabled so the
    punctuation survives as tokens.

    Args:
        sentence: The string to tokenize.

    Returns:
        list: The tokens produced by ``text_to_word_sequence``.
    """
    # The class must be spelled A-Za-z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which were therefore
    # treated as letters and never split off.
    sentence = re.sub(r'([^A-Za-z0-9\s])', r' \1', sentence)
    return text_to_word_sequence(sentence, filters='')

## Parse Data

In [4]:
sampler = sample_gen()

texts = []       # raw sample strings, in corpus order
levels = []      # integer class id of each sample
sequences = []   # each sample encoded as a list of word ids
word_index = {}  # token -> id; ids start at 1, leaving 0 free for padding

# Iterate the generator directly. The previous `.next()` call inside a bare
# `except:` only worked on Python 2 and treated *any* error (I/O, decoding,
# tokenizing) as end-of-data, silently truncating the data set.
for text, level in sampler:
    texts.append(text)
    levels.append(level)

    sequence = []
    tokens = tokenize(text)
    for word in tokens:
        if word not in word_index:
            word_index[word] = len(word_index) + 1
        sequence.append(word_index[word])
    sequences.append(sequence)

if not sequences:
    raise ValueError('No samples found in {}'.format(ICNALE_PATH))

# Choose the sequence length at the `coverage`-th percentile of sample
# lengths. The index is clamped so that coverage = 100 picks the longest
# sample instead of running off the end of the list.
lengths = sorted(len(seq) for seq in sequences)
coverage = 80
coverage_length = lengths[min(len(lengths) * coverage // 100, len(lengths) - 1)]

print('Parsed {} samples'.format(len(texts)))
print('Sample length covering %{} of samples = {}'.format(coverage, coverage_length))

Parsed 5418 samples
Sample length covering %80 of samples = 286


## Prepare Data

In [5]:
# Pad every sequence with trailing zeros up to coverage_length. Longer
# sequences are cut down by pad_sequences using its default truncation
# setting.
data = pad_sequences(sequences, maxlen=coverage_length, padding='post')
# One-hot encode the integer class ids.
labels = to_categorical(levels)

# Shuffle samples and labels with the same permutation so they stay aligned.
# NOTE(review): the RNG is not seeded, so the order differs on every run.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Pickle streams are binary: the files must be opened with 'wb'. Text mode
# ('w+') fails on Python 3 and can corrupt the stream on Windows.
with open('../data/data.pkl', 'wb') as fd:
    pkl.dump(data, fd)
with open('../data/labels.pkl', 'wb') as fd:
    pkl.dump(labels, fd)

print('Shape of data: {}'.format(data.shape))
print('Shape of labels: {}'.format(labels.shape))

Shape of data: (5418, 286)
Shape of labels: (5418, 5)


In [8]:
# Load the pre-trained GloVe vectors into a token -> vector lookup.
# The file name is derived from EMBEDDING_DIM so that the vectors read here
# always have the same width as embedding_matrix below.
embeddings_index = {}
glove_path = '../data/glove.6B.{}d.txt'.format(EMBEDDING_DIM)
# Read bytes and decode as UTF-8 explicitly rather than relying on the
# platform's default text encoding.
with open(glove_path, 'rb') as fd:
    for line in fd:
        # Each line is a token followed by its vector components.
        values = line.decode('utf-8').split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector for the token with id i. Row 0 (no token uses id 0)
# and the rows of tokens missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Pickle streams are binary: open the file with 'wb', not text mode.
with open('../data/embed_matrix.pkl', 'wb') as fd:
    pkl.dump(embedding_matrix, fd)

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.
