In [1]:
import operator
from collections import defaultdict

import numpy as np
class Vocab(object):
    """Bidirectional word <-> integer-index vocabulary with frequency counts.

    A fresh instance registers the unknown token first (index 0), followed by
    ``<sos>`` and ``<eos>``, each with frequency 0.
    """
    unk = u'<unk>'
    sos = u'<sos>'
    eos = u'<eos>'

    def __init__(self, unk=unk):
        """Create a vocabulary holding only the special tokens.

        Args:
            unk: string used as the unknown token (defaults to ``<unk>``).
        """
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_freq = defaultdict(int)
        self.total_words = 0
        self.unknown = unk
        self.add_word(self.unknown, count=0)
        self.add_word(self.sos, count=0)
        self.add_word(self.eos, count=0)

    def add_word(self, word, count=1):
        """Register ``word`` (if new) and add ``count`` to its frequency.

        Surrounding whitespace is stripped first; a word that is empty after
        stripping is ignored.

        Args:
            word: the token to add.
            count: amount added to the word's frequency.

        Returns:
            None
        """
        word = word.strip()
        if not word:
            return
        if word not in self.word_to_index:
            index = len(self.word_to_index)
            self.word_to_index[word] = index
            self.index_to_word[index] = word
        self.word_freq[word] += count

    def construct(self, words):
        """Add every word of the iterable ``words`` and refresh ``total_words``.

        Args:
            words: iterable of tokens.

        Returns:
            None
        """
        for word in words:
            self.add_word(word)
        self.total_words = float(sum(self.word_freq.values()))
        print('{} total words with {} uniques'.format(self.total_words, len(self.word_freq)))

    def _words_by_frequency(self):
        """Return ``(word, freq)`` pairs, most frequent first, without the unknown token.

        Does not modify ``word_freq``.
        """
        pairs = [(w, f) for (w, f) in self.word_freq.items() if w != self.unknown]
        # Ascending stable sort followed by a reverse: keeps the tie order the
        # class has always produced.
        pairs.sort(key=operator.itemgetter(1))
        pairs.reverse()
        return pairs

    def limit_vocab_length(self, length):
        """Truncate the vocabulary to the ``length`` most frequent words.

        The unknown token is always kept at index 0 and is not counted in
        ``length``; the kept words are re-indexed from 1 in descending
        frequency order.  ``<sos>`` and ``<eos>`` get no special treatment:
        they survive only if they are among the ``length`` most frequent words.

        Args:
            length: number of words (excluding the unknown token) to keep.
                If it exceeds the current vocabulary size nothing is changed.

        Returns:
            None
        """
        if length > len(self):
            return
        kept = self._words_by_frequency()[:length]
        new_word_to_index = {self.unknown: 0}
        new_index_to_word = {0: self.unknown}
        # Stay a defaultdict so add_word() keeps working for new words
        # after truncation.
        new_word_freq = defaultdict(int)
        for (word, freq) in kept:
            index = len(new_word_to_index)
            new_word_to_index[word] = index
            new_index_to_word[index] = word
            new_word_freq[word] = freq
        new_word_freq[self.unknown] = 0
        self.word_to_index = new_word_to_index
        self.index_to_word = new_index_to_word
        self.word_freq = new_word_freq

    def save_vocab(self, filePath):
        """Save the vocabulary to a file.

        Every line holds one word and its frequency separated by a tab, most
        frequent word first, encoded as UTF-8.  The unknown token is not
        written.  The vocabulary itself is left unchanged, so the method can
        be called repeatedly.

        Args:
            filePath: where to save the vocabulary.

        Returns:
            None
        """
        with open(filePath, 'wb') as fd:
            for (word, freq) in self._words_by_frequency():
                fd.write(('%s\t%d\n' % (word, freq)).encode('utf-8'))

    def load_vocab_from_file(self, filePath, sep='\t'):
        """Load words and frequencies from a vocabulary file into this object.

        Words not yet known get the next free index; the frequency of every
        listed word is overwritten with the value from the file.

        Args:
            filePath: vocabulary file path (UTF-8); every line holds a word
                and its frequency separated by ``sep``.
            sep: separator between word and frequency.

        Returns:
            None
        """
        with open(filePath, 'rb') as fd:
            for line in fd:
                line_uni = line.decode('utf-8')
                if not line_uni.strip():
                    continue  # tolerate blank lines instead of failing to unpack
                # Split on the last separator only: the frequency is the final field.
                word, freq = line_uni.rsplit(sep, 1)
                if word not in self.word_to_index:
                    index = len(self.word_to_index)
                    self.word_to_index[word] = index
                    self.index_to_word[index] = word
                self.word_freq[word] = int(freq)
        print('load from <' + filePath + '>, there are {} words in dictionary'.format(len(self.word_freq)))

    def encode(self, word):
        """Return the index of ``word``, or the unknown token's index if it is not known."""
        if word not in self.word_to_index:
            word = self.unknown
        return self.word_to_index[word]

    def decode(self, index):
        """Return the word stored at ``index`` (KeyError if there is none)."""
        return self.index_to_word[index]

    def __len__(self):
        """Number of indexed words, special tokens included."""
        return len(self.word_to_index)

In [2]:
# Fresh vocabulary: the constructor registers only <unk>, <sos> and <eos>.
vocab = Vocab()

In [3]:
# Count every word occurrence of train.txt into `vocab`.
# The file is consumed as groups of lines: the first two lines of each group
# are skipped (presumably header lines -- confirm against the data) and a
# whitespace-only line ends the group.
with open('train.txt', 'rb') as fd:  # binary mode: the bytes are decoded explicitly below
    step = 0  # leading lines already skipped in the current group
    for line in fd:
        line_uni = line.decode('utf-8')
        if step < 2:
            step += 1
            continue
        if line_uni.isspace():
            step = 0  # group finished, skip the next two lines again
            continue
        for word in line_uni.split():
            vocab.add_word(word)
        

In [4]:
# Count every word occurrence of test.txt into `vocab`.
# The file is consumed as groups of lines: the first two lines of each group
# are skipped (presumably header lines -- confirm against the data) and a
# whitespace-only line ends the group.
with open('test.txt', 'rb') as fd:  # binary mode: the bytes are decoded explicitly below
    step = 0  # leading lines already skipped in the current group
    for line in fd:
        line_uni = line.decode('utf-8')
        if step < 2:
            step += 1
            continue
        if line_uni.isspace():
            step = 0  # group finished, skip the next two lines again
            continue
        for word in line_uni.split():
            vocab.add_word(word)

In [5]:
# Count every word occurrence of valid.txt into `vocab`.
# The file is consumed as groups of lines: the first two lines of each group
# are skipped (presumably header lines -- confirm against the data) and a
# whitespace-only line ends the group.
with open('valid.txt', 'rb') as fd:  # binary mode: the bytes are decoded explicitly below
    step = 0  # leading lines already skipped in the current group
    for line in fd:
        line_uni = line.decode('utf-8')
        if step < 2:
            step += 1
            continue
        if line_uni.isspace():
            step = 0  # group finished, skip the next two lines again
            continue
        for word in line_uni.split():
            vocab.add_word(word)

In [6]:
# Write the collected word counts to 'vocab.txt', one word<TAB>count line per word.
vocab.save_vocab('vocab.txt')

In [2]:
# Rebuild the vocabulary from the full dump, keep only the 100000 most
# frequent words and save the truncated table as 'vocab.100k'.
vocab = Vocab()
vocab.load_vocab_from_file('vocab.txt')
vocab.limit_vocab_length(100000)
vocab.save_vocab('vocab.100k')

load from <vocab.txt>, there are 2718261 words in dictionary


In [76]:
import copy
def load_data(fileName):
    """Read a blank-line-delimited corpus file into nested token lists.

    The file is processed as groups of lines.  The first two lines of every
    group are skipped (presumably header lines -- confirm against the data),
    each following line is split on whitespace into a token list, and a
    whitespace-only line closes the group.  A final group that has content
    but no closing blank line is returned as well instead of being dropped.

    Args:
        fileName: path of a UTF-8 encoded text file.

    Returns:
        A list with one entry per group; each entry is a list of lines and
        each line is a list of tokens.
    """
    data_list = []
    with open(fileName, 'rb') as fd:  # binary mode: the bytes are decoded explicitly below
        step = 0       # leading lines already skipped in the current group
        tmp_data = []  # token lists of the group currently being read
        for line in fd:
            line_uni = line.decode('utf-8')
            if step < 2:
                tmp_data = []
                step += 1
                continue
            if line_uni.isspace():
                step = 0
                data_list.append(tmp_data)
                continue
            tmp_data.append(line_uni.split())
    # step is reset to 0 whenever a group is closed, so step >= 2 here means
    # the file ended in the middle of a group that was never appended.
    if step >= 2 and tmp_data:
        data_list.append(tmp_data)
    return data_list

def flatten(li):
    """Return a flat list with the leaves of arbitrarily nested lists/tuples.

    Elements that are neither a list nor a tuple are kept as they are, in
    their original left-to-right order.
    """
    flat = []
    for element in li:
        if not isinstance(element, (list, tuple)):
            flat.append(element)
            continue
        # Nested container: splice in its flattened content.
        flat.extend(flatten(element))
    return flat

def shuffleData(data):
    """Shuffle the items of every list in ``data`` independently.

    Args:
        data: list of lists.

    Returns:
        ``(ret_data, ret_label)`` where ``ret_data[n]`` is a shuffled deep
        copy of ``data[n]`` and ``ret_label[n][k]`` is the position inside
        ``ret_data[n]`` at which the original item ``data[n][k]`` now sits
        (the inverse of the applied permutation).  The input is not modified.
    """
    def shuffleList(li):
        li = copy.deepcopy(li)
        # list(...) keeps the index sequence mutable where range() is lazy.
        index = list(range(len(li)))
        np.random.shuffle(index)
        tmp_list = [li[i] for i in index]
        # argsort of a permutation is its inverse permutation.
        inverse = np.argsort(index)
        return tmp_list, inverse.tolist()

    ret_data = []
    ret_label = []
    for item in data:
        shuffled, label = shuffleList(item)
        ret_data.append(shuffled)
        ret_label.append(label)
    return ret_data, ret_label
        

def batch_encodeNpad(data, label, vocab):
    """Encode a batch of documents into a zero-padded index array.

    Args:
        data: list of documents; each document is a list of sentences and
            each sentence a list of words.
        label: list of label lists; every list is cut or padded with -1 to
            the largest sentence count in the batch.
        vocab: object whose ``encode(word)`` returns an integer index.

    Returns:
        A 4-tuple ``(ret_batch, ret_label, sent_num, sent_len)``:
            ret_batch: int32 array of shape (b_sz, max_snum, max_slen),
                padded with 0.  NOTE: with the Vocab class of this notebook
                0 is also the index of the unknown token.
            ret_label: array built from the padded label lists,
                (b_sz, max_snum) for a non-empty batch.
            sent_num: list (b_sz,) with the sentence count of every document.
            sent_len: nested list (b_sz, max_snum) with the word count of
                every sentence, 0 for padded sentences.
        An empty batch (or one without any sentence) yields zero-sized
        dimensions instead of raising.
    """
    sent_num = [len(doc) for doc in data]
    max_sent_num = max(sent_num) if sent_num else 0
    sent_len = [[len(doc[j]) if j < len(doc) else 0 for j in range(max_sent_num)]
                for doc in data]
    # "or [0]" guards max() against a batch that contains no sentence at all.
    max_sent_len = max([n for row in sent_len for n in row] or [0])
    ret_label = [[lab[j] if j < len(lab) else -1 for j in range(max_sent_num)]
                 for lab in label]
    ret_batch = np.zeros([len(data), max_sent_num, max_sent_len], dtype=np.int32)
    for (i, doc) in enumerate(data):
        for (j, sent) in enumerate(doc):
            for (k, word) in enumerate(sent):
                ret_batch[i, j, k] = vocab.encode(word)
    return ret_batch, np.array(ret_label), sent_num, sent_len
    

In [77]:
from random import shuffle
# Smoke test of the preprocessing pipeline on 'test_file':
# load the truncated vocabulary, read the data, shuffle it, then encode and pad.
vocab = Vocab()
vocab.load_vocab_from_file('vocab.100k')
data_list = load_data('test_file')
ret_data, ret_label = shuffleData(data_list)
ret = batch_encodeNpad(ret_data, ret_label, vocab)
# One line per returned piece, showing its container type.
print('\n'.join(str(type(ret[pos])) for pos in (0, 1, 2, 3)))

load from <vocab.100k>, there are 100003 words in dictionary
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
<type 'list'>
<type 'list'>


In [48]:
# Scratch check: np.random.shuffle applied to aa[0] (a view of the first row)
# permutes that row of aa in place and leaves the other row untouched.
aa = np.array([[1, 2, 3], [4, 5, 6]])
np.random.shuffle(aa[0])
print aa

[[1 3 2]
 [4 5 6]]


In [44]:
# Scratch check: np.random.shuffle also shuffles a plain Python list in place.
aa = [1, 2, 3, 4, 5]
np.random.shuffle(aa)
aa

[3, 5, 2, 1, 4]

In [50]:
# Scratch check: argsort of a shuffled index list gives the inverse
# permutation, i.e. the position at which each original index ended up.
index = range(10)
np.random.shuffle(index)
print index
print np.argsort(index)

[8, 6, 3, 0, 9, 5, 4, 1, 7, 2]
[3 7 9 2 6 5 1 8 0 4]


In [55]:
# Scratch check: a scalar row index combined with a list of column indices
# picks those columns, in the listed order, from that row.
aa = np.array([[4, 2, 3, 1], [4, 3, 2, 1]])
print  aa[0, [3, 2, 1]]

[1 3 2]


In [57]:
# Scratch check: shuffling a deep copy leaves the original list (still
# referenced by aa) unchanged.
li = [1, 2, 3, 4]
aa = li
li = copy.deepcopy(li)
np.random.shuffle(li)
print li
print aa

[2, 1, 3, 4]
[1, 2, 3, 4]


In [67]:
# NOTE: this cell re-defines shuffleData, which an earlier cell of this
# notebook already defines; one of the two definitions can be removed.
def shuffleData(data):
    """Shuffle the items of every list in ``data`` independently.

    Args:
        data: list of lists.

    Returns:
        ``(ret_data, ret_label)`` where ``ret_data[n]`` is a shuffled deep
        copy of ``data[n]`` and ``ret_label[n][k]`` is the position inside
        ``ret_data[n]`` at which the original item ``data[n][k]`` now sits
        (the inverse of the applied permutation).  The input is not modified.
    """
    def shuffleList(li):
        li = copy.deepcopy(li)
        # list(...) keeps the index sequence mutable where range() is lazy.
        index = list(range(len(li)))
        np.random.shuffle(index)
        tmp_list = [li[i] for i in index]
        # argsort of a permutation is its inverse permutation.
        inverse = np.argsort(index)
        return tmp_list, inverse.tolist()

    ret_data = []
    ret_label = []
    for item in data:
        shuffled, label = shuffleList(item)
        ret_data.append(shuffled)
        ret_label.append(label)
    return ret_data, ret_label

In [68]:
# Scratch check of shuffleData on small integer lists: prints the
# (shuffled data, labels) tuple, then aa to show the input was not modified.
aa = [[1, 2, 3, 4, 5], [2, 3, 4, 5, 6, 7], [5, 6, 7, 8, 9]]
print shuffleData(aa)
print aa

([[3, 5, 4, 2, 1], [2, 4, 6, 3, 7, 5], [9, 6, 8, 5, 7]], [[4, 3, 0, 2, 1], [0, 3, 1, 5, 2, 4], [3, 1, 4, 2, 0]])
[[1, 2, 3, 4, 5], [2, 3, 4, 5, 6, 7], [5, 6, 7, 8, 9]]


In [79]:
# Scratch check: ndarray.tolist() converts an array into a plain Python list.
aa = np.array([1, 2, 3, 4, 5])
aa.tolist()

[1, 2, 3, 4, 5]

In [80]:
# Scratch check: np.concatenate accepts a mix of an ndarray and a Python list
# (aa is the array left over from the preceding cell).
bb = [0, 1, 2]
np.concatenate([aa, bb])

array([1, 2, 3, 4, 5, 0, 1, 2])