In [1]:
import torch
import kaldi_io
import json
import re
import numpy as np

## 读取feats.scp

In [7]:
# Map each utterance id (first token of a feats.scp line) to its feature
# specifier (the remainder of that line).
feats_map = {}
with open("/home/meichaoyang/dataset/500C/feats.scp", "r") as f:
    for line in f:
        # Split on the first whitespace run only: everything after the key is
        # the specifier, which may itself contain spaces.
        data = line.split(maxsplit=1)
        if len(data) < 2:
            # Blank or key-only line: there is nothing to record.
            continue
        feats_map[data[0]] = data[1].strip()

In [3]:
# Display the feature specifier stored for one utterance id.
feats_map["T0055G0002S0007"]

'/home/meichaoyang/dataset/500C//_fbank/raw_fbank_500C.1.ark:106521'

## 读取corpus.txt

In [4]:
# Map each utterance id to its transcript with the listed punctuation marks and
# spaces removed.
corpus_map = {}
with open('/home/meichaoyang/dataset/500C/corpus.txt', 'r') as f:
    for line in f:
        # Fields are tab-separated (the same split readLangs uses). Splitting on
        # arbitrary whitespace would cut the transcript at its first space.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            # Blank line or line without a transcript field.
            continue
        corpus_txt = re.sub(r"([.!?。！，？、 \[\],，])", r"", fields[1].strip())
        corpus_map[fields[0].strip()] = corpus_txt

In [5]:
# Display the cleaned transcript stored for one utterance id.
corpus_map["T0055G0002S0081"]

'国内哪些大学有韩语专业'

In [6]:
# Ids of the start-of-sequence and end-of-sequence symbols; they equal the two
# indices that Lang.__init__ pre-populates in index2word.
SOS_token = 0
EOS_token = 1
import re
# string = '我要把你卸载掉'
# re.findall(r'.{1}', string)

class Lang:
    """Character vocabulary with symbol<->index tables and occurrence counts.

    Indices 0 and 1 are pre-assigned to "SOS" and "EOS" in ``index2word``;
    every new symbol receives the next free index.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS already occupy two slots

    def addSentence(self, sentence):
        """Register each character of ``sentence`` matched by the regex.

        The pattern does not match the newline character, so newlines are
        skipped.
        """
        for match in re.finditer(r'.{1}', sentence):
            self.addWord(match.group(0))

    def addWord(self, word):
        """Count ``word``, assigning it the next index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        next_index = self.n_words
        self.word2index[word] = next_index
        self.word2count[word] = 1
        self.index2word[next_index] = word
        self.n_words = next_index + 1

In [7]:
def normalizeString(s):
    """Lower-case and trim ``s``, then put a space before each listed mark.

    The marks are the punctuation characters and the space character named in
    the pattern below; every occurrence is prefixed with one extra space.
    """
    trimmed = s.lower().strip()
    return re.sub(r"([.!?。！，？、 \[\],，])", r" \1", trimmed)

def readLangs(lang1, corpus_path='/home/meichaoyang/dataset/500C/corpus.txt'):
    """Build a ``Lang`` vocabulary from a tab-separated corpus file.

    Args:
        lang1: Name given to the returned ``Lang``.
        corpus_path: File to read. Each usable line holds at least two
            tab-separated fields, the second being the sentence. Defaults to
            the path this function previously hard-coded.

    Returns:
        A ``Lang`` to which the normalized second field of every usable line
        has been added.
    """
    print("Reading lines...")
    lang = Lang(lang1)

    # Read the file and split into lines; the context manager closes the
    # handle even if reading fails.
    with open(corpus_path, "r") as f:
        lines = f.read().strip().split('\n')

    for l in lines:
        s = l.split("\t")
        if len(s) < 2:
            # Blank line or line without a tab: no sentence field to add.
            continue
        lang.addSentence(normalizeString(s[1]))

    return lang

In [8]:
# Build the vocabulary from the corpus file; the resulting Lang is named "ZH".
lang = readLangs("ZH")

Reading lines...


In [9]:
# Print a few vocabulary statistics: table size, one entry, the symbol count,
# and the id and occurrence count of the space character.
print(len(lang.index2word))
# NOTE: 4377 is hard-coded; it is the last valid index only while n_words is 4378.
print(lang.index2word[4377])
print(lang.n_words)
print(lang.word2index[" "])
print(lang.word2count[" "])

4378
俾
4378
90
108258


## 准备json数据

In [10]:
# Skeleton of the JSON payload: "utts" starts empty and "dic" maps every symbol
# in lang.index2word (indices 0 .. n_words-1) to its integer id.
utts = {}
dic = {lang.index2word[token_id]: token_id for token_id in range(lang.n_words)}
json_data = {"utts": utts, "dic": dic}

In [11]:
# Build one manifest entry per utterance listed in feats_map.
keys = list(feats_map.keys())
for utt in keys:
    feat_path = feats_map[utt]

    # Do the cheap dictionary lookups first so a missing transcript or unknown
    # character fails before the feature matrix is read.
    transcript = corpus_map[utt]
    # Characters are lower-cased for lookup because the vocabulary was built
    # from lower-cased sentences (see normalizeString).
    tokenid = [lang.word2index[ch.lower()] for ch in transcript]

    # Assemble the complete record before inserting it: if anything above or
    # below raises, utts never holds a half-filled entry.
    entry = {
        "input": {
            "feat": feat_path,
            # read_mat loads the matrix; only its shape is kept here.
            "shape": list(kaldi_io.read_mat(feat_path).shape),
        },
        "output": {
            "text": transcript,
            "tokenid": tokenid,
            "shape": [len(tokenid), lang.n_words],
        },
    }
    utts[utt] = entry

In [17]:
# Load the feature matrix of one utterance; the next cell slices it.
a = kaldi_io.read_mat(feats_map["T0055G0002S0006"])

In [21]:
# Shape after dropping trailing rows so that the row count is a multiple of 4
# (a.shape[0]//4*4 is the largest multiple of 4 not exceeding the row count).
a[:a.shape[0]//4*4].shape
# a.shape[0]//4*4

(304, 80)

In [9]:
# Scratch: small 2x3 array used by the column-slicing example in the next cell.
b = np.array([[1,2,3],[4,5,6]])

In [10]:
# Scratch: select all rows and the first two columns of b.
b[:,0:2]

array([[1, 2],
       [4, 5]])

In [13]:
# Display the "output" record (text, token ids, shape) built for one utterance.
utts["T0055G0002S0007"]["output"]

{'text': '再给我讲个笑话好吗',
 'tokenid': [39, 40, 20, 41, 42, 43, 44, 45, 46],
 'shape': [9, 4378]}

## 写入json文件

In [14]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the locale's default.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)

## 读取json文件

In [23]:
# Decode the JSON file explicitly as UTF-8 rather than with the locale's
# default encoding. This rebinds json_data to the content loaded from disk.
with open('data.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [37]:
# Number of entries in the symbol-to-id dictionary of the loaded JSON.
len(json_data["dic"])

4378

In [20]:
# Take the "input" record of one utterance from the loaded JSON, read the
# matrix its "feat" specifier points to, and display it as a tensor.
input1 = json_data["utts"]["T0055G0002S0007"]["input"]
torch.tensor(kaldi_io.read_mat(input1["feat"]))

tensor([[ 8.9022, 10.2037, 11.9769,  ..., 14.1464, 14.7736, 14.0025],
        [ 6.5292,  7.8970,  9.2169,  ..., 14.2464, 13.6531, 13.0010],
        [ 7.6166,  9.3611, 11.2072,  ..., 13.2118, 13.5785, 12.3967],
        ...,
        [ 8.5451,  9.9356,  9.8816,  ..., 13.6333, 14.6908, 13.7032],
        [ 8.5094, 10.2420, 11.1217,  ..., 12.9736, 13.7464, 13.2024],
        [ 7.5809,  9.3611, 10.9934,  ..., 13.3218, 14.2075, 13.4787]])

In [27]:
# Display the full record (input and output) stored for one utterance.
json_data["utts"]["T0055G0002S0007"]

{'input': {'feat': '/home/meichaoyang/dataset/500C//_fbank/raw_fbank_500C.1.ark:106521',
  'shape': [232, 80]},
 'output': {'text': '再给我讲个笑话好吗',
  'tokenid': [39, 40, 20, 41, 42, 43, 44, 45, 46],
  'shape': [9, 4378]}}

In [46]:
# Scratch: sample string ending in a newline, used by the repr() cell below.
s = 'Hello, Runoob\n'

In [49]:
# Scratch: repr() shows the string with its quotes and the newline escaped.
repr(s)

"'Hello, Runoob\\n'"

In [45]:
# Scratch: true division of two ints yields a float; the result is not used by
# any other visible cell.
1/7

0.14285714285714285