In [3]:
import torch

In [4]:
# Show the installed PyTorch build string to confirm the environment.
torch.__version__

'1.7.1+cu110'

In [7]:
import nltk

In [8]:
from nltk import data

In [9]:
# List the directories NLTK searches when looking for its data packages.
data.path

['C:\\Users\\ASUS/nltk_data',
 'D:\\Anaconda3\\envs\\pytorch1.7.1\\nltk_data',
 'D:\\Anaconda3\\envs\\pytorch1.7.1\\share\\nltk_data',
 'D:\\Anaconda3\\envs\\pytorch1.7.1\\lib\\nltk_data',
 'C:\\Users\\ASUS\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [10]:
# Resolve '.' against the NLTK search path to see which data directory is actually found.
nltk.find('.')

FileSystemPathPointer('D:\\Anaconda3\\envs\\pytorch1.7.1\\nltk_data')

In [11]:
from nltk.corpus import sentence_polarity

In [12]:
# Peek at the corpus: each sentence is returned as a list of token strings.
sentence_polarity.sents()

[['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], ...]

In [18]:
from collections import defaultdict, Counter
class Vocab:
    """Two-way mapping between tokens and integer ids with an ``<unk>`` fallback.

    Every instance contains the ``"<unk>"`` token, so looking up a token that
    is not in the vocabulary always yields a valid id (``self.unk``).
    """

    def __init__(self, tokens=None):
        """Create a vocabulary whose ids follow the order of ``tokens``.

        Args:
            tokens: Iterable of tokens in id order (list, tuple, generator...).
                ``None`` is treated as an empty iterable. If ``"<unk>"`` is
                not among the tokens it is appended as the last entry.
        """
        # id -> token: position in the list is the token's id
        self.idx_to_token = list()
        # token -> id
        self.token_to_idx = dict()

        # Copy into a fresh list: this accepts any iterable, never mutates the
        # caller's sequence, and lets the tokens=None case fall through so
        # that "<unk>" and self.unk are always defined.
        tokens = list(tokens) if tokens is not None else []
        if "<unk>" not in tokens:
            tokens.append("<unk>")
        for token in tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
        self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        """Build a vocabulary from tokenized text.

        Args:
            text: Iterable of sentences, each an iterable of tokens.
            min_freq: Tokens occurring fewer than this many times are dropped
                (they will map to ``<unk>`` on lookup).
            reserved_tokens: Optional user-defined tokens placed right after
                ``<unk>``, in the given order, regardless of corpus frequency.

        Returns:
            A ``Vocab`` laid out as ``<unk>`` (id 0), the reserved tokens,
            then the corpus tokens in order of first appearance.
        """
        # Token -> number of occurrences over the whole corpus
        token_freqs = Counter(token for sentence in text for token in sentence)
        # Special tokens come first; dict.fromkeys drops repeats (e.g. a caller
        # that also lists "<unk>") while keeping first-occurrence order.
        uniq_tokens = list(dict.fromkeys(["<unk>"] + list(reserved_tokens or [])))
        # Skip corpus tokens that are already reserved so a reserved token
        # keeps its low id instead of being duplicated further down the list.
        special = set(uniq_tokens)
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token not in special]
        return cls(uniq_tokens)

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, token):
        """Return the id of ``token``, or the id of ``<unk>`` if it is unknown.

        The ``<unk>`` id is 0 for vocabularies created by ``build``; for a
        token list passed directly to the constructor it is wherever
        ``<unk>`` sits in that list.
        """
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        """Map a sequence of tokens to their ids (unknown tokens -> ``<unk>`` id)."""
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        """Map a sequence of ids back to their tokens."""
        return [self.idx_to_token[index] for index in indices]


def save_vocab(vocab, path):
    """Write the vocabulary to ``path``, one token per line in id order.

    Args:
        vocab: Object exposing ``idx_to_token`` (a list of token strings).
        path: Destination file; it is overwritten if it already exists.
    """
    # Pin the encoding: the platform default (a legacy code page on many
    # Windows setups) cannot encode every token and differs between machines.
    with open(path, 'w', encoding='utf-8') as writer:
        writer.write("\n".join(vocab.idx_to_token))


def read_vocab(path):
    """Load a vocabulary previously written by ``save_vocab``.

    Args:
        path: UTF-8 text file with one token per line, in id order.

    Returns:
        A ``Vocab`` whose ids follow the line order of the file.
    """
    # Must match the encoding used by save_vocab so tokens round-trip intact.
    with open(path, 'r', encoding='utf-8') as f:
        tokens = f.read().split('\n')
    return Vocab(tokens)


In [19]:
# Build the vocabulary from every tokenized sentence in the corpus
# (defaults: min_freq=1, no reserved tokens).
vocab = Vocab.build(sentence_polarity.sents())

In [20]:
# Inspect the token -> id mapping.
# NOTE(review): this renders the entire dictionary; consider displaying only a slice.
vocab.token_to_idx

{'<unk>': 0,
 'simplistic': 1,
 ',': 2,
 'silly': 3,
 'and': 4,
 'tedious': 5,
 '.': 6,
 "it's": 7,
 'so': 8,
 'laddish': 9,
 'juvenile': 10,
 'only': 11,
 'teenage': 12,
 'boys': 13,
 'could': 14,
 'possibly': 15,
 'find': 16,
 'it': 17,
 'funny': 18,
 'exploitative': 19,
 'largely': 20,
 'devoid': 21,
 'of': 22,
 'the': 23,
 'depth': 24,
 'or': 25,
 'sophistication': 26,
 'that': 27,
 'would': 28,
 'make': 29,
 'watching': 30,
 'such': 31,
 'a': 32,
 'graphic': 33,
 'treatment': 34,
 'crimes': 35,
 'bearable': 36,
 '[garbus]': 37,
 'discards': 38,
 'potential': 39,
 'for': 40,
 'pathological': 41,
 'study': 42,
 'exhuming': 43,
 'instead': 44,
 'skewed': 45,
 'melodrama': 46,
 'circumstantial': 47,
 'situation': 48,
 'visually': 49,
 'flashy': 50,
 'but': 51,
 'narratively': 52,
 'opaque': 53,
 'emotionally': 54,
 'vapid': 55,
 'exercise': 56,
 'in': 57,
 'style': 58,
 'mystification': 59,
 'story': 60,
 'is': 61,
 'also': 62,
 'as': 63,
 'unoriginal': 64,
 'they': 65,
 'come': 66,
 

In [22]:
# Training split: the first 4000 sentences of each polarity, encoded as
# (token ids, label) pairs -- label 0 for 'pos', label 1 for 'neg'.
# All 'pos' examples come first, followed by all 'neg' examples.
train_data = [
    (vocab.convert_tokens_to_ids(sentence), label)
    for label, category in enumerate(('pos', 'neg'))
    for sentence in sentence_polarity.sents(categories=category)[:4000]
]

In [23]:
# Test split: every sentence after the first 4000 of each polarity, encoded as
# (token ids, label) pairs -- label 0 for 'pos', label 1 for 'neg'.
# All 'pos' examples come first, followed by all 'neg' examples.
test_data = [
    (vocab.convert_tokens_to_ids(sentence), label)
    for label, category in enumerate(('pos', 'neg'))
    for sentence in sentence_polarity.sents(categories=category)[4000:]
]

In [24]:
# Inspect the first training example: a (token ids, label) pair.
train_data[0]

([23,
  2444,
  61,
  9851,
  76,
  308,
  23,
  1664,
  14509,
  496,
  219,
  14510,
  219,
  4,
  27,
  175,
  363,
  76,
  29,
  32,
  5884,
  201,
  7984,
  73,
  5354,
  4219,
  2,
  14511,
  1204,
  2701,
  25,
  2184,
  14512,
  6],
 0)

In [69]:
import jieba
import json

# Load the training records from trains.json (read as UTF-8).
# NOTE(review): `data` shadows the `from nltk import data` import made earlier,
# and `train_data` below replaces the earlier list of (token ids, label) pairs.
with open('trains.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Tokenize the 'sentence' field of every record with jieba (cut_all=False).
# Result layout: [[tokens of sentence 1], [tokens of sentence 2], ...]
train_data = [
    list(jieba.cut(sent['sentence'], cut_all=False))
    for sent in data
]

In [70]:
# Number of tokenized sentences loaded from trains.json.
print(len(train_data))

10000


In [71]:
# Rebuild the vocabulary from the jieba-tokenized sentences; this replaces the
# vocabulary built from sentence_polarity earlier in the notebook.
vocab = Vocab.build(train_data)

In [72]:
# Vocabulary size: number of distinct tokens, including <unk>.
len(vocab.token_to_idx)

69808