In [74]:
from typing import Optional, List, Tuple
import os
import sys
from argparse import ArgumentParser
from tqdm import tqdm
import torch

## 1. 词频字典：通过约193w数据统计构造

In [81]:
# Load the precomputed frequency vocabulary: a list of (feature id, occurrence count) pairs.
# NOTE(review): torch.load unpickles the file, so only load vocabulary files from a trusted source.
vocab = torch.load("./smiles_vocab_190w.pt")

In [84]:
# Vocabulary size (number of distinct feature ids)
len(vocab)

220682

In [109]:
# Entries are sorted by descending frequency: (feature id, occurrence count).
# Show only the head of the list; dumping all 220682 entries bloats the notebook output.
vocab[:30]

[(1763725584049666355, 1796032),
 (735972946868555642, 1696746),
 (6260505905572191606, 1605142),
 (18158413337196038376, 1512787),
 (1067127966115687587, 1310083),
 (6179043702226148714, 1221454),
 (9368696454794743891, 1037030),
 (5010282239590355792, 1002823),
 (14061540360439089399, 1002665),
 (7628424680246176688, 1002654),
 (11196928345026846018, 973371),
 (12631275888259905431, 910054),
 (18421543954717774246, 850727),
 (14839357416249598759, 788941),
 (16591264855984744421, 785709),
 (13837189228176952228, 779937),
 (933380421294911611, 750210),
 (4599494775249237052, 750089),
 (16294449445368438630, 727904),
 (9539911685505501156, 726255),
 (4152132880567326599, 714150),
 (6335251982134985537, 666633),
 (3499315946157484807, 649320),
 (3158030302236397308, 637007),
 (11997984232217996515, 636916),
 (11606906963360101319, 625905),
 (4100995352714331354, 624658),
 (16610627509796293476, 617730),
 (3600599191630089050, 609244),
 (17441575015459610259, 597456),
 (74085888724339563

In [110]:
# Highest frequency entry, and how many features share that frequency
most_freq = max(vocab, key=lambda entry: entry[1])
print(f"最高词频：{most_freq}")
print(f"最高词频特征个数：{sum(1 for entry in vocab if entry[1] == most_freq[1])}")

最高词频：(1763725584049666355, 1796032)
最高词频特征个数：1


In [111]:
# Lowest frequency entry, and how many features share that frequency
lowest_freq = min(vocab, key=lambda entry: entry[1])
print(f"最低词频：{lowest_freq}")
print(f"最低词频特征个数：{sum(1 for entry in vocab if entry[1] == lowest_freq[1])}")

最低词频：(833457525658361873, 1)
最低词频特征个数：53276


## 2. Onehot编码：根据词频字典维度编码

In [209]:
class FingerPrints(object):
    """
    One-hot encoder over a frequency-filtered vocabulary of feature ids.

    The vocabulary is a sequence of ``(feature id, occurrence count)`` pairs.
    Only entries whose count lies strictly between ``lower`` and ``upper``
    are kept; each surviving feature id is assigned one one-hot dimension.

    Statistics recorded for the default vocabulary file:
        instance size:   1936962 (~1.9M)
        vocabulary size: 220682  (~220K)
        most frequent:   (1763725584049666355, 1796032)

    Attributes:
        lower, upper:         exclusive frequency bounds.
        vocab_freq:           the unfiltered vocabulary.
        vocab_freq_squeezed:  the vocabulary after frequency filtering.
        dict:                 mapping of feature id -> one-hot index.
    """
    def __init__(self, lower, upper, vocab_path="./smiles_vocab_190w.pt", vocab_freq=None):
        """
        Args:
            lower: exclusive lower bound on the occurrence count.
            upper: exclusive upper bound on the occurrence count.
            vocab_path: file to load the vocabulary from when ``vocab_freq``
                is not supplied.
            vocab_freq: optional already-loaded vocabulary; when given,
                nothing is read from disk.
        """
        self.lower = lower
        self.upper = upper
        if vocab_freq is None:
            # NOTE(security): torch.load unpickles the file; only point this
            # at vocabulary files from a trusted source.
            vocab_freq = torch.load(vocab_path)
        self.vocab_freq = vocab_freq
        self.vocab_freq_squeezed = self._squeeze_vocab_frequency(self.vocab_freq)
        print(f"取词频范围（{lower}-{upper}）之间的词频字典大小: {len(self.vocab_freq_squeezed)}")
        self.dict = self._create_dictionary(self.vocab_freq_squeezed)
        print(f"字典维度，即onehot编码维度: {len(self.dict)}")

    def _squeeze_vocab_frequency(self, vocab_freq):
        """Return the entries whose count is strictly between lower and upper."""
        return [entry for entry in vocab_freq if self.lower < entry[1] < self.upper]

    def _create_dictionary(self, vocab_freq_squeezed):
        """Map each feature id to its position in the filtered vocabulary."""
        return {data[0]: index for index, data in enumerate(vocab_freq_squeezed)}

    def _to_onehot(self, fp_list: Optional[List[int]]) -> Tuple[int, int, List[int]]:
        """
        Encode one list of feature ids as a multi-hot vector.

        Args:
            fp_list: feature ids of one sample; ``None`` is treated as empty.

        Returns:
            ``(miss, total, one_hot)`` where ``miss`` counts ids absent from
            the dictionary, ``total`` is the number of ids given, and
            ``one_hot`` has a 1 at the index of every id that was found.
        """
        features = fp_list if fp_list is not None else []
        one_hot = [0] * len(self.dict)
        miss = 0
        for elem in features:
            index = self.dict.get(elem)
            if index is None:
                miss += 1
            else:
                one_hot[index] = 1
        return miss, len(features), one_hot

    def to_onehot(self, csv_path="./Jiang1823Train_.csv"):
        """
        Encode every row of a CSV file and print the overall miss rate.

        Each row is split on commas, empty fields are dropped, and the first
        two and the last remaining fields are skipped; everything in between
        is parsed as an integer feature id. The encoded vectors are not
        retained: only the aggregate share of ids missing from the dictionary
        is reported, and the method returns ``None``.

        Args:
            csv_path: path of the CSV file to read.
        """
        miss_all, total_all = 0, 0
        with open(csv_path) as fp:
            # Stream the file line by line instead of materialising it.
            for line in fp:
                fields = [field for field in line.split(",") if field != ''][2:-1]
                miss, total, _ = self._to_onehot([int(field) for field in fields])
                miss_all += miss
                total_all += total
        # A file without any feature ids is reported as a 0.0 miss rate
        # instead of raising ZeroDivisionError.
        res = miss_all / total_all if total_all else 0.0
        # Scale to percent before rounding so binary floating-point noise
        # (e.g. 12.889999999999999) does not leak into the printed value.
        print(f"\n未命中率：{round(res * 100, 2)}%")

## 2.1 未压缩字典

即使使用约 190 万条数据构造的全量字典，Jiang1823Train_.csv 数据集中仍有 10.2% 的特征在字典中查找不到。

In [210]:
# Baseline, uncompressed dictionary: the exclusive bounds (0, 2296033) keep every
# vocabulary entry (the printed dictionary size equals the full vocabulary size).
fp = FingerPrints(0, 2296033)
fp.to_onehot()

取词频范围（0-2296033）之间的词频字典大小: 220682
字典维度，即onehot编码维度: 220682

未命中率：10.2%


## 2.2 压缩字典：压缩下界

In [211]:
# Compress the dictionary from below: keep only features with frequency > 1.
# The upper bound 1796033 is one above the maximum frequency, so nothing is cut on top.
fp = FingerPrints(1, 1796033)
fp.to_onehot()

取词频范围（1-1796033）之间的词频字典大小: 167406
字典维度，即onehot编码维度: 167406

未命中率：12.55%


In [212]:
# Compress the dictionary from below: keep only features with frequency > 5.
fp = FingerPrints(5, 1796033)
fp.to_onehot()

取词频范围（5-1796033）之间的词频字典大小: 118894
字典维度，即onehot编码维度: 118894

未命中率：16.91%


In [213]:
# Compress the dictionary from below: keep only features with frequency > 10.
fp = FingerPrints(10, 1796033)
fp.to_onehot()

取词频范围（10-1796033）之间的词频字典大小: 100345
字典维度，即onehot编码维度: 100345

未命中率：19.52%


## 2.3 压缩上界

In [214]:
# Compress the dictionary from above: the exclusive upper bound equals the maximum
# frequency (1796032), so only the single most frequent feature is dropped.
fp = FingerPrints(0, 1796032)
fp.to_onehot()

取词频范围（0-1796032）之间的词频字典大小: 220681
字典维度，即onehot编码维度: 220681

未命中率：10.59%


In [217]:
# Compress the dictionary from above: keep only features with frequency < 1000000.
fp = FingerPrints(0, 1000000)
fp.to_onehot()

取词频范围（0-1000000）之间的词频字典大小: 220672
字典维度，即onehot编码维度: 220672

未命中率：12.889999999999999%


In [220]:
# Compress the dictionary from above: keep only features with frequency < 500000.
fp = FingerPrints(0, 500000)
fp.to_onehot()

取词频范围（0-500000）之间的词频字典大小: 220643
字典维度，即onehot编码维度: 220643

未命中率：16.49%


In [223]:
# Compress the dictionary from above: keep only features with frequency < 300000.
fp = FingerPrints(0, 300000)
fp.to_onehot()

取词频范围（0-300000）之间的词频字典大小: 220595
字典维度，即onehot编码维度: 220595

未命中率：19.91%


## 2.4 压缩上下界

In [227]:
# Compress the dictionary from both sides: keep only features with 100 < frequency < 400000.
fp = FingerPrints(100, 400000)
fp.to_onehot()

取词频范围（100-400000）之间的词频字典大小: 51638
字典维度，即onehot编码维度: 51638

未命中率：39.46%


In [229]:
# Export this notebook (statistics.ipynb) to HTML with nbconvert.
!jupyter nbconvert --to html statistics.ipynb

[NbConvertApp] Converting notebook statistics.ipynb to html
[NbConvertApp] Writing 333404 bytes to statistics.html
