# Sub Fields Description

TODO: about hyper fields

# Char Information


A Token is made up of Chars.

A Word-Token can be made up of several Chars, while a Char-Token consists of exactly one Char.


Use Word-Token as an example:


北京 --> 北 京 

Beijing --> B e i j i n g


In this section, we derive additional information for each individual char only (not for the whole token).

## Char Itself

In [1]:

def charGrainChar(char, end_grain = False):
    '''Char level only: return the grain list for a single char.

    The result is a new list holding the char itself; when `end_grain` is
    truthy the end marker 'ch0' is added after it.
    '''
    if not end_grain:
        return [char]
    return [char, 'ch0']


print(charGrainChar('京'))
print(charGrainChar('j'))


['京']
['j']


In [2]:

print(charGrainChar('京', end_grain= True))
print(charGrainChar('j', end_grain= True))


['京', 'ch0']
['j', 'ch0']


## Chinese Char's SubComp

For an English char (a letter), only the char itself is returned.

In [3]:
import pickle


# Load the char -> sub-component lookup table that subcompGrainChar reads.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file from a trusted source.
with open('nlptext/sources/CharSubComp.p', 'rb') as handle:
    CharSubCompInfos = pickle.load(handle)

def subcompGrainChar(char, end_grain = False):
    '''Char level only: return the sub-component grains of a single char.

    * char not in CharSubCompInfos: the char itself is the only grain.
    * char in CharSubCompInfos with a non-empty entry: every element of the
      entry, prefixed with 'c'.
    * char in CharSubCompInfos with an empty entry: 'c' + char.

    When `end_grain` is truthy the end marker 'c0' is added last.
    '''
    if char not in CharSubCompInfos:
        grains = [char]
    else:
        components = CharSubCompInfos[char]
        if components:
            grains = ['c' + comp for comp in components]
        else:
            grains = ['c' + char]

    # `grains` is always a list built above, so appending cannot touch the table.
    if end_grain:
        grains.append('c0')
    return grains


print(subcompGrainChar('京'))
print(subcompGrainChar('j'))


['c67', 'c119', 'c159']
['j']


In [4]:

print(subcompGrainChar('京', end_grain= True))
print(subcompGrainChar('j', end_grain= True))


['c67', 'c119', 'c159', 'c0']
['j', 'c0']


## Wrap as Token Information

In [5]:
import pickle


# Load the char -> sub-component lookup table that subcompGrainChar reads.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file from a trusted source.
with open('nlptext/sources/CharSubComp.p', 'rb') as handle:
    CharSubCompInfos = pickle.load(handle)

def subcompGrainChar(char, end_grain = False):
    '''Char level only: map one char to its list of sub-component grains.

    A char missing from CharSubCompInfos is returned as its own grain. A char
    present in the table yields its components, each prefixed with 'c'; if the
    table entry is empty, the grain is 'c' + char. A truthy `end_grain` adds
    the end marker 'c0' at the end of the list.
    '''
    tail = ['c0'] if end_grain else []

    if char not in CharSubCompInfos:
        return [char] + tail

    components = CharSubCompInfos[char]
    if not components:
        return ['c' + char] + tail
    return ['c' + comp for comp in components] + tail

def subcompGrainToken(token, end_grain = False):
    '''Concatenate the sub-component grains of every char in `token`, in order.

    `end_grain` is forwarded to subcompGrainChar for each char, so with a
    truthy value every char's grains are followed by their own end marker.
    '''
    grains = []
    for char in token:
        grains.extend(subcompGrainChar(char, end_grain))
    return grains


In [6]:
token = '北京'
channel = 'subcomp'
end_grain = True

subcompGrainToken(token, end_grain = end_grain)

['c117', 'c24', 'c0', 'c67', 'c119', 'c159', 'c0']

In [7]:
token = 'Beijing'
end_grain = False

subcompGrainToken(token,  end_grain = end_grain)

['B', 'e', 'i', 'j', 'i', 'n', 'g']

# Token Information

## Syllable

In [8]:
import pyphen


def syllableGrainToken(token, end_grain = False):
    '''Split `token` into pieces at the hyphenation points pyphen finds for English.

    `end_grain` is accepted so the signature matches the other grain
    functions, but this function does not use it.

    Because the hyphenated string is split on '-', a '-' already present in
    `token` also acts as a split point.
    '''
    # The language is fixed to English; see pyphen.LANGUAGES for other codes.
    hyphenator = pyphen.Pyphen(lang='en')
    hyphenated = hyphenator.inserted(token)
    return hyphenated.split('-')


token = 'Beijing'
syllableGrainToken(token)

['Bei', 'jing']

## Phoneme

In [9]:

# Load the word -> phoneme lookup table that phonemeGrainToken reads.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file from a trusted source.
with open('nlptext/sources/WordPhoneme.p', 'rb') as handle:
    WordPhenomeInfo = pickle.load(handle)

def phonemeGrainToken(token, end_grain = False):
    '''Look up the phoneme list of `token` in WordPhenomeInfo.

    The lookup key is `token.lower()`. When the token is not in the table
    (or has no `.lower()`, e.g. it is not a string) the fallback `['']` is
    returned instead of raising.

    `end_grain` is accepted so the signature matches the other grain
    functions, but this function does not use it.

    Note: on a hit the list stored in WordPhenomeInfo is returned as-is, not
    a copy, so callers should not mutate it.
    '''
    try:
        return WordPhenomeInfo[token.lower()]
    except (KeyError, AttributeError):
        # Only the expected "unknown token" failures are turned into the
        # fallback; anything else (e.g. KeyboardInterrupt) now propagates.
        return ['']

token = 'Beijing'
phonemeGrainToken(token)



['B', 'EY2', 'ZH', 'IH1', 'NG']

# Wrap Token-Based Channel

In [10]:
from nlptext.utils.channel import Channel_Ind_Methods

Channel_Ind_Methods

{'char': <function nlptext.utils.channel.charGrainToken(token, end_grain=False)>,
 'basic': <function nlptext.utils.channel.basicGrainToken(token, end_grain=False)>,
 'medical': <function nlptext.utils.channel.medicalGrainToken(token, end_grain=False)>,
 'radical': <function nlptext.utils.channel.radicalGrainToken(token, end_grain=False)>,
 'subcomp': <function nlptext.utils.channel.subcompGrainToken(token, end_grain=False)>,
 'stroke': <function nlptext.utils.channel.strokeGrainToken(token, end_grain=False)>,
 'pinyin': <function nlptext.utils.channel.pinyinGrainToken(token, end_grain=False)>,
 'syllable': <function nlptext.utils.channel.syllableGrainToken(token, end_grain=False)>,
 'phoneme': <function nlptext.utils.channel.phonemeGrainToken(token, end_grain=False)>}

In [12]:
method = Channel_Ind_Methods['phoneme']
method('Beijing', end_grain = True)

['B', 'EY2', 'ZH', 'IH1', 'NG']

In [13]:

def getGrainNgrams(subword_infos, n):
    '''Return the n-grams over `subword_infos`.

    For n == 1 a plain list copy of the grains is returned. For larger n each
    window of n consecutive grains is joined with '-'. If n exceeds the number
    of grains there is no full window and the result is an empty list.
    '''
    if n == 1:
        return list(subword_infos)

    total = len(subword_infos)
    if n > total:
        # Open question kept from the original: how should short inputs be handled?
        return []

    return ['-'.join(subword_infos[start:start + n]) for start in range(total - n + 1)]

def grainToken(token, grainTokenFunction, Ngram = 1,Max_Ngram = None, end_grain = True):
    '''Compute the grains of `token` with `grainTokenFunction` and turn them into n-grams.

    Without `Max_Ngram` (None or 0) only the n-grams of size `Ngram` are
    returned. With `Max_Ngram`, `Ngram` is ignored and the n-grams of every
    size from 1 up to and including `Max_Ngram` are concatenated, smallest
    size first.
    '''
    infos = grainTokenFunction(token, end_grain = end_grain)
    if not Max_Ngram:
        return getGrainNgrams(infos, Ngram)

    collected = []
    for size in range(1, Max_Ngram + 1):
        collected.extend(getGrainNgrams(infos, size))
    return collected

def getChannelGrain4Token(token, channel, Ngram = 1, Max_Ngram = None,  end_grain = False):
    '''Return the grains of `token` for the requested channel.

    The special channel 'token' returns the token itself as a one-element
    list. Any channel registered in Channel_Ind_Methods is delegated to
    grainToken with that channel's grain function. For an unknown channel a
    message is printed and None is returned.
    '''
    if channel == 'token':
        return [token]

    if channel not in Channel_Ind_Methods:
        print('The Channel "', channel, '" is not available currently!')
        return None

    grain_function = Channel_Ind_Methods[channel]
    return grainToken(token, grain_function, Ngram = Ngram, Max_Ngram = Max_Ngram, end_grain = end_grain)


In [14]:
token = '北京'
channel = 'subcomp'
getChannelGrain4Token(token, channel, Ngram = 2, Max_Ngram = None,  end_grain = False)

['c67-c119', 'c119-c159']

In [15]:
token = 'Beijng'
channel = 'syllable'
getChannelGrain4Token(token, channel, Ngram =1, Max_Ngram = 2,  end_grain = False)

['Bei', 'jng', 'Bei-jng']

In [16]:
token = 'Beijing'
channel = 'phoneme'

getChannelGrain4Token(token, channel, Ngram =1, Max_Ngram = None, end_grain = False)

['B', 'EY2', 'ZH', 'IH1', 'NG']

# Export Function

## `getChannelGrain4Token`

In [1]:
from nlptext.utils.channel import getChannelGrain4Token

token = 'Beijing'
channel = 'phoneme'

getChannelGrain4Token(token, channel, Ngram =1, Max_Ngram = None, end_grain = False)

['B', 'EY2', 'ZH', 'IH1', 'NG']

In [2]:
token = '北京'
channel = 'subcomp'
getChannelGrain4Token(token, channel, Ngram = 2, Max_Ngram = None,  end_grain = False)

['c117-c24', 'c24-c67', 'c67-c119', 'c119-c159']

## `getChannelGrain4Sent`

In [3]:
from nlptext.utils.channel import getChannelGrain4Sent


['北京', '是', '中国', '的', '首都']

In [8]:
sent = '北京 是 中国 的 首都'.split(' ')
print(sent)
channel = 'subcomp'

getChannelGrain4Sent(sent, channel, Ngram = 2, Max_Ngram = None, end_grain = False)

['北京', '是', '中国', '的', '首都']


[['c117-c24', 'c24-c67', 'c67-c119', 'c119-c159'],
 ['c209-c1', 'c1-c207'],
 ['c214-c120', 'c120-c173', 'c173-c6'],
 ['c331-c64', 'c64-c6'],
 ['c140-c392', 'c392-c180', 'c180-c209', 'c209-c166']]

In [10]:
sent = 'Beijing is the capital of China'.split(' ')
print(sent)
channel = 'phoneme'

getChannelGrain4Sent(sent, channel, Ngram = 1, Max_Ngram = None, end_grain = False)

['Beijing', 'is', 'the', 'capital', 'of', 'China']


[['B', 'EY2', 'ZH', 'IH1', 'NG'],
 ['IH1', 'Z'],
 ['DH', 'AH0'],
 ['K', 'AE1', 'P', 'AH0', 'T', 'AH0', 'L'],
 ['AH1', 'V'],
 ['CH', 'AY1', 'N', 'AH0']]

# Get  Vocab and Freq of Sub Fields

In [11]:
from pprint import pprint
from nlptext.base import BasicObject

# Notebook cell: settings handed to BasicObject.INIT for the sample wiki corpus.
# The recorded output that follows this cell shows INIT reading
# corpus/wiki/sample_wiki_smp.txt and dumping results under data/wiki/word/.
# NOTE(review): the meaning of each setting is defined by BasicObject.INIT,
# which is not visible here — confirm against nlptext.base before changing any.
########### Wiki ###########
CORPUSPath = 'corpus/wiki/'
corpusFileIden = '.txt'

textType   = 'line'

Text2SentMethod  = 'whole'

# sentence to tokens
Sent2TokenMethod = 'pos'
TOKENLevel = 'word'
use_hyper = True

anno = False
annoKW = {}

# The first eight arguments are passed positionally, so their order matters.
BasicObject.INIT(CORPUSPath, corpusFileIden, textType, 
                 Text2SentMethod, Sent2TokenMethod, TOKENLevel, 
                 anno, annoKW, use_hyper = use_hyper)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


'File'
corpus/wiki/sample_wiki_smp.txt


Loading model cost 0.602 seconds.
Prefix dict has been built succesfully.


Total Num of All    Tokens 41843
Total Num of Unique Tokens 5965
CORPUS	it is Dumped into file: data/wiki/word/Pyramid/CORPUS.p
CORPUS	the length of it is   : 1
GROUP	it is Dumped into file: data/wiki/word/Pyramid/GROUP.p
GROUP	the length of it is   : 1
TEXT	it is Dumped into file: data/wiki/word/Pyramid/TEXT.p
TEXT	the length of it is   : 500
SENT	it is Dumped into file: data/wiki/word/Pyramid/SENT.p
SENT	the length of it is   : 500
TOKEN	it is Dumped into file: data/wiki/word/Pyramid/TOKEN.p
TOKEN	the length of it is   : 41843
**************************************** 

pos-es	is Dumped into file: data/wiki/word/Vocab/pos-es.voc
pos-es	the length of it is   : 229
		Write to: data/wiki/word/Vocab/pos-es.tsv
token	is Dumped into file: data/wiki/word/Vocab/token.voc
token	the length of it is   : 5965
		Write to: data/wiki/word/Vocab/token.tsv
****************************************


In [13]:
LTU, DTU = BasicObject.TokenVocab

In [17]:
idx2freq = BasicObject.idx2freq

idx2freq

array([3218, 2356, 1415, ...,    1,    1,    1])

In [76]:
def get_num_freq(idx2freq, max_vocab_token_num = None, min_token_freq = 1):
    '''Resolve a consistent (max_vocab_token_num, min_token_freq) pair and print coverage.

    Parameters
    ----------
    idx2freq : array-like of token frequencies, indexed by token id.
        Assumed to be sorted in descending order (as in the sample
        `BasicObject.idx2freq`), because the vocabulary is cut by taking a
        prefix — TODO confirm for other callers.
    max_vocab_token_num : optional cap on the number of tokens to keep.
        Only consulted when `min_token_freq` is falsy (None or 0).
    min_token_freq : minimum frequency a token needs to be kept. When truthy
        it takes precedence and `max_vocab_token_num` is derived from it.

    Returns
    -------
    (max_vocab_token_num, min_token_freq)

    Raises
    ------
    ValueError
        If both `min_token_freq` and `max_vocab_token_num` are falsy.
    '''
    # Accept plain lists as well as arrays; boolean masking needs an ndarray.
    idx2freq = np.asarray(idx2freq)
    vocab_size = len(idx2freq)

    if min_token_freq:
        max_vocab_token_num = len(idx2freq[idx2freq >= min_token_freq])

    elif max_vocab_token_num:
        if max_vocab_token_num > vocab_size:
            max_vocab_token_num, min_token_freq = vocab_size, 1
        else:
            # Frequency of the last token that is still kept. The original
            # indexed the integer `max_vocab_token_num` itself, which raised
            # a TypeError.
            min_token_freq = int(idx2freq[max_vocab_token_num - 1])
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError('Error in max_vocab_token_num and min_token_freq')

    print('max_vocab_token_num  is:', max_vocab_token_num)
    print('min_token_freq       is:', min_token_freq)
    print('Corpus coverage rate is:', np.sum(idx2freq[:max_vocab_token_num]) / np.sum(idx2freq))
    print('Token  coverage rate is:', max_vocab_token_num / len(idx2freq))
    return max_vocab_token_num,  min_token_freq


In [77]:
len(LTU)

5965

In [86]:
##################################################################################################LTU_LGU-LT
from datetime import datetime

import numpy as np
def get_GU_or_LKP(TokenVocab, tkidx2freq, channel= 'char', Max_Ngram = 1, end_grain = False, 
                  max_vocab_token_num = None, min_token_freq = 1, min_grain_freq = 1):
    '''Build the grain vocabulary and the token -> grain lookup for one channel.

    Parameters
    ----------
    TokenVocab : pair (LTU, DTU) — the token list and the token -> index dict.
    tkidx2freq : token frequencies indexed by token id; passed to
        get_num_freq to decide how many leading tokens of LTU are kept, and
        used to weight every grain by the frequency of the token it occurs in.
    channel, Max_Ngram, end_grain : forwarded to getChannelGrain4Token.
    max_vocab_token_num, min_token_freq : forwarded to get_num_freq.
    min_grain_freq : grains whose accumulated frequency is below this value
        are dropped from the vocabulary and from every lookup entry.

    Returns
    -------
    ((LGU, DGU), LKP, grainidx2freq)
        LGU : kept grains, most frequent first.
        DGU : grain -> index in LGU.
        LKP : for each kept token, the LGU indices of its grains in the order
              the channel produced them (dropped grains removed).
        grainidx2freq : frequencies aligned with LGU.

    Note: grain frequencies are accumulated only over the tokens kept in the
    (possibly truncated) vocabulary, as count-in-token * token frequency.
    '''
    # Local import: the file does not import `collections` at the top.
    import collections

    LTU, DTU = TokenVocab
    max_vocab_token_num, min_token_freq = get_num_freq(tkidx2freq, max_vocab_token_num = max_vocab_token_num, 
                                                       min_token_freq = min_token_freq)
    LTU = LTU[:max_vocab_token_num]

    # First pass: grains are numbered in order of first appearance ("old" indices).
    oldLGU = []
    oldDGU = {}
    oldidx2freq = []
    LKP = []

    print('For channel: |', channel, '| build GrainUnique and LookUp')
    for idx, token in enumerate(LTU):
        # Use the frequency table passed in by the caller. The original read
        # the notebook-level global `idx2freq` here, ignoring `tkidx2freq`.
        token_freq = tkidx2freq[DTU[token]]
        ChN = getChannelGrain4Token(token, channel, Max_Ngram = Max_Ngram, end_grain = end_grain)
        for gr, count in collections.Counter(ChN).most_common():
            if gr in oldDGU:
                oldidx2freq[oldDGU[gr]] += count * token_freq
            else:
                oldDGU[gr] = len(oldDGU)
                oldLGU.append(gr)
                oldidx2freq.append(count * token_freq)

        LKP.append([oldDGU[gr] for gr in ChN])
        if idx % 100000 == 0:
            print('\t\tFor Channel:', channel, '\t', idx, datetime.now())

    assert len(LKP) == len(LTU)
    del oldDGU

    # Second pass: re-number grains by descending frequency ("new" indices).
    oldidx2freq = np.array(oldidx2freq)
    max_grain_num = len(oldidx2freq[oldidx2freq >= min_grain_freq])

    grainidx2freq = np.sort(oldidx2freq)[::-1]
    newidx2oldidx = np.argsort(oldidx2freq)[::-1]
    del oldidx2freq

    oldidx2newidx = np.zeros(len(newidx2oldidx), dtype= int)
    for new_idx, old_idx in enumerate(newidx2oldidx):
        oldidx2newidx[old_idx] = new_idx

    # Rewrite every lookup entry with new indices, dropping low-frequency grains.
    for tkidx, grainlist in enumerate(LKP):
        remapped = [oldidx2newidx[oldidx] for oldidx in grainlist]
        LKP[tkidx] = [newidx for newidx in remapped if grainidx2freq[newidx] >= min_grain_freq]
    del oldidx2newidx

    # Frequencies are sorted descending, so the kept grains are exactly the
    # first `max_grain_num` new indices.
    LGU = [oldLGU[old_idx] for old_idx in newidx2oldidx[:max_grain_num]]
    del oldLGU
    del newidx2oldidx

    DGU = {grain: new_idx for new_idx, grain in enumerate(LGU)}

    grainidx2freq = grainidx2freq[:max_grain_num]

    return (LGU, DGU), LKP, grainidx2freq

In [87]:
(LGU, DGU), LKP, grainidx2freq = get_GU_or_LKP(BasicObject.TokenVocab, BasicObject.idx2freq, 
                  channel= 'subcomp', Max_Ngram = 1, end_grain = False, 
                  max_vocab_token_num = None, min_token_freq = 2, min_grain_freq = 1)

max_vocab_token_num  is: 2780
min_token_freq       is: 2
Corpus coverage rate is: 0.9238821308223598
Token  coverage rate is: 0.4660519698239732
For channel: | subcomp | build GrainUnique and LookUp
		For Channel: subcomp 	 0 2019-07-02 18:29:02.661262


In [80]:
LGU

['c6',
 'c1',
 'c119',
 'c1111',
 'c64',
 'c331',
 'c209',
 'c5',
 'c45',
 'c46',
 'c67',
 ',',
 'c99',
 'c23',
 'c20',
 'c2',
 'c139',
 'c72',
 'c277',
 'c3',
 'c207',
 'c162',
 'c113',
 'c88',
 'c326',
 '。',
 'c41',
 'c84',
 'c83',
 'c253',
 'c49',
 'c47',
 'c11',
 'c19',
 'c238',
 'c220',
 'c82',
 'c106',
 'c185',
 'c221',
 'c140',
 'c108',
 'c17',
 'c405',
 'c159',
 'c232',
 'c127',
 'c173',
 'c42',
 '、',
 'c135',
 'c257',
 'c339',
 'c39',
 'c156',
 'c208',
 'c154',
 'c32',
 'c27',
 'c358',
 'c48',
 'c43',
 'c247',
 'c174',
 'c55',
 'c214',
 'c24',
 'c391',
 'c105',
 'c180',
 'c120',
 'c101',
 'c71',
 'c297',
 'c202',
 'c70',
 'c166',
 'c429',
 'c63',
 'c85',
 'c298',
 'c81',
 'c9',
 'c100',
 'c142',
 'c191',
 'c44',
 'c366',
 'c61',
 'c291',
 'c62',
 'c287',
 'c301',
 'c322',
 'c38',
 'c128',
 'c50',
 'c89',
 'c80',
 'c69',
 'c122',
 'c130',
 'c93',
 'c114',
 'c97',
 'c248',
 'c22',
 'c183',
 'c138',
 'c53',
 'c90',
 'c295',
 'c4',
 'c382',
 'c152',
 'c204',
 'c161',
 'c96',
 'c42

In [81]:
DGU

{'c6': 0,
 'c1': 1,
 'c119': 2,
 'c1111': 3,
 'c64': 4,
 'c331': 5,
 'c209': 6,
 'c5': 7,
 'c45': 8,
 'c46': 9,
 'c67': 10,
 ',': 11,
 'c99': 12,
 'c23': 13,
 'c20': 14,
 'c2': 15,
 'c139': 16,
 'c72': 17,
 'c277': 18,
 'c3': 19,
 'c207': 20,
 'c162': 21,
 'c113': 22,
 'c88': 23,
 'c326': 24,
 '。': 25,
 'c41': 26,
 'c84': 27,
 'c83': 28,
 'c253': 29,
 'c49': 30,
 'c47': 31,
 'c11': 32,
 'c19': 33,
 'c238': 34,
 'c220': 35,
 'c82': 36,
 'c106': 37,
 'c185': 38,
 'c221': 39,
 'c140': 40,
 'c108': 41,
 'c17': 42,
 'c405': 43,
 'c159': 44,
 'c232': 45,
 'c127': 46,
 'c173': 47,
 'c42': 48,
 '、': 49,
 'c135': 50,
 'c257': 51,
 'c339': 52,
 'c39': 53,
 'c156': 54,
 'c208': 55,
 'c154': 56,
 'c32': 57,
 'c27': 58,
 'c358': 59,
 'c48': 60,
 'c43': 61,
 'c247': 62,
 'c174': 63,
 'c55': 64,
 'c214': 65,
 'c24': 66,
 'c391': 67,
 'c105': 68,
 'c180': 69,
 'c120': 70,
 'c101': 71,
 'c71': 72,
 'c297': 73,
 'c202': 74,
 'c70': 75,
 'c166': 76,
 'c429': 77,
 'c63': 78,
 'c85': 79,
 'c298': 80,
 'c81

In [82]:
grainidx2freq

array([12015,  9213,  6980,  4868,  3565,  3463,  3430,  3429,  2791,
        2755,  2478,  2356,  2131,  1868,  1788,  1781,  1621,  1599,
        1520,  1498,  1483,  1479,  1459,  1437,  1424,  1415,  1328,
        1291,  1250,  1187,  1182,  1177,  1151,  1118,  1067,  1058,
        1042,   960,   958,   941,   921,   888,   818,   801,   785,
         772,   748,   739,   733,   729,   722,   677,   674,   672,
         666,   662,   647,   629,   629,   628,   619,   619,   606,
         586,   585,   585,   582,   562,   557,   555,   551,   550,
         541,   538,   536,   533,   530,   520,   516,   498,   494,
         484,   475,   462,   457,   447,   438,   426,   421,   417,
         416,   414,   406,   405,   400,   400,   398,   396,   390,
         388,   386,   383,   373,   373,   368,   353,   352,   344,
         343,   333,   332,   326,   316,   315,   315,   308,   301,
         296,   286,   284,   273,   271,   270,   270,   269,   267,
         262,   260,

In [88]:

###############################################################################################################
def getChannelName(channel, Max_Ngram = 1,  end_grain = False, tagScheme = 'BIO', min_grain_freq = 1,
                   style = 'normal', channel_name = None, channel_name_abbr = None, **kwargs):
    '''Build a channel name from its settings, or parse the settings back out of a name.

    Name layout: <channel><Max_Ngram><e><-tagscheme><-fN>, where each part is
    omitted when it is at its default (Max_Ngram <= 1, end_grain falsy,
    tagScheme == 'BIO', min_grain_freq <= 1). Example: 'subcomp9e-f3'.

    Parameters
    ----------
    channel : the full channel name (e.g. 'subcomp').
    Max_Ngram, end_grain, tagScheme, min_grain_freq : the settings to encode
        (styles 'normal' / 'abbr'), or the fallbacks used for parts that the
        parsed name stops short of (style 'extract').
    style : 'normal' builds a name from `channel`; 'abbr' builds it from
        CHANNEL_ABBR[channel]; 'extract' parses `channel_name` or, if that is
        falsy, `channel_name_abbr`.
    **kwargs : ignored; lets callers pass extra keywords.

    Returns
    -------
    str for 'normal' / 'abbr'.
    (channel, Max_Ngram, end_grain, tagScheme, min_grain_freq) for 'extract',
    always five elements and with `min_grain_freq` as an int. (The original
    returned a str for the parsed frequency and, on its early-return paths, a
    four-element tuple that dropped the frequency.)
    None, after printing an error, for any other combination.

    Raises
    ------
    AssertionError
        In 'extract' style with `channel_name`, if `channel` does not occur in it.
    '''
    if style == 'normal':
        return _buildChannelName(channel, Max_Ngram, end_grain, tagScheme, min_grain_freq)

    elif style == 'abbr':
        return _buildChannelName(CHANNEL_ABBR[channel], Max_Ngram, end_grain, tagScheme, min_grain_freq)

    elif channel_name and style == 'extract':
        assert channel in channel_name
        name, min_grain_freq = _splitGrainFreq(channel_name)
        Max_Ngram, end_grain, tagScheme = _parseChannelSuffix(name[len(channel):],
                                                              Max_Ngram, end_grain, tagScheme)
        return channel, Max_Ngram, end_grain, tagScheme, min_grain_freq

    elif channel_name_abbr and style == 'extract':
        channel_abbr = CHANNEL_ABBR[channel]
        name, min_grain_freq = _splitGrainFreq(channel_name_abbr)
        Max_Ngram, end_grain, tagScheme = _parseChannelSuffix(name[len(channel_abbr):],
                                                              Max_Ngram, end_grain, tagScheme)
        return channel, Max_Ngram, end_grain, tagScheme, min_grain_freq

    else:
        print('Error in getChannelName')


def _buildChannelName(prefix, Max_Ngram, end_grain, tagScheme, min_grain_freq):
    '''Join `prefix` with the encoded settings, skipping every part that is at its default.'''
    MN = str(Max_Ngram) if Max_Ngram > 1 else ''
    e  = 'e' if end_grain else ''
    tS = '-' + tagScheme.replace('BIO', '').lower() if tagScheme != 'BIO' else ''
    f  = '-f' + str(min_grain_freq) if min_grain_freq > 1 else ''
    return prefix + MN + e + tS + f


def _splitGrainFreq(name):
    '''Split a trailing '-f<N>' off `name`; return (name_without_it, N as int), N = 1 if absent.'''
    if '-f' in name:
        # rsplit with maxsplit 1: the frequency marker is always the last part.
        name, freq_text = name.rsplit('-f', 1)
        return name, int(freq_text)
    return name, 1


def _parseChannelSuffix(suffix, Max_Ngram, end_grain, tagScheme):
    '''Parse "<Max_Ngram><e><-tagscheme>" and return (Max_Ngram, end_grain, tagScheme).

    The passed-in values are returned unchanged for the parts the suffix ends
    before: an empty suffix keeps all three, and a suffix that is only a digit
    keeps `end_grain` and `tagScheme`.
    '''
    if not suffix:
        return Max_Ngram, end_grain, tagScheme

    # Only a single leading digit 2-9 is recognised as Max_Ngram.
    if suffix[0] in '23456789':
        Max_Ngram = int(suffix[0])
        suffix = suffix[1:]
        if not suffix:
            return Max_Ngram, end_grain, tagScheme
    else:
        Max_Ngram = 1

    end_grain = suffix[0] == 'e'
    scheme_text = (suffix[1:] if end_grain else suffix).upper()
    if scheme_text in ('-ES', '-E', '-S'):
        tagScheme = 'BIO' + scheme_text[1:]
    else:
        tagScheme = 'BIO'
    return Max_Ngram, end_grain, tagScheme

In [94]:
from nlptext.utils.channel import CHANNEL_ABBR

getChannelName(channel = 'subcomp', Min_Ngram = 1, Max_Ngram = 9,  end_grain = True, tagScheme = 'BIO', min_grain_freq = 3,
                   style = 'normal', channel_name = None, channel_name_abbr = None)

'subcomp9e-f3'