# Preprocessing of the corpus
- `HIT-IRLab-Cilin_extended_full_2005.3.3.utf8.txt`
- `Sense_POS.txt`

See also `Cilin_process_guild.md`.

## Cilin processing

In [37]:
# Build three lookup tables from the Cilin thesaurus file.
#
# Word -> list of every 4-character sense code the word was listed under
t_syn_dict = {}
# Words with more than one sense (the ambiguous subset of `t_syn_dict`)
syn_dict = {}
# Sense : words
syn_sense = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(r'./data/HIT-IRLab-Cilin_extended_full_2005.3.3.utf8'
          r'.txt', 'r', encoding='utf-8') as cilin:
    for line in cilin:
        # Split on any run of whitespace. A blank line yields an empty
        # list, and repeated spaces cannot produce empty "words".
        sep = line.split()
        # Skip blank lines and every entry that is not a synonym group
        # (only codes ending in '=' are kept).
        if not sep or not sep[0].endswith('='):
            continue
        # Trim the sense code to its first 4 characters in order to align
        # with the sense format of `Sense_POS.txt`; the rest are the words.
        sense, words = sep[0][0:4], sep[1:]

        # All words of one trimmed sense are merged into a single list,
        # without duplicates and in order of first appearance.
        sense_words = syn_sense.setdefault(sense, [])
        for word in words:
            if word not in sense_words:
                sense_words.append(word)
            # Record the sense for the word; a word listed again under a
            # sense it already has is not recorded twice.
            word_senses = t_syn_dict.setdefault(word, [])
            if sense not in word_senses:
                word_senses.append(sense)

# Keep only the words that carry more than one sense.
for key, key_senses in t_syn_dict.items():
    if len(key_senses) > 1:
        syn_dict[key] = key_senses

In [39]:
syn_dict

{'人': ['Aa01', 'Ab02', 'Dd17', 'De01', 'Dn03'],
 '士': ['Aa01', 'Ab01', 'Ae10', 'Al01'],
 '生人': ['Aa01', 'Aj02'],
 '人口': ['Aa01', 'Dn03'],
 '口': ['Aa01', 'Bk04', 'Bo09', 'Ca14', 'Dn03'],
 '食指': ['Aa01', 'Bk08', 'Dn03'],
 '劳力': ['Aa01', 'Dd14'],
 '劳动力': ['Aa01', 'Dd14'],
 '匹夫': ['Aa01', 'Al04'],
 '个人': ['Aa01', 'Aa02', 'Ed55'],
 '家伙': ['Aa01', 'Ba01', 'Bo01', 'Bo27'],
 '东西': ['Aa01', 'Ba01', 'Da28'],
 '货色': ['Aa01', 'Ba04'],
 '崽子': ['Aa01', 'Bi01'],
 '小子': ['Aa01', 'Aa02', 'Ab04', 'Ah14'],
 '杂种': ['Aa01', 'Ah14'],
 '畜生': ['Aa01', 'Bi01'],
 '混蛋': ['Aa01', 'Ak03', 'Al04'],
 '竖子': ['Aa01', 'Ab04', 'Af02'],
 '鼠辈': ['Aa01', 'Ak03'],
 '者': ['Aa01', 'Dd05', 'Ed61', 'Kd03', 'Kd04'],
 '手': ['Aa01', 'Bk08', 'Fa06', 'Ka24'],
 '匠': ['Aa01', 'Ae02'],
 '客': ['Aa01', 'Ae09', 'Ag10', 'Aj05', 'Aj06', 'An07', 'Dn08', 'Ed61'],
 '主': ['Aa01', 'Aj05', 'Df14', 'Ed56', 'Gb02', 'Ja04'],
 '子': ['Aa01', 'Ab01', 'Ah14', 'Bh13', 'Bl05', 'Di17', 'Dj05', 'Eb36', 'Ed56'],
 '家': ['Aa01', 'Ah08', 'Al02', 'Cb28', 'Di05',

## HIT Corpus processing

In [40]:
import re

# Collect every corpus sentence that contains at least one ambiguous word
# (a word of `syn_dict` tagged with one of its known senses).

syn_lines = []
# The upper bound for synonyms which can appear in one sentence.
syn_count_bound = 15
# Matches a closing bracket together with the tag glued to it, e.g. `]nt`.
bracket_tag = re.compile(r'\]\w*')

# `with` guarantees the file is closed even if a line fails to parse.
with open(r'./data/Sense_POS.txt', 'r', encoding='utf-8') as corpus:
    for line in corpus:
        # Clean the word's hierarchy, e.g.
        # [香港/ns/Cb25 中华/nz/Di02 总商会/n/Di23]nt
        # is transformed to
        # 香港/ns/Cb25 中华/nz/Di02 总商会/n/Di23
        line = bracket_tag.sub('', line.strip().replace('[', ''))
        # Split on whitespace; a blank line yields no tokens and is
        # skipped instead of failing in the field unpacking below.
        sep = line.split()
        if not sep:
            continue
        # Parse every token once into (text, POS, sense) and remember the
        # positions of the ambiguous words.
        index, tmp = [], []
        for i, word in enumerate(sep):
            # Split from the right so that a word text which itself
            # contains '/' still yields exactly three fields. A token
            # with fewer than three fields raises ValueError.
            word_text, word_pos, word_sense = word.rsplit('/', 2)
            tmp.append((word_text, word_pos, word_sense))
            if (word_text in syn_dict
                    and word_sense in syn_dict[word_text]):
                index.append(i)
        # Keep the sentence only if it has at least one ambiguous word and
        # no more than `syn_count_bound` of them.
        if not index or len(index) > syn_count_bound:
            continue
        syn_lines.append((index, tmp))

# Format for `syn_lines`
# 1. `syn_lines` is a list; each element describes where the ambiguous
#    words are located in one sentence.
# 2. Each element is a tuple. Its first item is a list of integers, each
#    one the index of an ambiguous word inside the second item. Its
#    second item is the sentence split into a list of
#    (text, POS, sense) tuples, one per word.

In [41]:
syn_lines[0]

([2, 3, 4, 8, 9, 14],
 [('迈向', 'v', 'Fb01'),
  ('充满', 'v', 'Jd06'),
  ('希望', 'n', 'Df08'),
  ('的', 'u', 'Kd01'),
  ('新', 'a', 'Eb28'),
  ('世纪', 'n', 'Ca18'),
  ('——', 'wp', '-1'),
  ('一九九八年', 'nt', '-1'),
  ('新年', 'nt', 'Ca25'),
  ('讲话', 'n', 'Dk11'),
  ('（', 'wp', '-1'),
  ('附', 'v', '-1'),
  ('图片', 'n', 'Dk18'),
  ('１', 'm', '-1'),
  ('张', 'q', 'Dn08'),
  ('）', 'wp', '-1')])

## Generate the training set

In [62]:
import random


def random_pick(collection: list) -> str:
    """Return one element of ``collection`` chosen uniformly at random.

    An empty ``collection`` raises ``ValueError`` from the index draw.
    """
    position = random.randrange(len(collection))
    return collection[position]

# Exclusively picking another item from list
def random_pick_ex(collec: list, ex_item: str) -> str:
    """Return a random element of ``collec`` that differs from ``ex_item``.

    The choice is uniform over all positions whose value is not equal to
    ``ex_item``, so ``ex_item`` is never returned even when ``collec``
    contains it more than once.

    Args:
        collec: The items to pick from; must contain ``ex_item``.
        ex_item: The item that must not be returned.

    Returns:
        An element of ``collec`` that is not equal to ``ex_item``.

    Raises:
        ValueError: If ``ex_item`` is not in ``collec``, or if ``collec``
            holds no element other than ``ex_item``.
    """
    if ex_item not in collec:
        raise ValueError("%r is not in the collection" % (ex_item,))
    candidates = [item for item in collec if item != ex_item]
    if not candidates:
        raise ValueError(
            "no item other than %r to pick from" % (ex_item,))
    return random.choice(candidates)

# Generating the training set
# Each of the first 738 entries of `syn_lines` yields two consecutive
# lines: the original sentence labelled 1, followed by a copy labelled 0
# in which one ambiguous word is replaced by a word of another sense.
gen_corpus_list = []
for sentence in syn_lines[:738]:
    # Choose which ambiguous word of the sentence gets replaced.
    index = random_pick(sentence[0])
    word, _, sense = sentence[1][index]
    senses = syn_dict[word]
    # Get another sense
    sub_sense = random_pick_ex(senses, sense)
    # Get another word for substitution
    sub_words = syn_sense[sub_sense]
    sub_word = random_pick_ex(sub_words, word)

    # Build the token list once; the negative line reuses it with the
    # chosen position overwritten.
    tmp_sentence = [w[0] for w in sentence[1]]
    positive_line = "%d %s\n" % (1, ' '.join(tmp_sentence))
    tmp_sentence[index] = sub_word
    negative_line = "%d %s\n" % (0, ' '.join(tmp_sentence))
    gen_corpus_list.extend([positive_line, negative_line])

# random.shuffle(gen_corpus_list)
# `with` closes the output file even if a write fails.
with open(r'./data/gen_corpus.plain.utf8.txt', 'w',
          encoding='utf-8') as gen_corpus:
    gen_corpus.writelines(gen_corpus_list)

# Generating the validation set
# Entries of `syn_lines` from position 738 onwards are used. For every
# ambiguous word of a sentence one group is produced: the original
# sentence labelled 1, followed by one line labelled 0 for each other
# sense of that word, with the word replaced by a word of that sense.
gen_valid_list = []
for sentence in syn_lines[738:]:
    # The sentence text is the same for every group, so build it once.
    tokens = [w[0] for w in sentence[1]]
    pos_line = "%d %s\n" % (1, ' '.join(tokens))
    for index in sentence[0]:
        word, _, sense = sentence[1][index]
        tmp_valid_list = [pos_line]
        for sub_sense in syn_dict[word]:
            # The sense actually used in the sentence is the positive
            # sample, so it gets no negative line.
            if sub_sense == sense:
                continue
            sub_word = random_pick_ex(syn_sense[sub_sense], word)
            # Copy the tokens so each negative sample changes one word only.
            tmp_sentence = list(tokens)
            tmp_sentence[index] = sub_word
            neg_line = "%d %s\n" % (0, ' '.join(tmp_sentence))
            tmp_valid_list.append(neg_line)
        gen_valid_list.append(tmp_valid_list)

# `with` closes the output file even if a write fails.
with open(r'./data/gen_valid.plain.utf8.txt', 'w',
          encoding='utf-8') as gen_valid:
    for lines in gen_valid_list:
        gen_valid.writelines(lines)
        # A line holding only '$' terminates each group.
        gen_valid.write('$\n')

In [58]:
# Sanity check of the generated training file: read it two lines at a
# time and print every pair whose sentences are identical once the
# leading label and all spaces are removed.
with open(r'./data/gen_corpus.plain.utf8.txt', 'r', encoding='utf-8') as f:
    # zip(f, f) pulls two consecutive lines per step from the same
    # iterator; an unpaired final line is ignored.
    for l1, l2 in zip(f, f):
        # Drop the first space-separated field (the label) and glue the
        # remaining fields together.
        s1 = ''.join(l1.split(' ')[1:])
        s2 = ''.join(l2.split(' ')[1:])
        if s1 == s2:
            print("%s\n%s\n" % (s1, s2))

## Author
Coded by [mine268](https://github.com/mine268).