In [3]:
import torch
from transformers import pipeline
import nltk
from lemminflect import getAllInflections, getAllLemmas
from nltk.corpus import wordnet as wn
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import semcor
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

# Preparation

In [4]:
# Load the pretrained model (not a dataset): a BERT fill-mask pipeline that
# returns the 10 highest-scoring tokens for a [MASK] position.
unmasker = pipeline('fill-mask', model='bert-base-uncased', top_k=10)
# Clear whatever the pipeline construction printed in this cell.
clear_output()

In [5]:
# Sanity-check the Hugging Face pipeline on a single masked sentence.
sentense = "Letters whose sole [MASK] is to make a political point will not be published."
candidate = unmasker(sentense)
# Reduce every prediction to a (score, token) pair, keeping the pipeline's order.
result = [(prediction['score'], prediction['token_str']) for prediction in candidate]

result

[(0.8293460607528687, 'purpose'),
 (0.06537218391895294, 'aim'),
 (0.026322726160287857, 'goal'),
 (0.013286370784044266, 'object'),
 (0.01147634256631136, 'function'),
 (0.009352392517030239, 'objective'),
 (0.00922191422432661, 'intention'),
 (0.0075449394062161446, 'intent'),
 (0.004589724820107222, 'task'),
 (0.004187298007309437, 'use')]

In [4]:
# Load the raw corpora and split each file into chunks.
def _read_chunks(path, separator):
    """Read a UTF-8 text file, strip surrounding whitespace, split on `separator`."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().strip().split(separator)


# The first three files are split on '. ', the two party files on newlines.
BAME_corpus = _read_chunks('dataset/BAWE.txt', '. ')
big_corpus = _read_chunks('dataset/big.txt', '. ')
paper_corpus = _read_chunks('dataset/paper.txt', '. ')
party_test_corpus = _read_chunks('dataset/party_test.txt', '\n')
party_train_corpus = _read_chunks('dataset/party_train.txt', '\n')

corpuses = [BAME_corpus, big_corpus, paper_corpus, party_test_corpus, party_train_corpus]
cor_names = ["BAME_corpus", "big_corpus", "paper_corpus", "party_test_corpus", "party_train_corpus"]
c_len = len(cor_names)
for corpus_name, corpus_chunks in zip(cor_names, corpuses):
    print(corpus_name, "len:", len(corpus_chunks))

# One flat list holding every chunk, in the same order as `corpuses`.
corpus_combine = [chunk for chunks in corpuses for chunk in chunks]

BAME_corpus len: 244506
big_corpus len: 31564
paper_corpus len: 123656
party_test_corpus len: 70
party_train_corpus len: 637


In [5]:
# Load the AKL word lists: one file per category, entries separated by ', '.
def _read_word_list(path):
    """Read a UTF-8 text file, strip surrounding whitespace, split on ', '."""
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read().strip().split(', ')


noun = _read_word_list("data/noun.txt")
adj = _read_word_list("data/adj.txt")
adv = _read_word_list("data/adv.txt")
verb = _read_word_list("data/verb.txt")
others = _read_word_list("data/others.txt")

# Per-category lists, the flat merged list, and the matching category labels.
AKL_words = [noun, adj, adv, verb, others]
AKL_merge = [word for group in AKL_words for word in group]
types = ["noun", "adj", "adv", "verb", "others"]

In [6]:
# Report how many words each AKL category holds.
a_len = len(AKL_words)
for category, category_words in zip(types, AKL_words):
    print(category, "words:", len(category_words))

noun words: 353
adj words: 180
adv words: 86
verb words: 233
others words: 75


In [7]:
# Preprocess the sentences
# Characters deleted by `preprocess`: the listed punctuation, the backslash and
# the ASCII digits. The apostrophe and the backtick are not in the set, so they
# survive. The backslash is written as an explicit escape ("\\") because a bare
# "\," is an invalid escape sequence that newer Python versions warn about.
_PREPROCESS_STRIP_CHARS = '!()-[]{};:"\\,<">./?@#$%^&*_~1234567890'
_PREPROCESS_TABLE = str.maketrans('', '', _PREPROCESS_STRIP_CHARS)


def preprocess(text):
    """
    Normalise one piece of text.

    input: a string
    output: a string (NOT a list; whitespace is left exactly as it was)
    - transform to lower case
    - delete every character listed in `_PREPROCESS_STRIP_CHARS`
    """
    # A single translate pass replaces the former per-character replace loop.
    return text.lower().translate(_PREPROCESS_TABLE)

# Preprocessed copy of every chunk in `corpus_combine`, in the same order.
corpus = [preprocess(raw_chunk) for raw_chunk in corpus_combine]

# Function definition

In [8]:
def check_word_exist(st, base_word):
    """
    Return True if any inflected form of `base_word` is a token of `st`.

    `st` is split on single spaces. The forms are every value reported by
    `getAllInflections(base_word)`, pooled across all tags.
    """
    inflected_forms = {
        form
        for forms in getAllInflections(base_word).values()
        for form in forms
    }
    return not inflected_forms.isdisjoint(st.split(' '))

def put_mask(sentense, base_word):
    """
    Put "[MASK]" in place of the first token that is an inflected form of `base_word`.

    The sentence is split on single spaces and re-joined with single spaces.
    Returns a tuple: (sentence with at most one token masked, set of all
    inflected forms of `base_word`).
    """
    inflected_forms = {
        form
        for forms in getAllInflections(base_word).values()
        for form in forms
    }

    masked_tokens = []
    already_masked = False  # only the first matching token is masked
    for token in sentense.split(' '):
        if not already_masked and token in inflected_forms:
            masked_tokens.append("[MASK]")
            already_masked = True
        else:
            masked_tokens.append(token)

    return " ".join(masked_tokens), inflected_forms

def get_candidates(sentense, base_word):
    """
    Ask the fill-mask model for replacements of `base_word` in `sentense`.

    Returns a dict mapping predicted token -> score. Predictions that are
    themselves an inflected form of `base_word` are left out.
    """
    masked_sentence, excluded_forms = put_mask(sentense, base_word)
    predictions = unmasker(masked_sentence)
    return {
        prediction['token_str']: prediction['score']
        for prediction in predictions
        if prediction['token_str'] not in excluded_forms
    }

# Check whether a word is an AKL word
def check_akl(word):
    """Return True when `word` is an element of the merged list `AKL_merge`."""
    return word in AKL_merge

# Similarity score between two words (uses wup_similarity)
def get_similarity_score(base_word, syn_word):
    """
    Return the mean Wu-Palmer similarity between two words.

    Every WordNet synset of `base_word` is compared with every synset of
    `syn_word`. Pairs for which `wup_similarity` yields None contribute 0 but
    still count in the denominator, so the mean is taken over all n*m pairs.

    Returns 0.0 when either word has no synsets.
    """
    base_sets = wn.synsets(base_word)
    syn_sets = wn.synsets(syn_word)
    pair_count = len(base_sets) * len(syn_sets)
    if pair_count == 0:
        # Nothing to compare; previously reached through a swallowed ZeroDivisionError.
        return 0.0

    total = 0.0
    for base_set in base_sets:
        for syn_set in syn_sets:
            similarity = base_set.wup_similarity(syn_set)
            # Explicit None check instead of a bare `except` around the addition.
            if similarity is not None:
                total += similarity
    return total / pair_count

# Collapse the fine-grained POS tags into four coarse classes.
# NOTE: `verb`, `adj`, `adv` and `noun` are also the names the AKL-loading cell
# uses for its word lists, so whichever cell ran last decides what they hold.
# They are still assigned here for backward compatibility, but get_POS reads the
# private snapshot below and is therefore unaffected by later rebinding.
verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
adj = ['JJ', 'JJR', 'JJS']
adv = ['RB', 'RBR', 'RBS']
noun = ['NN', 'NNS', 'NNP', 'NNPS']

# Coarse class -> fine-grained tags, captured when this cell runs.
_COARSE_POS_TAGS = {
    "verb": tuple(verb),
    "adj": tuple(adj),
    "adv": tuple(adv),
    "noun": tuple(noun),
}


# Get the part of speech of a word inside a sentence
def get_POS(sentense, target_word):
    """
    Return the coarse part of speech of `target_word` inside `sentense`.

    The sentence is tokenised and tagged with NLTK. The first tagged token that
    equals any inflected form of `target_word` supplies the fine-grained tag.
    Tag reference: https://www.guru99.com/pos-tagging-chunking-nltk.html

    Returns a list: one of ["verb"], ["adj"], ["adv"], ["noun"], or [] when no
    token matches or the tag belongs to none of the four classes.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentense))

    # every inflected form of the target word
    inflected_forms = {
        form
        for forms in getAllInflections(target_word).values()
        for form in forms
    }

    # Tag of the first matching token; "" when nothing matches.
    mini_pos = next(
        (tag for token, tag in tagged_tokens if token in inflected_forms),
        "",
    )

    return [
        coarse
        for coarse, fine_tags in _COARSE_POS_TAGS.items()
        if mini_pos in fine_tags
    ]

def same(cand_pos ,base_pos):
    """Return True when the two POS lists share at least one equal element."""
    return any(
        cand_tag == base_tag
        for cand_tag in cand_pos
        for base_tag in base_pos
    )
    
def calculate_weight_ver2(cand, sentense, base_word, akl_boost=1.25, pos_boost=1.5):
    """
    Re-rank the fill-mask candidates for `base_word` in `sentense`.

    Each candidate's model score is adjusted in three steps, in this order:
      1. multiplied by `akl_boost` when `check_akl` accepts the candidate;
      2. multiplied by `pos_boost` when the candidate, put in place of the
         target token, gets the same coarse POS as `base_word`;
      3. increased by `get_similarity_score(base_word, candidate)`.

    input 1: cand       - dict mapping candidate word -> model score
    input 2: sentense   - the sentence that contains the base word
    input 3: base_word  - the word whose replacements are being ranked
    input 4: akl_boost  - multiplier for AKL words (default 1.25)
    input 5: pos_boost  - multiplier for POS agreement (default 1.5)

    output: DataFrame with columns 'Words' and 'Score', sorted by 'Score'
            in descending order with a fresh 0..n-1 index.
    """
    words = list(cand.keys())
    # Scores are adjusted in a plain list and written to the frame once at the
    # end. The former chained assignment `cand_df['Score'][i] = ...` may write to
    # a temporary copy, so the update can be lost.
    scores = list(cand.values())

    # AKL part
    scores = [
        score * akl_boost if check_akl(word) else score
        for word, score in zip(words, scores)
    ]

    # POS-tagging part
    base_pos = get_POS(sentense, base_word)  # POS of the base word
    inflected_forms = {
        form
        for forms in getAllInflections(base_word).values()
        for form in forms
    }
    tokens = sentense.split()
    # Position of the first token that is a form of the base word. Swapping a
    # whole token keeps e.g. "star" inside "start" untouched, which the former
    # substring `str.replace` did not.
    target_idx = next(
        (idx for idx, token in enumerate(tokens) if token in inflected_forms),
        None,
    )
    # With no form of the base word in the sentence there is nothing to swap, so
    # the POS bonus is skipped (this used to fail on an unbound variable).
    if target_idx is not None:
        for row, word in enumerate(words):
            swapped = tokens[:target_idx] + [word] + tokens[target_idx + 1:]
            cand_pos = get_POS(" ".join(swapped), word)  # POS of the candidate
            if same(cand_pos, base_pos):
                scores[row] *= pos_boost

    # Wordnet Similarity
    scores = [
        score + get_similarity_score(base_word, word)
        for word, score in zip(words, scores)
    ]

    cand_df = pd.DataFrame(list(zip(words, scores)), columns=['Words', 'Score'])
    return cand_df.sort_values(by=['Score'], ascending=False).reset_index(drop=True)

# Find the closest pair of senses between two words
def find_sense_of_two_words(base_word, syn_word):
    """
    Find the sense of `base_word` that is closest to any sense of `syn_word`.

    All synset pairs are scored with `wn.wup_similarity`; pairs scoring None
    are skipped. When several pairs share the top score, the last one visited
    wins, as with the earlier score-keyed dict.

    Returns (similarity, sense, definition): the top score, the `base_word`
    synset of the winning pair, and that synset's definition.

    Raises ValueError when no pair yields a similarity, e.g. when either word
    has no synsets.
    """
    # A POS filter can be added here, e.g. wn.synsets(base_word, pos=wn.VERB)
    # with one of [VERB, NOUN, ADJ, ADV].
    base_synsets = wn.synsets(base_word)
    syn_synsets = wn.synsets(syn_word)

    best_score = None
    best_sense = None
    for base_synset in base_synsets:
        for syn_synset in syn_synsets:
            # Computed once per pair (was evaluated up to three times).
            score = wn.wup_similarity(base_synset, syn_synset)
            if score is None:
                continue
            # `>=` so that a later tie replaces an earlier one.
            if best_score is None or score >= best_score:
                best_score = score
                best_sense = base_synset

    if best_sense is None:
        raise ValueError(
            f"no comparable WordNet senses for {base_word!r} and {syn_word!r}"
        )

    # similarity of the two words, closest sense of base_word, its definition
    return best_score, best_sense, best_sense.definition()


def summary(sentence, base_word):
    """
    Predict the sense of `base_word` in `sentence` and print a short report.

    Input: sentence, target word
    Output (printed): the target word, the example sentence, the definition of
    the sense chosen for this sentence, and that sense's other lemma names.

    Returns the chosen WordNet synset of `base_word`.

    Raises ValueError when no ranked candidate has a WordNet synset, or when
    `find_sense_of_two_words` finds no comparable pair of senses.
    """
    cand = get_candidates(sentence, base_word)  # possible replacements
    result_df = calculate_weight_ver2(cand, sentence, base_word)  # re-weighted

    # Best-ranked candidate that WordNet knows.
    syn_final_word = next(
        (word for word in result_df['Words'] if len(wn.synsets(word)) != 0),
        None,
    )
    if syn_final_word is None:
        # Previously surfaced as an unbound-variable error further down.
        raise ValueError(
            f"no candidate for {base_word!r} has a WordNet synset"
        )

    # closest sense between the base word and the chosen candidate
    similarity, sense, definition = find_sense_of_two_words(base_word, syn_final_word)

    # Lemma names of the chosen sense, minus the base word itself.
    synonyms = [name for name in sense.lemma_names() if name != base_word]
    result = '、'.join(synonyms)

    # Print the report
    print(f"""
    Target Word：{base_word}

    例句：{sentence}

    --------------------

    在此例句中 "{base_word}" 的字義是：{definition} 
    
    以下列出同義字：{result}
    """)
    return sense
    

# Usage example

In [9]:
# Target word used by the worked example in the following cells.
base_word = "star"

In [10]:
# Keep the preprocessed sentences that contain some inflected form of base_word.
filter_corpus = [sent for sent in corpus if check_word_exist(sent, base_word)]
print("length of our base word sentense: ", len(filter_corpus))

length of our base word sentense:  253


In [11]:
# Use the second matching sentence as the example context.
sentense = filter_corpus[1]

# summary() prints its report and returns the WordNet synset it selected.
answer = summary(sentense, base_word)
print("sense:", answer)


    Target Word：star

    例句：emails being popular and vulnerable are a star target for virus writers

    --------------------

    在此例句中 "star" 的字義是：be the star in a performance 
    
    以下列出同義字：
    
sense: Synset('star.v.02')


# Semcor

http://man.hubwiz.com/docset/NLTK.docset/Contents/Resources/Documents/api/nltk.corpus.reader.html#module-nltk.corpus.reader.semcor

https://www.nltk.org/api/nltk.corpus.reader.semcor.html

https://www.nltk.org/_modules/nltk/corpus/reader/semcor.html

https://www.nltk.org/howto/corpus.html#chunked-corpora

In [17]:
# Download the SemCor corpus through NLTK's downloader.
nltk.download('semcor')

[nltk_data] Downloading package semcor to
[nltk_data]     C:\Users\WangHongWen\AppData\Roaming\nltk_data...
[nltk_data]   Package semcor is already up-to-date!


True

In [12]:
# Show the first SemCor sentence as one space-joined string.
" ".join(semcor.sents()[0]) # sentences

"The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place ."

In [18]:
# Count the sentences in SemCor and display the number.
num_sents = len(semcor.sents()) # total number of sentences
num_sents

37176

In [99]:
# Collect the first 100 SemCor sentences as space-joined strings.
# Original note: this is bound to run for a long time (estimated two hours),
# presumably when run over the whole corpus; TODO confirm.
from tqdm import tqdm

# Build the sentence view once. Calling semcor.sents() inside the loop created a
# fresh view for every single sentence.
semcor_sentence_view = semcor.sents()
semcor_sence = []
for sentence_idx in tqdm(range(100)):
    semcor_sence.append(" ".join(semcor_sentence_view[sentence_idx]))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00,  4.79it/s]


In [None]:
# Write the collected sentences to disk, one per line. The `with` block closes
# the file even if a write fails, and the explicit UTF-8 encoding matches every
# other file opened in this notebook.
with open("sencor_sentenses.txt", "w", encoding="utf-8") as sencor_sents:
    for element in semcor_sence:
        sencor_sents.write(element + "\n")

In [None]:
# read text file -> corpus

In [95]:
def find_semcor_idx(target_word):
    """
    Return the sentences of the global `corpus` that contain `target_word`.

    Each sentence is passed through `preprocess` first, and the match is a
    plain substring test. Despite the name, the result is a list of sentence
    strings, not a list of indexes.
    """
    cleaned_sentences = (preprocess(raw_sentence) for raw_sentence in corpus)
    return [cleaned for cleaned in cleaned_sentences if target_word in cleaned]

test

In [97]:
# Show the first SemCor sentence again, as one space-joined string.
" ".join(semcor.sents()[0]) # sentences

"The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place ."

In [96]:
def get_lemma(sentence_idx, target_word):
    """
    Return the synset SemCor annotates for `target_word` in one sentence.

    sentence_idx: index into `semcor.tagged_sents(tag='sem')`. It used to be
                  ignored, with sentence 0 always inspected.
    target_word:  token to look for, compared exactly against the leaves of
                  each tagged chunk.

    Returns the synset of the first tagged chunk containing `target_word`, or
    None when no such chunk exists.
    """
    sent = semcor.tagged_sents(tag='sem')[sentence_idx]
    for chunk in sent:
        # Exact type test, as in the original: plain lists are skipped, anything
        # else is expected to offer leaves() and label().
        if type(chunk) is list:
            continue
        if target_word not in chunk.leaves():
            continue
        sense = chunk.label()
        # NOTE(review): the label is presumably a plain string when SemCor's
        # sense could not be resolved; such chunks are skipped rather than
        # raising AttributeError. Confirm against the NLTK reader.
        synset_of = getattr(sense, "synset", None)
        if synset_of is None:
            continue
        return synset_of()
    return None

In [91]:
# Look up the SemCor-annotated synset of "investigation" in sentence 0.
get_lemma(0, "investigation")

Synset('probe.n.01')

In [94]:
# Rebuild the first SemCor sentence and run it through preprocess().
sentense = " ".join(semcor.sents()[0])
sentense = preprocess(sentense)

# Predict the sense of "investigation" in that sentence; summary() prints its
# report and returns the selected synset.
answer = summary(sentense, "investigation")
print("sense:", answer)


    Target Word：investigation

    例句：the fulton county grand jury said friday an investigation of atlanta 's recent primary election produced `` no evidence '' that any irregularities took place 

    --------------------

    在此例句中 "investigation" 的字義是：the work of inquiring into something thoroughly and systematically 
    
    以下列出同義字：investigating
    
sense: Synset('investigation.n.02')


# Evaluate

1. 我們的方法（`summary`）判斷出的字義
2. SemCor 語料庫中已標註的正確字義（`get_lemma`）

-> 比較 1 與 2 兩者是否一致！

In [None]:
# Ambiguous target words to evaluate. The stray leading spaces on "taste" and
# "interest" are removed so the entries can match real tokens.
candidates_words = ["star", "mole", "galley", "cone", "bass", "bow", "taste", "interest", "issue", "duty", "sentence", "slug"]