# Test Match Company
In this notebook, I try to deal with the company-name matching problem.
E.g., for 上海速必达有限公司 the embedding of `速必达` cannot be found in the dictionary, so the generated vector may lose key information.

Solution: we can leave the out-of-vocabulary keyword `速必达` out of the embedding. Instead, we detect the structure of the company name, so that the middle part (the keyword) can be matched directly as a string.

In [40]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import jieba

In [41]:
# Load the pre-trained word vectors from disk; every helper below reads this
# notebook-level `model`.
model = KeyedVectors.load('./test_50.bin')

## Case1
entity = `上海爱笔有限公司`
In this case, every word in this entity can be found in the vocabulary.
Given the same term, the similarity of the two embedding vectors is approximately 1.

In [125]:
# Case 1 inputs: the terms that are compared against `entity` in the cells below.
term1 = '上海爱笔有限公司'
term2 = '上海'
term3 = '爱笔'
term4 = '有限公司'
entity = '爱笔（北京）智能科技有限公司'

In [42]:
def generateCompanyEmbeddings(name):
    """Embed a company name as the mean of its in-vocabulary word vectors.

    The name is segmented with ``jieba.cut(name, cut_all=False)``. Every
    segment that has a vector in the notebook-level ``model`` is added to the
    running sum; segments without a vector are skipped silently.

    Parameters
    ----------
    name : str
        Company name (or any other text) to embed.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``. It is all zeros when no
        segment of ``name`` is in the vocabulary.
    """
    v = np.zeros(model.vector_size)
    n_found = 0
    for word in jieba.cut(name, cut_all=False):
        # `word in model` works on both gensim 3.x and 4.x KeyedVectors,
        # whereas the `.vocab` attribute no longer exists in gensim 4.
        if word in model:
            v += model[word]
            n_found += 1

    # Average over the words that were actually summed. The previous code
    # divided by len(v), which is the embedding dimension, not a word count.
    if n_found:
        v /= n_found
    return v

In [47]:
def generateTermEmbeddings(term):
    """Embed a term as the mean of its in-vocabulary word vectors, verbosely.

    Same computation as ``generateCompanyEmbeddings``, but every segment
    produced by ``jieba.cut(term, cut_all=False)`` is printed, and a
    ``"<word> not found"`` line is printed for each segment that has no
    vector in the notebook-level ``model``.

    Parameters
    ----------
    term : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``. It is all zeros when no
        segment of ``term`` is in the vocabulary.
    """
    v = np.zeros(model.vector_size)
    n_found = 0
    for word in jieba.cut(term, cut_all=False):
        print(word)
        # `word in model` works on both gensim 3.x and 4.x KeyedVectors,
        # whereas the `.vocab` attribute no longer exists in gensim 4.
        if word in model:
            v += model[word]
            n_found += 1
        else:
            print(f'{word} not found')

    # Average over the words that were actually summed. The previous code
    # divided by len(v), which is the embedding dimension, not a word count.
    if n_found:
        v /= n_found
    return v

In [118]:
def calculate_cosine_similarity(a, b):
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    The raw cosine (in [-1, 1]) is mapped with ``0.5 + 0.5 * cos`` so that
    opposite vectors score 0, orthogonal vectors 0.5 and parallel vectors 1.

    Parameters
    ----------
    a, b : array-like
        Vectors with the same number of elements. Inputs are flattened, so
        1-D arrays and single-row/column 2-D arrays are both accepted.

    Returns
    -------
    float
        Rescaled similarity. ``0.0`` is returned when either vector has zero
        norm (for example an embedding built only from out-of-vocabulary
        words), because the cosine is undefined in that case.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of elements.
    """
    # Plain ndarrays instead of np.mat: the matrix alias was removed in
    # NumPy 2.0, and float() on a 1x1 matrix is deprecated.
    vector_a = np.asarray(a).ravel()
    vector_b = np.asarray(b).ravel()

    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if denom == 0:
        return 0.0

    # Clip to absorb floating-point round-off that can push the ratio
    # marginally outside [-1, 1]; np.clip still propagates NaN.
    cos = np.clip(np.dot(vector_a, vector_b) / denom, -1.0, 1.0)
    return float(0.5 + 0.5 * cos)

In [126]:
# Embed the Case 1 entity and check the dimensionality of the result.
entity_vec = generateCompanyEmbeddings(entity)
print(entity_vec.shape)

(200,)


In [45]:
# Look the single-word terms up directly in the model (no segmentation);
# a direct lookup raises KeyError for a word that is not in the vocabulary.
term2_vec = model[term2] # 上海
term4_vec = model[term4] # 有限公司
print(term2_vec.shape)

(200,)


In [135]:
# Direct lookup of '艾美特'; succeeds only because the word is in the vocabulary.
# NOTE(review): `xx` is not used again in the cells shown here.
xx = model['艾美特']

In [136]:
# Display the vocabulary entries most similar to '艾美特', with their scores.
model.most_similar('艾美特')

[('伊莱克斯', 0.7312238812446594),
 ('格兰仕', 0.717694103717804),
 ('冷风扇', 0.6984611749649048),
 ('法罗力', 0.6955032348632812),
 ('志高空调', 0.6942696571350098),
 ('ao史密斯', 0.6932370662689209),
 ('落地扇', 0.6921243071556091),
 ('空调质量', 0.6915013194084167),
 ('富士宝', 0.6913032531738281),
 ('荣事达', 0.6868325471878052)]

In [127]:
# Embed the multi-word term and the keyword through segmentation + summation.
term1_vec = generateCompanyEmbeddings(term1)
term3_vec = generateCompanyEmbeddings(term3)

In [128]:
# Similarity between the entity and term1 ('上海爱笔有限公司').
sim1 = calculate_cosine_similarity(entity_vec, term1_vec)
print(sim1)

0.9225701537699091


In [133]:
# Similarity between the entity and term3 ('爱笔').
sim3 = calculate_cosine_similarity(entity_vec, term3_vec)
print(sim3)

0.7808995749093834


In [46]:
# Similarity between the entity and term4 ('有限公司'), using the vector that
# was looked up directly from the model rather than built by segmentation.
sim4 = calculate_cosine_similarity(entity_vec, term4_vec)
print(sim4)

0.8836462869842496


## Case2
entity: `多玛凯拔科技有限公司`. The company name, `多玛凯拔`, which is the most descriptive word, cannot be found in the dictionary.

In [94]:
# Case 2 inputs: the entity, its brand keyword, and a generic word it contains.
entity2 = '多玛凯拔科技有限公司'
term5 = '多玛凯拔'
term6 = '科技'

In [95]:
# Embed the Case 2 entity; segments without a vector are skipped by the helper.
entity2_vec = generateCompanyEmbeddings(entity2)

In [96]:
# term5 ('多玛凯拔') is reported as out-of-vocabulary by extractKeyword in a
# later cell, so term5_vec is presumably all zeros and the cosine denominator 0.
# NOTE(review): the `nan` shown as this cell's output looks like it predates
# the `denom == 0` guard in calculate_cosine_similarity (which returns 0.0);
# re-run on a fresh kernel to confirm.
term5_vec = generateCompanyEmbeddings(term5)
sim5 = calculate_cosine_similarity(entity2_vec, term5_vec)
print(sim5)

nan


  


In [97]:
# Embed term6 ('科技') with the verbose helper, which prints each segment,
# then compare it with the Case 2 entity.
term6_vec = generateTermEmbeddings(term6)
sim6 = calculate_cosine_similarity(entity2_vec, term6_vec)
print(sim6)

科技
0.9284849222005065


In [98]:
# Original note: if sim5 > 0.9, exact match.
# String-level check: str.find returns the start index of term5 inside
# entity2, or -1 when term5 does not occur in it.
idx = entity2.find(term5)
print(idx)

0


In [99]:
def extractKeyword(term):
    '''
    Extract the keywords of a term, i.e. the segments whose embedding cannot
    be found in the vocabulary of the notebook-level ``model``.

    The term is segmented with ``jieba.cut(term, cut_all=False)`` and a
    ``keyword found: <word>`` line is printed for every keyword.

    Parameters
    ----------
    term : str
        Text to inspect.

    Returns
    -------
    list of str
        Out-of-vocabulary segments in the order jieba produced them; empty
        when every segment has an embedding.
    '''
    keyword = []
    for word in jieba.cut(term, cut_all=False):
        # `word in model` works on both gensim 3.x and 4.x KeyedVectors,
        # whereas the `.vocab` attribute no longer exists in gensim 4.
        if word not in model:
            print(f'keyword found: {word}')
            keyword.append(word)
    return keyword

In [100]:
def exact_match(entity, term):
    """Tell whether any out-of-vocabulary keyword of ``term`` occurs in ``entity``.

    Keywords come from ``extractKeyword(term)``. Every keyword is checked, and
    a ``match <keyword>`` line is printed for each one that is a substring of
    ``entity``.

    Returns
    -------
    bool
        True if at least one keyword is contained in ``entity``; False
        otherwise, including when ``term`` has no keywords at all.
    """
    found_any = False
    for candidate in extractKeyword(term):
        if candidate in entity:
            print(f'match {candidate}')
            found_any = True
    return found_any

In [101]:
# The brand keyword has no embedding, so fall back to string matching.
is_match = exact_match(entity2, term5)
print(is_match)

keyword found: 多玛凯拔
match 多玛凯拔
True


In [102]:
# term6 ('科技') yields no out-of-vocabulary keyword, so exact_match has
# nothing to look for and returns False.
is_match2 = exact_match(entity2, term6)
print(is_match2)

False


## Case3
entity: `广州酷刻科技有限公司`.

In [106]:
# Case 3 inputs: the entity plus four terms taken from different parts of it.
entity3 = '广州酷刻科技有限公司'
term7 = '广州'
term8 = '酷刻'
term9 = '有限公司'
term10 = '广州酷刻科技'

In [107]:
# Embed the Case 3 entity and each candidate term with the same helper.
entity3_vec = generateCompanyEmbeddings(entity3)
term7_vec = generateCompanyEmbeddings(term7)
term8_vec = generateCompanyEmbeddings(term8)
term9_vec = generateCompanyEmbeddings(term9)
term10_vec = generateCompanyEmbeddings(term10)

In [110]:
# term7 ('广州'): report both the embedding similarity and the keyword match.
sim7 = calculate_cosine_similarity(entity3_vec, term7_vec)
print(sim7)
is_match7 = exact_match(entity3, term7)
print(is_match7)

0.8798465039357534
False


In [119]:
# term8 ('酷刻') is reported as out-of-vocabulary in this cell's output, so
# the similarity falls back to 0.0 while the string-level match still finds it.
sim8 = calculate_cosine_similarity(entity3_vec, term8_vec)
print(sim8)
is_match8 = exact_match(entity3, term8)
print(is_match8)

0.0
keyword found: 酷刻
match 酷刻
True


In [112]:
# term9 ('有限公司'): report both the embedding similarity and the keyword match.
sim9 = calculate_cosine_similarity(entity3_vec, term9_vec)
print(sim9)
is_match9 = exact_match(entity3, term9)
print(is_match9)

0.890014971996258
False


In [113]:
# term10 ('广州酷刻科技') mixes in-vocabulary words with the keyword '酷刻',
# so both the embedding similarity and the string match are reported.
sim10 = calculate_cosine_similarity(entity3_vec, term10_vec)
print(sim10)
is_match10 = exact_match(entity3, term10)
print(is_match10)

0.9642831390531543
keyword found: 酷刻
match 酷刻
True


## Case4
entity: `爱国者电子科技有限公司`

In [120]:
# Case 4 inputs and their embeddings.
entity4 = '爱国者电子科技有限公司'
term11 = '爱国者'
entity4_vec = generateCompanyEmbeddings(entity4)
term11_vec = generateCompanyEmbeddings(term11)

In [122]:
# Similarity between term11 ('爱国者') and the Case 4 entity.
sim11 = calculate_cosine_similarity(term11_vec, entity4_vec)
print(sim11)

0.8100053842362365


## Case5
entity: `爱乐福（深圳）科技有限公司`

In [137]:
# Case 5 inputs and their embeddings.
entity5 = '爱乐福（深圳）科技有限公司'
term12 = '爱乐福'
entity5_vec = generateCompanyEmbeddings(entity5)
term12_vec = generateCompanyEmbeddings(term12)

In [138]:
# Similarity between term12 ('爱乐福') and the Case 5 entity.
# NOTE(review): the score is non-zero although '爱乐福' as a whole is not in the
# vocabulary (see the direct lookup in the next cell); presumably jieba splits
# it into smaller in-vocabulary pieces — verify with generateTermEmbeddings.
sim12 = calculate_cosine_similarity(term12_vec, entity5_vec)
print(sim12)

0.8778953803630984


In [139]:
# Direct lookup of the un-segmented term, to show that '爱乐福' as a whole has
# no entry in the vocabulary. The KeyError is caught and displayed so the
# notebook still completes under Restart & Run All.
try:
    ff = model[term12]
except KeyError as err:
    ff = None
    print(f'KeyError: {err}')

KeyError: "word '爱乐福' not in vocabulary"