# Demo1
version2, based on version1
This is a demo. What I want to achieve in this demo?
+ improve company link accuracy

Actually there are 2 problems concerning the accuracy
+ Some company's name cannot be found in dictionary, those the key information is lost. 广州xxx公司
+ Some irrelerant terms are linked to a company name, like the location. 深圳->深圳xxx公司

In [137]:
import jieba
import re
import chardet
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import math
import datetime

## Load File

In [128]:
def loadCompany():
    path1 = 'data/company_list_ch.csv'
    company_ch_df = pd.read_csv(path1, header=None, delimiter=",", skiprows=2, names=['rank', 'name', 'Location', 'Income'])
    company_ch_df = pd.DataFrame(company_ch_df, columns=['name'])
    path2 = 'data/member-data.csv'
    company_df = pd.read_csv(path2, header=None, delimiter=",", skiprows=2, names=['name', 'No.', 'Resume', 'Position'])
    company_df = pd.DataFrame(company_df, columns=['name'])
    company_ch_df = pd.concat([company_ch_df, company_df], axis=0, ignore_index=True)
    
    print(f'before dedup, size: {company_ch_df.shape[0]}')
    company_ch_df = company_ch_df.drop_duplicates(subset=['name'], keep='first')
    print(f'after dedup, size: {company_ch_df.shape[0]}')
    print(type(company_ch_df))
    print(company_ch_df.tail())
    return company_ch_df

In [4]:
def loadMember():
    path = 'data/member-data.csv'
    member_df = pd.read_csv(path, header=None, delimiter=",", skiprows=1, names=['Company', 'No.', 'Resume', 'Position'])
    member_df = pd.DataFrame(member_df, columns=['Resume'])
    print(member_df.head())
    return member_df

# Preprocess Text
+ remove punctuations
+ remove stopwords
+ to lower letter for regconition later
+ generate N-grams to link entity

In [139]:
def removeStopWords(seglist):
    stopwords = {}
    fstop = open('data/stopwords_cn.txt', 'r', encoding='utf-8', errors='ignore')
    for w in fstop:
        stopwords[w.strip()] = w.strip()

    fstop.close()
    stopwords[' '] = ' '
    
    segListSanitized = []

    for word in seglist:
        if word == 'omnigo':
            word = '酷刻'
        if word == 'Aibee':
            word = '爱笔'
        if word == 'ilife':
            word = '爱乐福'
        if word == 'oracleen':
            word = '爱芽'
        if word not in stopwords:
            segListSanitized.append(word)
    return segListSanitized

In [44]:
def preprocess(text):
    # remove punctuations
    text = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", text)
    text = text.lower()
    # seperate words
    words = jieba.cut(text, cut_all=False)
    seglist = list(words)
    # remove stopwords
    segListSanitized = removeStopWords(seglist)
    print(f'Before sanitize, len: {len(seglist)}. After sanitize, len: {len(segListSanitized)}')

    return segListSanitized

## N-gram Algorithm

In [45]:
def getNgrams(wordList, n):
    '''
    This function only generete N-Grams
    '''
    output = set()
    for i in range (len(wordList) - n + 1):
        n_gram_temp = "".join(wordList[i:i+n])
        output.add(n_gram_temp)
    return output

In [46]:
def generateNgrams(wordList, n):
    '''
    This function genereates [1, N]-Grams
    '''
    result = set()
    for i in range(n):
        temp = getNgrams(wordList, i+1)
        result = result | temp
    
    return result

## Word Embedding

In [47]:
model = KeyedVectors.load('./test_50.bin')

In [96]:
def calculate_cosine_similarity(a, b):
    vector_a = np.mat(a)
    vector_b = np.mat(b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    
    if denom == 0:
        return 0.0
    
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim

In [49]:
def generateEmbeddings(name):
    words = jieba.cut(name, cut_all=False)
    word_list = list(words)
    v = np.zeros((200))
    for word in word_list:
        if word in model.vocab:
            v += model[word]
    
    v /= len(v)
    return v

In [145]:
def calculate_IDF(df):
    company_num = 0
    m = dict()
    for index, row in df.iterrows():
        name = row['name']
        name = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", name)
        name = name.lower()
        if isinstance(name, float) or name == " ":
            continue
        company_num += 1
        words = jieba.cut(name, cut_all=False)
        word_list = list(words)
        for word in word_list:
            if word in m.keys():
                m[word] +=1
            else:
                m[word] = 1
    
    idf = dict()
    
    for (k, v) in m.items():
        idf[k] = math.log(((1+company_num) / v), 10)
    
    # Normalize min-max
    v = idf.values()
    max_value = max(v)
    min_value = min(v)
    denom = max_value - min_value
    for (k, v) in idf.items():
        idf[k] = (idf[k] - min_value) / denom

    return idf

In [146]:
def generateEmbeddingsWithIDF(name, idf):
    words = jieba.cut(name, cut_all=False)
    word_list = list(words)
    v = np.zeros((200))
    for word in word_list:
        if word in model.vocab:
            v += model[word] * idf[word]
    
    v /= len(v)
    return v

In [50]:
def preprocess_entity_list(df, model):
    '''
    df: dafaframe
    model: word embedding model
    '''
    
    df['embeddings'] = ''
    for index, row in df.iterrows():
        # df.loc[index, 'embeddings'] = z
        name = row['name']
        if isinstance(name, float):
            continue
        name = name.lower()
        if name in model.vocab:
            vec = model[name]
        else:
            vec = generateEmbeddings(name)
        df.set_value(index, 'embeddings', vec)

    # print(df.head())
    return df

In [148]:
def preprocess_entity_list_withIDF(df, model, idf):
    '''
    df: dafaframe
    model: word embedding model
    idf: IDF for each word
    '''
    
    df['embeddings_idf'] = ''
    for index, row in df.iterrows():
        name = row['name']
        name = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", name)
        name = name.lower()
        if isinstance(name, float):
            continue
        if name in model.vocab:
            vec = model[name]
        else:
            vec = generateEmbeddingsWithIDF(name, idf)
        df.set_value(index, 'embeddings_idf', vec)

    # print(df.head())
    return df

## Entity Company

In [192]:
def linkCompany(output, model, company_df, company_threshold1, company_threshold2):
    for term in output:
        if len(term) <= 1:
            continue
        if term in model.vocab:
            term_vec = model[term]
            
            # Link Company
            company_candidate = dict()
            for index, row in company_df.iterrows():
                name = row['name']
                if isinstance(name, float):
                    continue
                name_vec_idf = row['embeddings_idf']
                sim = calculate_cosine_similarity(term_vec, name_vec_idf)
                if (sim > company_threshold1):
                    company_candidate[name] = sim
            if len(company_candidate) != 0:
                company_candidate = sorted(company_candidate.items(), key=lambda item:item[1], reverse=True)
                print(f'company entity found: {term}->{company_candidate[0][0]}, sim = {company_candidate[0][1]}')
        else:
            term_vec = generateEmbeddings(term)
            for index, row  in company_df.iterrows():
                name = row['name']
                if isinstance(name, float):
                    continue
                name_vec = row['embeddings']
                sim = calculate_cosine_similarity(term_vec, name_vec)
                #if term == "爱乐福" and name == "爱乐福（深圳）科技有限公司":
                #    print(f'term: {term}, sim: {sim}')
                if (sim == 0.0 or sim > company_threshold2):
                    is_match = exact_match(name, term)
                    if is_match:
                        print(f'company entity found by exact match: {term}->{name}')

In [193]:
def exact_match(entity, term):
    # print(f'exact_match, entity: {entity}, term: {term}')
    keyword = extractKeyword(term)
    is_match = False
    for key in keyword:
        idx = entity.find(key)
        if idx != -1:
            is_match = True
            print(f'match {key}')
    return is_match

In [194]:
def extractKeyword(term):
    '''
    Extract keyword in a term, whose embedding can not be found in vocabulary
    Return a list
    '''
    words = jieba.cut(term, cut_all=False)
    word_list = list(words)
    word_list = getNgrams(word_list, 2)
    v = np.zeros((200))
    keyword = []
    for word in word_list:
        if word not in model.vocab:
            # print(f'keyword found: {word}')
            keyword.append(word)
    return keyword

## Main Function

In [140]:
member_df = loadMember()
company_df = loadCompany()

                                              Resume
0  __团队成员#1__先生是公司创始人,也是中国最有影响力的商界领袖之一。1982年,__团队...
1  __团队成员#2__先生,现任TCL集团股份有限公司执行董事、总裁(COO)。1963年4月...
2  __团队成员#3__女士:1972年7月出生,中山大学法学博士,高级经济师。1993年6月至...
3  __团队成员#4__先生,1965年7月出生,东方电气集团党组副书记、副总经理,兼任东方电气...
4  __团队成员#5__女士,现任TCL多媒体集团有限公司非执行独立董事、A8新媒体集团非执行独...
before dedup, size: 11716
after dedup, size: 3571
<class 'pandas.core.frame.DataFrame'>
                 name
11669  淄博大亚金属科技股份有限公司
11681        紫光股份有限公司
11712      紫光西部数据有限公司
11713   紫罗兰家纺科技股份有限公司
11714    卒子科技（深圳）有限公司


In [149]:
company_idf = calculate_IDF(company_df)
company_df = preprocess_entity_list(company_df, model)
company_df = preprocess_entity_list_withIDF(company_df, model, company_idf)
print(company_df.head())



              name                                         embeddings  \
0   中国石油化工集团有限公司\n  [3.555182367563248e-05, -0.0002575131878256797...   
1  中国石油天然气集团有限公司\n  [0.00024397050961852073, -0.000617146184667945...   
2       国家电网有限公司\n  [0.00022917279973626138, -0.000968194492161274...   
3     中国建筑股份有限公司\n  [0.00037391087505966425, -0.001663828268647194...   
4   中国工商银行股份有限公司\n  [4.123697523027658e-05, -0.0013217980042099953...   

                                      embeddings_idf  
0  [0.00021867530420422555, 0.0003110994771122932...  
1  [0.0004270939901471138, -4.853351972997189e-05...  
2  [0.00041229628026485444, -0.000399581827223300...  
3  [0.0003339895379031077, -0.0004802231770008802...  
4  [0.00019588854687754064, -0.000362079134210944...  


In [152]:
text = '__团队成员#1__先生是公司创始人,也是中国最有影响力的商界领袖之一。1982年,__团队成员#1__先生于华南理工大学毕业,进入TCL的前身-TTK家庭电器有限公司。1985年,他担任新成立的TCL通讯设备公司总经理,创立了TCL品牌。2003年,__团队成员#1__担任TCL集团股份有限公司董事长兼CEO,随后TCL集团整体上市。在他的领导下,2004年TCL一举收购了法国汤姆逊全球彩电业务与阿尔卡特全球手机业务。目前TCL集团已经成为拥有6万名员工,业务遍及全球80多个国家和地区。2013年,TCL集团营业总收入超过855亿元,液晶电视全球销量1766万台,实际产量全球第三,品牌销售全球第三;TCL手机全球销量5520万台,行业排名全球第五。2012年__团队成员#1__被新华网评为“最具社会责任感企业家”;2011年荣获《中国企业家》“最具影响力的25位企业领袖”终身成就奖;2009年被评为“CCTV中国经济年度人物十年商业领袖”;2008年获改革开放30年经济人物称号;2004年被评为Fortune杂志“亚洲年度经济人物”、TIME杂志和CNN全球最具影响力的25名商界人士,同年法国总统希拉克向__团队成员#1__先生颁发了法国国家荣誉勋章。__团队成员#1__是中共第十六大代表,第十届、第十一届、第十二届全国人大代表。__团队成员#1__担任的社会职务包括:中国电子视像行业协会会长;中国国际商会副会长;全国工商联执行委员、广东省工商联(总商会)副主席。'

In [153]:
segListSanitized = preprocess(text)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.98, 0.9)
print()

Before sanitize, len: 325. After sanitize, len: 239
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0



In [154]:
text2 = '__团队成员#1__，Omnigo机器人CEO。毕业于华中科技大学，原uArm创始团队核心成员，uArm机械臂主创设计师。'

In [155]:
segListSanitized = preprocess(text2)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.98, 0.9)
print()

Before sanitize, len: 27. After sanitize, len: 18
match 酷刻
company entity found by exact match: 酷刻->广州酷刻科技有限公司



In [88]:
company_df.loc[3230,:]

name                                                 广州酷刻科技有限公司
embeddings    [0.00019766974612139165, -0.001492998823523521...
Name: 3230, dtype: object

In [162]:
text3 = '__团队成员#1__，北京爱国者新能源科技发展有限公司 CEO。'

In [163]:
segListSanitized = preprocess(text3)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.95, 0.9)
print()

Before sanitize, len: 16. After sanitize, len: 10
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9760900424845506
company entity found: 发展->深圳市迈迪加科技发展有限公司, sim = 0.9604227663245726
sim: 0.9713396702727501
company entity found: 爱国者->爱国者电子科技有限公司, sim = 0.9713396702727501
company entity found: 科技->多玛凯拔科技有限公司, sim = 1.0000000094492707
company entity found: 新能源->上海烯美新能源科技有限公司, sim = 0.9637615946079356



In [164]:
text4 = '__团队成员#1__,为爱漫科技（北京）有限公司执行董事'

In [165]:
segListSanitized = preprocess(text4)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.95, 0.9)
print()

Before sanitize, len: 15. After sanitize, len: 9
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9760900424845506
company entity found: 科技->多玛凯拔科技有限公司, sim = 1.0000000094492707
company entity found: 爱漫->爱漫科技（北京）有限公司, sim = 0.9739465827942052



In [166]:
text5 = '__团队成员#1__，Oracleen创始人。前腾讯电脑管家产品总监、奇虎360产品总监，十年互联网产品经验，熟悉智能硬件，曾研发过国内最早的NAS+路由智能设备。先后在安天实验室、腾讯、奇虎360、盛天网络担任产品总监，带队研发过3款用户过千万的产品。其中包括安天防线、腾讯电脑管家、360网站卫士、易乐玩、易乐游等。'

In [167]:
segListSanitized = preprocess(text5)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.95, 0.9)
print()

Before sanitize, len: 89. After sanitize, len: 63
company entity found: 网络->深圳市正星特网络有限公司, sim = 0.9775604418627177
company entity found: 腾讯->腾讯控股有限公司
, sim = 0.9712984916123004
company entity found: 爱芽->爱芽（北京）科技有限公司, sim = 0.977569224421832
company entity found: 互联网->深圳酷旗互联网有限公司, sim = 0.9751275155291305



In [195]:
text6 = '__团队成员#1__，ILIFE智能扫地机创始人。'

In [196]:
segListSanitized = preprocess(text6)
output = generateNgrams(segListSanitized, 3)
linkCompany(output, model, company_df, 0.95, 0.87)
print()

Before sanitize, len: 12. After sanitize, len: 7
match 爱乐福
company entity found by exact match: 1爱乐福->爱乐福（深圳）科技有限公司
match 爱乐福
company entity found by exact match: 1爱乐福智能->爱乐福（深圳）科技有限公司
match 爱乐福
company entity found by exact match: 成员1爱乐福->爱乐福（深圳）科技有限公司
match 爱乐福
company entity found by exact match: 爱乐福智能扫地机->爱乐福（深圳）科技有限公司
match 爱乐福
company entity found by exact match: 爱乐福->爱乐福（深圳）科技有限公司
match 爱乐福
company entity found by exact match: 爱乐福智能->爱乐福（深圳）科技有限公司



In [138]:
for index, row in member_df.iterrows():
    if index == 50:
        break
    print(f'Handle No.{index} text')
    start = datetime.datetime.now()
    text = row['Resume']
    segListSanitized = preprocess(text)
    output = generateNgrams(segListSanitized, 3)
    linkCompany(output, model, company_df, 0.96, 0.9)
    end = datetime.datetime.now()
    print(f'cost time: {end - start} sec')
    print()

Handle No.0 text
Before sanitize, len: 325. After sanitize, len: 239
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
cost time: 0:04:21.161389 sec

Handle No.1 text
Before sanitize, len: 330. After sanitize, len: 264
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
cost time: 0:04:09.386390 sec

Handle No.2 text
Before sanitize, len: 134. After sanitize, len: 108
cost time: 0:02:13.404943 sec

Handle No.3 text
Before sanitize, len: 187. After sanitize, len: 149
cost time: 0:02:27.970139 sec

Handle No.4 text
Before sanitize, len: 126. After sanitize, len: 94
cost time: 0:01:55.641639 sec

Handle No.5 text
Before sanitize, len: 139. After sanitize, len: 111
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
cost time: 0:02:24.473510 sec

Handle No.6 text
Before sanitize, len: 140. After sanitize, len: 116
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
cost time: 0:02:14.968615 sec

Handle No.7 text
Before sanitize, len: 574. After sanitize, len: 411


In [169]:
for index, row in member_df.iterrows():
    if index < 45:
        continue
    
    if index > 60:
        break
    print(f'Handle No.{index} text')
    start = datetime.datetime.now()
    text = row['Resume']
    segListSanitized = preprocess(text)
    output = generateNgrams(segListSanitized, 3)
    linkCompany(output, model, company_df, 0.96, 0.9)
    end = datetime.datetime.now()
    print(f'cost time: {end - start} sec')
    print()

Handle No.45 text
Before sanitize, len: 16. After sanitize, len: 10
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9760900424845506
company entity found: 发展->深圳市迈迪加科技发展有限公司, sim = 0.9604227663245726
sim: 0.9713396702727501
company entity found: 爱国者->爱国者电子科技有限公司, sim = 0.9713396702727501
company entity found: 科技->多玛凯拔科技有限公司, sim = 1.0000000094492707
company entity found: 新能源->上海烯美新能源科技有限公司, sim = 0.9637615946079356
cost time: 0:00:13.666006 sec

Handle No.46 text
Before sanitize, len: 16. After sanitize, len: 10
sim: 0.9713396702727501
company entity found: 爱国者->爱国者电子科技有限公司, sim = 0.9713396702727501
cost time: 0:00:11.050225 sec

Handle No.47 text
Before sanitize, len: 12. After sanitize, len: 7
cost time: 0:00:07.870590 sec

Handle No.48 text
Before sanitize, len: 13. After sanitize, len: 8
cost time: 0:00:09.269059 sec

Handle No.49 text
Before sanitize, len: 15. After sanitize, len: 9
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9760900424845506
company entity found: 科技->多玛凯拔科技有限