# Demo1
This is a demo. Goals:
+ Link school, company, degree, and position entities, in no particular order; only these entity types are extracted
+ Batch processing
+ Support 3 modes: hard match, embedding, and embedding with normalization

In [3]:
import jieba
import re
import chardet
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import math

## Load File

In [1]:
def loadSchool():
    """Load the Chinese and global university lists into one name table.

    Reads 'data/chinese_university_list.csv' and 'data/all_university.csv',
    keeps only the (Chinese) name column of each, and stacks them.  The row
    count of each source and of the combined table is printed.

    Returns:
        pandas.DataFrame with a single column 'name' and a fresh 0..n-1 index.
    """
    path1 = 'data/chinese_university_list.csv'
    school_df = pd.read_csv(path1, header=None, delimiter=",", skiprows=4, names=["rank", "name", "code", "department", "city", "level", "notes"])
    school_df = school_df[['name']]
    print(school_df.shape[0])

    path2 = 'data/all_university.csv'
    school_global_df = pd.read_csv(path2, header=None, delimiter=',', skiprows=4, names=['Name_en', 'Name_ch', 'rank', 'score', 'location'])
    # Rename to 'name' so both sources land in the same column.  Without this,
    # concat produces two columns and every global university gets NaN in
    # 'name', which the linking code then silently skips.
    school_global_df = school_global_df[['Name_ch']].rename(columns={'Name_ch': 'name'})
    print(school_global_df.shape[0])

    school_df = pd.concat([school_df, school_global_df], axis=0, ignore_index=True)
    print(school_df.shape[0])
    return school_df

In [87]:
def loadCompany():
    """Build the company-name table from the two company sources.

    Company names come from 'data/company_list_ch.csv' and from the first
    column of 'data/member-data.csv'.  Each source's row count and the
    combined row count are printed.

    NOTE(review): member-data.csv is read here with skiprows=2 while
    loadMember reads the same file with skiprows=1 -- confirm which is right.

    Returns:
        pandas.DataFrame with a single column 'name' and a fresh 0..n-1 index.
    """
    listed = pd.read_csv(
        'data/company_list_ch.csv',
        delimiter=",",
        header=None,
        names=['rank', 'name', 'Location', 'Income'],
        skiprows=2,
    )
    listed = pd.DataFrame(listed, columns=['name'])
    print(listed.shape[0])

    from_members = pd.read_csv(
        'data/member-data.csv',
        delimiter=",",
        header=None,
        names=['name', 'No.', 'Resume', 'Position'],
        skiprows=2,
    )
    from_members = pd.DataFrame(from_members, columns=['name'])
    print(from_members.shape[0])

    combined = pd.concat([listed, from_members], axis=0, ignore_index=True)
    print(combined.shape[0])
    return combined

In [25]:
def loadDegree():
    """Return (and print) the fixed table of degree names.

    Returns:
        pandas.DataFrame with a single column 'name' holding three degrees.
    """
    degree_names = ['学士', '硕士', '博士']
    table = pd.DataFrame({'name': degree_names})
    print(table)
    return table

In [26]:
def loadPosition():
    """Read the position titles from 'data/position.csv'.

    The first line of the file is skipped and the data is exposed under a
    single column called 'name'.  The first rows are printed for inspection.

    Returns:
        pandas.DataFrame with the column 'name'.
    """
    positions = pd.read_csv(
        'data/position.csv',
        delimiter=",",
        header=None,
        names=['name'],
        skiprows=1,
    )
    print(positions.head())
    return positions

In [31]:
def loadMember():
    """Read the team-member resumes from 'data/member-data.csv'.

    The first line of the file is skipped; only the 'Resume' column is kept.
    The first rows are printed for inspection.

    Returns:
        pandas.DataFrame with the single column 'Resume'.
    """
    raw = pd.read_csv(
        'data/member-data.csv',
        delimiter=",",
        header=None,
        names=['Company', 'No.', 'Resume', 'Position'],
        skiprows=1,
    )
    resumes = pd.DataFrame(raw, columns=['Resume'])
    print(resumes.head())
    return resumes

# Preprocess Text
+ remove punctuation
+ remove stopwords
+ convert to lowercase for later recognition
+ generate N-grams for entity linking

In [7]:
_STOPWORDS_CACHE = {}


def _loadStopWords(path):
    """Return the stop-word set stored at *path*, reading the file only once."""
    cached = _STOPWORDS_CACHE.get(path)
    if cached is None:
        # `with` guarantees the handle is closed even if reading fails.
        with open(path, 'r', encoding='utf-8', errors='ignore') as fstop:
            cached = {line.strip() for line in fstop}
        # A single space is always treated as a stop word.
        cached.add(' ')
        _STOPWORDS_CACHE[path] = cached
    return cached


def removeStopWords(seglist, stopwords_path='data/stopwords_cn.txt'):
    """Filter stop words out of a token list.

    Args:
        seglist: iterable of tokens (strings).
        stopwords_path: text file with one stop word per line; each line is
            stripped of surrounding whitespace.  Defaults to the project's
            Chinese stop-word list.

    Returns:
        A new list with the tokens of *seglist*, in order, that are neither
        listed in the stop-word file nor equal to a single space.
    """
    stopwords = _loadStopWords(stopwords_path)
    return [word for word in seglist if word not in stopwords]

In [8]:
def preprocess(text):
    """Turn raw resume text into a list of cleaned tokens.

    Steps: replace runs of punctuation/whitespace with one space, lowercase,
    segment with jieba (accurate mode), then drop stop words.  The token
    counts before and after stop-word removal are printed.

    Args:
        text: the raw text to process.

    Returns:
        List of tokens remaining after stop-word removal.
    """
    # Punctuation (ASCII and full-width) collapses to a single space.
    cleaned = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", text).lower()

    # Word segmentation.
    seglist = list(jieba.cut(cleaned, cut_all=False))

    # Stop-word filtering.
    segListSanitized = removeStopWords(seglist)
    print(f'Before sanitize, len: {len(seglist)}. After sanitize, len: {len(segListSanitized)}')

    return segListSanitized

## N-gram Algorithm

In [9]:
def getNgrams(wordList, n):
    '''
    Build the set of N-grams of exactly size n.

    Each N-gram is n consecutive tokens of wordList concatenated with no
    separator.  The result is empty when wordList has fewer than n tokens.
    '''
    window_count = len(wordList) - n + 1
    return {"".join(wordList[start:start + n]) for start in range(window_count)}

In [29]:
def generateNgrams(wordList, n):
    '''
    Build the union of all k-grams of wordList for k = 1 .. n.

    Relies on getNgrams for each individual size.
    '''
    grams = set()
    for size in range(1, n + 1):
        grams.update(getNgrams(wordList, size))
    return grams

## Word Embedding

In [11]:
# Load the pre-trained word vectors once at module level.  The embedding
# helpers in this notebook read this global `model` directly.
# NOTE(review): presumably a file written by gensim's own save(); confirm.
model = KeyedVectors.load('./test_50.bin')

In [12]:
def calculate_cosine_similarity(a, b):
    """Return the cosine similarity of two vectors rescaled to [0, 1].

    The raw cosine in [-1, 1] is mapped with ``0.5 + 0.5 * cos`` so that
    1.0 means same direction, 0.5 orthogonal and 0.0 opposite.

    Args:
        a, b: array-likes with the same number of elements (1-D, or 2-D
            row/column vectors; they are flattened).

    Returns:
        The rescaled similarity, or 0.0 when either vector has zero norm
        (the cosine is undefined there, so it is reported as "no match").

    Raises:
        ValueError: if the two vectors differ in length.
    """
    # Plain ndarrays instead of np.mat: np.mat was removed in NumPy 2.0 and
    # float() on a 1x1 matrix is deprecated.
    vector_a = np.asarray(a).ravel()
    vector_b = np.asarray(b).ravel()
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if denom == 0:
        return 0.0
    cos = float(np.dot(vector_a, vector_b)) / denom
    return 0.5 + 0.5 * cos

In [13]:
def generateEmbeddings(name):
    """Embed *name* as the mean of the vectors of its in-vocabulary words.

    The name is segmented with jieba (accurate mode); words missing from the
    global ``model`` vocabulary are ignored.

    Args:
        name: the entity name to embed.

    Returns:
        1-D numpy array with ``model.vector_size`` elements; all zeros when
        no word of the name is in the vocabulary.
    """
    # Size the accumulator from the model instead of a hard-coded 200.
    v = np.zeros(model.vector_size)
    hits = 0
    for word in jieba.cut(name, cut_all=False):
        if word in model.vocab:
            v += model[word]
            hits += 1

    # Average over the words that contributed.  The previous code divided by
    # len(v), i.e. the vector dimensionality, which is not a mean.
    if hits:
        v /= hits
    return v

In [35]:
def calculate_IDF(df):
    """Compute min-max normalised inverse-frequency weights for name words.

    Every string in ``df['name']`` is stripped of punctuation, lowercased and
    segmented with jieba.  Each word gets ``log10((1 + N) / count)`` where N
    is the number of usable names and count is the total number of times the
    word occurs; the weights are then rescaled to [0, 1].

    Args:
        df: DataFrame with a 'name' column.  Non-string entries (e.g. NaN)
            and names that are blank after cleaning are skipped.

    Returns:
        dict mapping word -> weight in [0, 1].  Empty when no usable name
        exists; all weights are 1.0 when every word has the same raw value
        (min-max scaling is undefined there).
    """
    name_count = 0
    occurrences = dict()
    for name in df['name']:
        # Filter missing values BEFORE any string operation: re.sub raises
        # TypeError on NaN.
        if not isinstance(name, str):
            continue
        name = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", name)
        name = name.lower()
        if not name.strip():
            continue
        name_count += 1
        for word in jieba.cut(name, cut_all=False):
            occurrences[word] = occurrences.get(word, 0) + 1

    if not occurrences:
        return dict()

    idf = {
        word: math.log((1 + name_count) / count, 10)
        for word, count in occurrences.items()
    }

    # Normalize min-max
    min_value = min(idf.values())
    max_value = max(idf.values())
    denom = max_value - min_value
    if denom == 0:
        # All words equally frequent: weight them uniformly instead of
        # dividing by zero.
        return {word: 1.0 for word in idf}

    return {word: (value - min_value) / denom for word, value in idf.items()}

In [14]:
def generateEmbeddingsWithIDF(name, idf):
    """Embed *name* as the IDF-weighted mean of its in-vocabulary word vectors.

    The name is segmented with jieba (accurate mode); words missing from the
    global ``model`` vocabulary are ignored.

    Args:
        name: the entity name to embed.
        idf: dict mapping word -> weight, e.g. from calculate_IDF.

    Returns:
        1-D numpy array with ``model.vector_size`` elements; all zeros when
        no word of the name is in the vocabulary.
    """
    # Size the accumulator from the model instead of a hard-coded 200.
    v = np.zeros(model.vector_size)
    hits = 0
    for word in jieba.cut(name, cut_all=False):
        if word in model.vocab:
            # A word absent from the IDF table is weighted 1.0 (the top of
            # the normalised range) instead of raising KeyError.
            # NOTE(review): confirm 1.0 is the desired fallback weight.
            v += model[word] * idf.get(word, 1.0)
            hits += 1

    # Average over the words that contributed.  The previous code divided by
    # len(v), i.e. the vector dimensionality, which is not a mean.
    if hits:
        v /= hits
    return v

In [38]:
def preprocess_entity_list(df, model):
    '''
    Attach an embedding to every entity name in df.

    df: DataFrame with a 'name' column; modified in place (an 'embeddings'
        column is added or overwritten) and also returned.
    model: word embedding model exposing `.vocab` and `model[word]`.

    The lowercased name is looked up in the model directly; names that are
    not a single vocabulary entry fall back to generateEmbeddings.  Rows
    whose name is not a string (e.g. NaN) keep '' as their embedding.
    '''
    embeddings = []
    for name in df['name']:
        if not isinstance(name, str):
            embeddings.append('')
            continue
        name = name.lower()
        if name in model.vocab:
            vec = model[name]
        else:
            vec = generateEmbeddings(name)
        embeddings.append(vec)

    # DataFrame.set_value was removed in pandas 1.0; assign the whole column
    # at once.  An explicit object dtype keeps one array per cell.
    df['embeddings'] = pd.Series(embeddings, index=df.index, dtype=object)
    return df

In [90]:
def preprocess_entity_list_withIDF(df, model, idf):
    '''
    Attach an IDF-weighted embedding to every entity name in df.

    df: DataFrame with a 'name' column; modified in place (an 'embeddings'
        column is added or overwritten) and also returned.
    model: word embedding model exposing `.vocab` and `model[word]`.
    idf: IDF weight for each word.

    Each name is stripped of punctuation and lowercased before lookup (the
    'name' column itself is left untouched).  A cleaned name that is itself a
    vocabulary entry uses that vector; otherwise generateEmbeddingsWithIDF
    is used.  Rows whose name is not a string (e.g. NaN) keep '' as their
    embedding.
    '''
    embeddings = []
    for name in df['name']:
        # Filter missing values BEFORE any string operation: re.sub raises
        # TypeError on NaN.
        if not isinstance(name, str):
            embeddings.append('')
            continue
        name = re.sub(r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", name)
        name = name.lower()
        if name in model.vocab:
            vec = model[name]
        else:
            vec = generateEmbeddingsWithIDF(name, idf)
        embeddings.append(vec)

    # DataFrame.set_value was removed in pandas 1.0; assign the whole column
    # at once.  An explicit object dtype keeps one array per cell.
    df['embeddings'] = pd.Series(embeddings, index=df.index, dtype=object)
    return df

## Entity Link

In [63]:
def _bestEntityMatch(term_vec, entity_df, threshold):
    """Return (name, sim) of the entity most similar to term_vec, or None.

    Only entities whose similarity is strictly above *threshold* qualify;
    on a tie the earliest row wins.  Rows with a missing (float/NaN) name
    are skipped.
    """
    best = None
    for name, name_vec in zip(entity_df['name'], entity_df['embeddings']):
        if isinstance(name, float):
            continue
        sim = calculate_cosine_similarity(term_vec, name_vec)
        if sim > threshold and (best is None or sim > best[1]):
            best = (name, sim)
    return best


def linkEntity(output, model, school_df, company_df, degree_df, position_df, school_threshold, company_threshold, degree_threshold, position_threshold):
    """Print the best school/company/degree/position match for each term.

    Args:
        output: iterable of candidate terms (e.g. the N-gram set).
        model: word embedding model exposing `.vocab` and `model[term]`.
        school_df, company_df, degree_df, position_df: entity tables with
            'name' and 'embeddings' columns.
        school_threshold, company_threshold, degree_threshold,
        position_threshold: a match is reported only when its similarity is
            strictly greater than the threshold of its entity type.

    Terms of length <= 1 and terms missing from the model vocabulary are
    ignored.  For every remaining term, at most one line per entity type is
    printed, in the order university, company, degree, position.

    Returns:
        None; results are written to stdout.
    """
    # One (label, table, threshold) triple per entity type replaces four
    # copy-pasted loops and gives every table the same missing-name guard.
    entity_tables = (
        ('university', school_df, school_threshold),
        ('company', company_df, company_threshold),
        ('degree', degree_df, degree_threshold),
        ('position', position_df, position_threshold),
    )

    for term in output:
        if len(term) <= 1 or term not in model.vocab:
            continue
        term_vec = model[term]
        for label, entity_df, threshold in entity_tables:
            match = _bestEntityMatch(term_vec, entity_df, threshold)
            if match is not None:
                name, sim = match
                print(f'{label} entity found: {term}->{name}, sim = {sim}')

## Main Function

In [91]:
# Build the resume table and the four entity reference tables
# (school, company, degree, position).  Each loader prints a short summary.
member_df = loadMember()
school_df = loadSchool()
company_df = loadCompany()
degree_df = loadDegree()
position_df = loadPosition()

                                              Resume
0  __团队成员#1__先生是公司创始人,也是中国最有影响力的商界领袖之一。1982年,__团队...
1  __团队成员#2__先生,现任TCL集团股份有限公司执行董事、总裁(COO)。1963年4月...
2  __团队成员#3__女士:1972年7月出生,中山大学法学博士,高级经济师。1993年6月至...
3  __团队成员#4__先生,1965年7月出生,东方电气集团党组副书记、副总经理,兼任东方电气...
4  __团队成员#5__女士,现任TCL多媒体集团有限公司非执行独立董事、A8新媒体集团非执行独...
     name
0    北京大学
1  中国人民大学
2    清华大学
3  北京交通大学
4  北京工业大学
                name
11711       紫光股份有限公司
11712     紫光西部数据有限公司
11713  紫罗兰家纺科技股份有限公司
11714   卒子科技（深圳）有限公司
11715   卒子科技（深圳）有限公司
  name
0   本科
1  研究生
2   博士
     name
0   首席执行官
1   首席运营官
2   首席财务官
3   首席信息官
4  人力资源总监


In [92]:
# Attach an 'embeddings' column to every entity table.
# Schools and degrees use the plain embedding.
school_df = preprocess_entity_list(school_df, model)
    
# Companies use IDF-weighted embeddings, with weights computed from the
# company names themselves.
company_idf = calculate_IDF(company_df)
company_df = preprocess_entity_list_withIDF(company_df, model, company_idf)
    
degree_df = preprocess_entity_list(degree_df, model)
    
# Positions are weighted the same way, with their own IDF table.
position_idf = calculate_IDF(position_df)
position_df = preprocess_entity_list_withIDF(position_df, model, position_idf)



In [93]:
# Link entities for each resume, stopping once the row with index 100 is reached.
for index, row in member_df.iterrows():
    if index == 100:
        break
    print(f'Handle No.{index} text')
    text = row['Resume']
    segListSanitized = preprocess(text)
    # Candidate terms are all 1- to 3-grams of the cleaned tokens.
    output = generateNgrams(segListSanitized, 3)
    # Thresholds are passed in the order school, company, degree, position.
    linkEntity(output, model, school_df, company_df, degree_df, position_df, 0.9, 0.96, 0.98, 0.9)
    print()

Handle No.0 text
Before sanitize, len: 325. After sanitize, len: 239
position entity found: 地区->地区销售经理, sim = 0.994823404036675
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
position entity found: 董事长->总经理, sim = 0.9022811891008132
company entity found: 中国->尚科宁家（中国）科技有限公司, sim = 0.9840627450595995
position entity found: ceo->首席执行官, sim = 0.9130788393663668
position entity found: 总经理->总经理, sim = 1.0000000596046448
university entity found: 华南理工大学->华南理工大学, sim = 1.000000029802326

Handle No.1 text
Before sanitize, len: 330. After sanitize, len: 264
university entity found: 西安交通大学->西安交通大学, sim = 1.0000000596046448
company entity found: 科技->多玛凯拔科技有限公司, sim = 1.0000000094492705
position entity found: 总裁->第一副总裁, sim = 0.9138278115528521
company entity found: 深圳市->深圳市君和睿通科技股份有限公司, sim = 0.9628859211180791
company entity found: tcl集团股份有限公司->TCL集团股份有限公司, sim = 1.0
university entity found: 财经学院->宁波财经学院, sim = 0.9081512997142207
position entity found: 董事长->总经理, sim = 0.9022811891008132

position entity found: 董事长->总经理, sim = 0.9022811891008132
university entity found: 商学院->郑州商学院, sim = 0.9071498487162017
position entity found: 首席执行官->首席执行官, sim = 1.0
company entity found: 中国->尚科宁家（中国）科技有限公司, sim = 0.9840627450595995
position entity found: ceo->首席执行官, sim = 0.9130788393663668
company entity found: 发展->深圳市迈迪加科技发展有限公司, sim = 0.9571209545059199
position entity found: 副总裁->副总裁, sim = 1.000000029802326
position entity found: 市场部经理->市场总监, sim = 0.9245886161678205
position entity found: 总经理->总经理, sim = 1.0000000596046448
position entity found: 集团副总裁->副总裁, sim = 0.9149988839029601
university entity found: 大学->吉林外国语大学, sim = 0.9060334598685178
company entity found: 汽车->上海汽车集团股份有限公司
, sim = 0.9556356986257624
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9572426110775603

Handle No.12 text
Before sanitize, len: 188. After sanitize, len: 124
position entity found: 总裁->第一副总裁, sim = 0.9138278115528521
company entity found: 深圳市->深圳市君和睿通科技股份有限公司, sim = 0.9628859211180791
position e

company entity found: 发展->深圳市迈迪加科技发展有限公司, sim = 0.9571209545059199
university entity found: 湖南->湖南工商大学, sim = 0.9128547738195922
company entity found: 湖南->湖南恒茂高科股份有限公司, sim = 0.9930103142529615
university entity found: 衡阳师范学院->衡阳师范学院, sim = 1.0000000596046448
company entity found: 惠州市->惠州市耐利普科技有限公司, sim = 0.9897734262234279
university entity found: 大学->吉林外国语大学, sim = 0.9060334598685178

Handle No.27 text
Before sanitize, len: 152. After sanitize, len: 116
position entity found: 总裁兼->首席运营官, sim = 0.9214009344577789
university entity found: 师范大学->南宁师范大学, sim = 0.9128770340491656
position entity found: 首席技术官->首席运营官, sim = 0.930001825094223
university entity found: 大学->吉林外国语大学, sim = 0.9060334598685178
degree entity found: 博士->博士, sim = 1.0
university entity found: 华南师范大学->华南师范大学, sim = 1.0000000596046448
company entity found: 华南->广州华南信息技术有限公司, sim = 0.961838773852374

Handle No.28 text
Before sanitize, len: 208. After sanitize, len: 166
position entity found: 总裁->第一副总裁, sim = 0.9138278115

company entity found: 教育->广州智惟高教育科技有限公司, sim = 0.9628609500760072
position entity found: 运营总监->运营总监, sim = 1.0000000596046448
position entity found: 总监->市场总监, sim = 0.9021543206068469
position entity found: 联合创始人->首席运营官, sim = 0.9034993946552277
degree entity found: 研究生->研究生, sim = 1.0

Handle No.56 text
Before sanitize, len: 58. After sanitize, len: 38
university entity found: 武汉大学->武汉大学, sim = 1.0000000596046448
position entity found: 联合创始人->首席运营官, sim = 0.9034993946552277

Handle No.57 text
Before sanitize, len: 27. After sanitize, len: 17
university entity found: 大学->吉林外国语大学, sim = 0.9060334598685178

Handle No.58 text
Before sanitize, len: 19. After sanitize, len: 12

Handle No.59 text
Before sanitize, len: 11. After sanitize, len: 6

Handle No.60 text
Before sanitize, len: 195. After sanitize, len: 148
position entity found: 总经理->总经理, sim = 1.0000000596046448
position entity found: 董事长->总经理, sim = 0.9022811891008132
position entity found: 副总经理->总经理, sim = 0.9247318804264069
unive

degree entity found: 研究生->研究生, sim = 1.0
university entity found: 南京航空航天大学->南京航空航天大学, sim = 1.000000029802326
company entity found: 中国->尚科宁家（中国）科技有限公司, sim = 0.9840627450595995
company entity found: 南京->南京控驰科技有限公司, sim = 0.979792042693938
university entity found: 哈尔滨理工大学->哈尔滨理工大学, sim = 1.0
position entity found: 总监->市场总监, sim = 0.9021543206068469

Handle No.86 text
Before sanitize, len: 147. After sanitize, len: 110
company entity found: 科技股份->华立捷科技股份有限公司, sim = 0.9999999984295314
university entity found: 合肥学院->合肥学院, sim = 1.0
position entity found: 总监->市场总监, sim = 0.9021543206068469
company entity found: 安徽省->安徽省极索智能科技有限公司, sim = 0.9685017100256157
university entity found: 联合大学->北京联合大学, sim = 0.900326448717886
university entity found: 大学->吉林外国语大学, sim = 0.9060334598685178
position entity found: 销售经理->销售经理, sim = 1.0
company entity found: 中国->尚科宁家（中国）科技有限公司, sim = 0.9840627450595995
company entity found: 北京->宝希（北京）科技有限公司, sim = 0.9572426110775603
position entity found: 销售总监->销售经理, sim