# Demo6
This notebook implements an exact-match entity-linking demo, which serves as a baseline for comparison with the present model.

In [50]:
import jieba
import re
import chardet
import pandas as pd
import prettytable as pt

## Load File

In [13]:
def loadSchool():
    """Return a one-column ('name') frame of university names.

    Two CSV files are read: 'data/chinese_university_list.csv' (first 4 lines
    skipped) and 'data/all_university_list.csv' (first 2 lines skipped). Only
    the 'name' column of each is kept, and the two are stacked under a fresh
    0..n-1 index. No de-duplication is done here.
    """
    domestic = pd.read_csv(
        'data/chinese_university_list.csv',
        header=None,
        delimiter=",",
        skiprows=4,
        names=["rank", "name", "code", "department", "city", "level", "notes"],
    )
    worldwide = pd.read_csv(
        'data/all_university_list.csv',
        header=None,
        delimiter=',',
        skiprows=2,
        names=['Name_en', 'name', 'rank', 'score', 'location'],
    )
    name_columns = [domestic[['name']], worldwide[['name']]]
    return pd.concat(name_columns, axis=0, ignore_index=True)

In [14]:
def loadDegree():
    """Return the fixed degree labels as a one-column ('name') frame."""
    labels = ['本科', '硕士', '研究生', '博士']
    return pd.DataFrame({'name': labels})

In [2]:
def loadCompany():
    """Return a de-duplicated one-column ('name') frame of company names.

    Names come from 'data/company_list_ch.csv' and from the first column of
    'data/member-data.csv'. The two lists are stacked, the row count is
    printed, exact duplicate names are dropped (first occurrence kept), and
    the row count is printed again. The returned frame keeps the index labels
    of the surviving rows.
    """
    listed = pd.read_csv(
        'data/company_list_ch.csv',
        header=None,
        delimiter=",",
        skiprows=2,
        names=['rank', 'name', 'Location', 'Income'],
    )[['name']]
    # NOTE(review): this file is read with skiprows=2 here but skiprows=1 in
    # loadPosition/loadMember, so its first data row may be dropped here.
    # Confirm which is intended.
    from_members = pd.read_csv(
        'data/member-data.csv',
        header=None,
        delimiter=",",
        skiprows=2,
        names=['name', 'No.', 'Resume', 'Position'],
    )[['name']]

    combined = pd.concat([listed, from_members], axis=0, ignore_index=True)
    print(f'before dedup, company size: {combined.shape[0]}')
    unique_names = combined.drop_duplicates(subset=['name'], keep='first')
    print(f'after dedup, company size: {unique_names.shape[0]}')
    return unique_names

In [15]:
def loadPosition():
    """Return a de-duplicated one-column ('name') frame of position titles.

    Titles come from 'data/position.csv' plus the fourth column of
    'data/member-data.csv'. Member titles containing '&' are split on '&';
    otherwise titles containing a space are split on whitespace; anything
    else is kept whole. Row counts before and after de-duplication are
    printed.
    """
    catalogue = pd.read_csv(
        'data/position.csv', header=None, delimiter=",", skiprows=1, names=['name']
    )
    member_titles = pd.read_csv(
        'data/member-data.csv',
        header=None,
        delimiter=",",
        skiprows=1,
        names=['Company', 'No.', 'Resume', 'name'],
    )['name'].tolist()

    extracted = []
    for title in member_titles:
        # A float here is a missing cell (NaN).
        # NOTE(review): str.isalpha() is true for all-letter strings in any
        # script, so single-word titles such as 'CEO' or '总经理' are skipped
        # too. Confirm this is intended.
        if isinstance(title, float) or title == " " or title.isalpha():
            continue
        if "&" in title:
            extracted.extend(title.split('&'))
        elif " " in title:
            extracted.extend(title.split())
        else:
            extracted.append(title)

    from_members = pd.DataFrame(extracted, columns=['name'])
    combined = pd.concat([catalogue, from_members], axis=0, ignore_index=True)
    print(f'before dedup, position size: {combined.shape[0]}')
    unique_titles = combined.drop_duplicates(subset=['name'], keep='first')
    print(f'after dedup, position size: {unique_titles.shape[0]}')
    return unique_titles

In [16]:
def loadMember():
    """Return the 'Resume' column of 'data/member-data.csv' as a one-column frame.

    The first line of the file is skipped and the four columns are named
    'Company', 'No.', 'Resume' and 'Position'.
    """
    members = pd.read_csv(
        'data/member-data.csv',
        header=None,
        delimiter=",",
        skiprows=1,
        names=['Company', 'No.', 'Resume', 'Position'],
    )
    return members[['Resume']]

## Preprocess Text

In [3]:
def removeStopWords(seglist):
    """Translate known brand names and drop stop words from a token list.

    Stop words are read from 'data/stopwords_cn.txt' (one per line, stripped
    of surrounding whitespace); a single space is always treated as a stop
    word as well.

    Args:
        seglist: iterable of string tokens.

    Returns:
        A new list holding the surviving tokens in their original order.
        Tokens matching a known brand name are replaced by its Chinese name
        before the stop-word check.
    """
    # Keys are lower-case and matched case-insensitively: preprocess()
    # lower-cases the text before tokenising, so a mixed-case key such as
    # 'Aibee' could otherwise never match.
    translations = {
        'omnigo': '酷刻',
        'aibee': '爱笔',
        'ilife': '爱乐福',
        'oracleen': '爱芽',
    }

    # `with` guarantees the handle is closed even if reading raises.
    with open('data/stopwords_cn.txt', 'r', encoding='utf-8', errors='ignore') as fstop:
        stopwords = {line.strip() for line in fstop}
    stopwords.add(' ')

    segListSanitized = []
    for word in seglist:
        word = translations.get(word.lower(), word)
        if word not in stopwords:
            segListSanitized.append(word)
    return segListSanitized

In [4]:
def preprocess(text):
    """Turn raw text into a list of sanitized tokens.

    Runs of the listed punctuation/whitespace characters are replaced by a
    single space, the text is lower-cased, jieba segments it (cut_all=False),
    and removeStopWords filters the tokens. The token counts before and after
    filtering are printed.
    """
    # Replace punctuation runs with a space, then normalise case.
    cleaned = re.sub(
        r"[\s+\.\!\/_,$%^*()?;；:【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+",
        " ",
        text,
    ).lower()
    # Separate the text into words.
    seglist = list(jieba.cut(cleaned, cut_all=False))
    # Drop stop words.
    kept = removeStopWords(seglist)
    print(f'Before sanitize, len: {len(seglist)}. After sanitize, len: {len(kept)}')
    return kept

## N-gram Algorithm

In [5]:
def getNgrams(wordList, n):
    '''
    Return every run of n consecutive items of wordList, each run
    concatenated into one string, in order of position.
    '''
    window_count = len(wordList) - n + 1
    return ["".join(wordList[start:start + n]) for start in range(window_count)]

In [6]:
def generateNgramsV2(wordList, n):
    '''
    Generate the de-duplicated 1-grams through n-grams of wordList.

    Returns a list of n lists; element k holds the distinct (k+1)-grams
    produced by getNgrams, in order of first occurrence.
    '''
    result = []
    for size in range(1, n + 1):
        grams = getNgrams(wordList, size)
        # dict.fromkeys keeps the first occurrence of each key in insertion
        # order, giving the same result as set() + sort(key=list.index)
        # without the quadratic index lookups.
        result.append(list(dict.fromkeys(grams)))

    return result

## Entity Link

In [32]:
def _countLoweredNames(df):
    """Map each lower-cased, non-missing value of df['name'] to its row count."""
    counts = {}
    for name in df['name']:
        # Missing cells come through as float NaN; skip them.
        if isinstance(name, float):
            continue
        key = name.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts


def linkEntity(output, company_df, position_df, school_df, degree_df):
    """Exact-match n-gram terms against the four reference tables.

    Args:
        output: list of term lists, one per n-gram size (as produced by
            generateNgramsV2).
        company_df, position_df, school_df, degree_df: frames with a 'name'
            column. Names are lower-cased before comparison; missing names
            are ignored.

    Returns:
        A tuple (company_entity, position_entity, school_entity,
        degree_entity) of lists of matched terms. A term is appended once per
        matching table row, so duplicate rows in a table yield duplicate
        entries. Terms of length 0 or 1 are never matched.

    Each match and each processed n-gram list is reported with print().
    """
    # Build each lookup once instead of rescanning every table for every term.
    lookups = [
        ('company', _countLoweredNames(company_df), []),
        ('position', _countLoweredNames(position_df), []),
        ('school', _countLoweredNames(school_df), []),
        ('degree', _countLoweredNames(degree_df), []),
    ]

    for index, li in enumerate(output):
        # The label is the zero-based list position, so the 1-gram list is
        # reported as '0-Gram'.
        print(f'process {index}-Gram')

        for term in li:
            if len(term) <= 1:
                continue

            # Tables are checked in this fixed order for every term.
            for label, counts, found in lookups:
                for _ in range(counts.get(term, 0)):
                    found.append(term)
                    print(f'{label} entity found: {term}')

    company_entity, position_entity, school_entity, degree_entity = (
        found for _, _, found in lookups
    )
    return company_entity, position_entity, school_entity, degree_entity

In [46]:
def print_table_company_position(company_entity, position_entity):
    """Print a two-column Company/Position table.

    Both lists are cut to the length of the shorter one, so entries are
    paired purely by position and any surplus entries are not shown.
    """
    pair_count = min(len(company_entity), len(position_entity))

    table = pt.PrettyTable()
    table.add_column("Company", list(company_entity[:pair_count]))
    table.add_column("Position", list(position_entity[:pair_count]))
    print(table)

In [47]:
def print_table_school_degree(school_entity, degree_entity):
    """Print a two-column School/Degree table.

    When no degree was found, '本科' is shown as the default degree. Both
    lists are then cut to the length of the shorter one, so entries are
    paired purely by position. Neither argument is modified.
    """
    # Use a local list for the default so the caller's list is left untouched.
    degrees = list(degree_entity) if len(degree_entity) > 0 else ['本科']

    row_count = min(len(school_entity), len(degrees))

    tb = pt.PrettyTable()
    tb.add_column("School", list(school_entity[:row_count]))
    tb.add_column("Degree", degrees[:row_count])
    print(tb)

In [48]:
def print_table(company_entity, position_entity, school_entity, degree_entity):
    """Print the Company/Position table followed by the School/Degree table."""
    print_table_company_position(company_entity, position_entity)
    print_table_school_degree(school_entity, degree_entity)

## Case Test

In [38]:
# Load every reference table once. loadCompany and loadPosition print their
# row counts before and after de-duplication.
company_df = loadCompany()
position_df = loadPosition()
school_df = loadSchool()
degree_df = loadDegree()
# NOTE(review): member_df is not used by the visible cells below.
member_df = loadMember()

before dedup, company size: 11716
after dedup, company size: 3571
before dedup, position size: 5983
after dedup, position size: 199


In [39]:
# Sample resume text used as the test case. '__团队成员#1__' appears where a
# person's name would be (presumably an anonymisation placeholder).
text1 = '__团队成员#1__先生是公司创始人,也是中国最有影响力的商界领袖之一。1982年,__团队成员#1__先生于华南理工大学毕业,进入TCL的前身-TTK家庭电器有限公司。1985年,他担任新成立的TCL通讯设备公司总经理,创立了TCL品牌。2003年,__团队成员#1__担任TCL集团股份有限公司董事长兼CEO,随后TCL集团整体上市。在他的领导下,2004年TCL一举收购了法国汤姆逊全球彩电业务与阿尔卡特全球手机业务。目前TCL集团已经成为拥有6万名员工,业务遍及全球80多个国家和地区。2013年,TCL集团营业总收入超过855亿元,液晶电视全球销量1766万台,实际产量全球第三,品牌销售全球第三;TCL手机全球销量5520万台,行业排名全球第五。2012年__团队成员#1__被新华网评为“最具社会责任感企业家”;2011年荣获《中国企业家》“最具影响力的25位企业领袖”终身成就奖;2009年被评为“CCTV中国经济年度人物十年商业领袖”;2008年获改革开放30年经济人物称号;2004年被评为Fortune杂志“亚洲年度经济人物”、TIME杂志和CNN全球最具影响力的25名商界人士,同年法国总统希拉克向__团队成员#1__先生颁发了法国国家荣誉勋章。__团队成员#1__是中共第十六大代表,第十届、第十一届、第十二届全国人大代表。__团队成员#1__担任的社会职务包括:中国电子视像行业协会会长;中国国际商会副会长;全国工商联执行委员、广东省工商联(总商会)副主席。'

In [40]:
# Tokenise the sample text, build the de-duplicated 1- to 3-grams, and
# exact-match them against the reference tables.
segListSanitized = preprocess(text1)
output = generateNgramsV2(segListSanitized, 3)
company_entity, position_entity, school_entity, degree_entity = linkEntity(output, company_df, position_df, school_df, degree_df)
# The former trailing call passed an undefined name `f1` as a fifth argument
# to the four-parameter print_table, which made this cell raise on a fresh
# kernel. The tables are printed by the print_table call in the next cell.

Before sanitize, len: 325. After sanitize, len: 239
process 0-Gram
position entity found: 创始人
school entity found: 华南理工大学
school entity found: 华南理工大学
position entity found: 总经理
position entity found: 董事长
position entity found: ceo
process 1-Gram
position entity found: 公司创始人
process 2-Gram
company entity found: tcl集团股份有限公司


In [51]:
# Show the linked entities as a Company/Position table and a School/Degree table.
print_table(company_entity, position_entity, school_entity, degree_entity)

+---------------------+----------+
|       Company       | Position |
+---------------------+----------+
| tcl集团股份有限公司 |  创始人  |
+---------------------+----------+
+--------------+--------+
|    School    | Degree |
+--------------+--------+
| 华南理工大学 |  本科  |
+--------------+--------+
