# Trade Barriers Text Mining

## Part 1: Cleaning Data

In [1]:
import pandas as pd

# Load the survey sheet. It has no header row, so the three columns are named here.
df = pd.read_excel('trade_barrier.xlsx', header=None, names=['type', 'barrier', 'status'])
# Create country column: cells of the form "國家：<name>" in the first column carry the
# country name, which is forward-filled onto the rows below it.
# .ffill() is used instead of fillna(method='ffill'), which recent pandas deprecates.
df['country'] = df['type'].str.extract(r'國家：(.+)', expand=False).ffill()
# Remove "「2015年對臺貿易障礙報告」調查表" rows
df = df[~df['type'].str.contains(r'調查表', na=False)].reset_index(drop=True)

In [2]:
# Remove newlines and carriage returns
df = df.replace(r'[\n\r]', r'', regex=True)
# Clean barrier type column: drop spaces, keep only the text after the last "、",
# collapse variant labels onto four canonical ones, then forward-fill empty cells.
# regex=True is spelled out because Series.str.replace treats the pattern as a literal
# string by default since pandas 2.0; .ffill() replaces the deprecated fillna(method=...).
df['type'] = (
    df['type']
    .str.replace(' ', '', regex=False)
    .str.replace(r'.+、(.*)', r'\1', regex=True)
    .replace(
        [r'.*檢驗.*', r'.*其他.*', r'.*人員移動.*', r'.*標準.*'],
        [r'檢驗與檢疫', r'其他', r'人員移動', r'標準與認證'], regex=True)
    .ffill()
)
# Replace commas, semicolons, parentheses, and non-interword whitespaces.
# The comma rule uses lookarounds so that only the comma itself is rewritten. The
# previous capturing form substituted \1 and \2 even when a different alternative had
# matched, which erased the characters on both sides of the comma, and it also skipped
# every second comma in runs such as "中,文,字" because neighbours were consumed.
df = df.replace(
    [r"(?<=[^a-zA-Z0-9.' ]),(?=.)|(?<=.),(?=[^a-zA-Z0-9.' ])",
     r'([^a-zA-Z]{1});',
     r'（',
     r'）',
     r'([^a-zA-Z]{1}) ',
     r' ([^a-zA-Z]{1})',
    ],
    [r'，',
     r'\1；',
     r'(',
     r')',
     r'\1',
     r'\1'
    ], regex=True)
# Fix incorrect spacing: re-insert a space where a lowercase letter runs into an
# uppercase one.
df = df.replace(r'([a-z])([A-Z])', r'\1 \2', regex=True)

## Part 2: Compiling International Trade Terms Dictionary

In [3]:
%%time

import requests
import re
import string

def get_trade_dict():
    """Scrape two online trade glossaries and write the merged term list to trade.txt.

    Terms and acronyms are collected from a www.trade.gov.tw page and from the
    per-letter dictionary pages of web.wtocenter.org.tw. Entries joined by a literal
    "vs." or "V.S." are split into their parts, every entry is stripped of
    surrounding whitespace, empty entries and duplicates are dropped, and the sorted
    result is written one term per line (UTF-8) to ``trade.txt`` in the working
    directory.

    Returns:
        None. The only result is the file written to disk.

    Raises:
        requests.HTTPError: if a page answers with an HTTP error status.
    """
    # pandas deprecates passing literal HTML to read_html; wrap it in a file-like object.
    from io import StringIO

    timeout = 30  # seconds; a stalled server should not hang the notebook forever
    # Dots are escaped so that only a literal "vs." / "V.S." counts as a separator
    # (unescaped, "V.S." would also match words such as "VISA").
    versus = re.compile(r'vs\.|V\.S\.')

    def fetch_tables(url):
        """Download ``url`` and return all HTML tables on the page as DataFrames."""
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return pd.read_html(StringIO(resp.text))

    def scrape_boft():
        """Return terms (column 1) and parenthesised acronyms (column 2) of table 1."""
        data = fetch_tables(
            'http://www.trade.gov.tw/Pages/Detail.aspx?nodeID=134&pid=58045')[1]
        # regex=True is required: since pandas 2.0 str.replace is literal by default.
        terms = data[1].dropna().astype(str).str.replace(r'\(.*?\)', '', regex=True)
        acronyms = re.findall(r'\((.*?)\)', ''.join(data[2].dropna().astype(str)))
        acronyms = [re.sub(r'^ | $', r'', x) for x in acronyms]
        acronyms = [re.sub(r'、', r', ', x) for x in acronyms]
        # Discard parenthesised text that is just one plain word (all lowercase or
        # Capitalised) or empty.
        acronyms = [x for x in acronyms
                    if not re.search(r'^[A-Z]{1}[a-z]*$|^[a-z]*$', x)]
        return list(terms) + acronyms

    def scrape_cier():
        """Return terms and upper-case acronyms from every per-letter dictionary page."""
        cier_terms = []
        # 'X' is skipped as in the original code; presumably that page has no
        # entries - TODO confirm.
        letters = [c for c in string.ascii_uppercase if c != 'X']
        for letter in letters:
            url = 'http://web.wtocenter.org.tw/DictionaryList.aspx?formclass=' + letter
            col = fetch_tables(url)[0].iloc[1:, 1].dropna().astype(str)
            parentheses_rm = (
                col.str.replace(r'（.*?）', '', regex=True)
                   .str.replace(r'\(.*?\)', '', regex=True)
                   .str.replace(' ', '', regex=False)
            )
            # Keep the text before the first hyphen; rows without a hyphen yield NaN
            # and are dropped so that only strings reach the regex step below.
            terms = parentheses_rm.str.extract(r'(.*?)-', expand=False).dropna().tolist()
            acronyms = re.findall(r'([A-Z]{2,})', ''.join(col.values))
            cier_terms += terms + acronyms
        return cier_terms

    pieces = []
    for term in scrape_boft() + scrape_cier():
        # split() returns [term] unchanged when no separator is present, so plain
        # and composite entries can share one code path.
        pieces.extend(versus.split(term))
    trade_dict = sorted({piece.strip() for piece in pieces} - {''})
    with open('trade.txt', mode='w', encoding='utf-8') as file:
        file.write('\n'.join(trade_dict))
    return

# Run the scrape now: this performs the HTTP requests and (re)writes trade.txt.
get_trade_dict()

Wall time: 33.3 s


## Part 3: Text Segmentation and Keyword Extraction

In [5]:
from pathlib import Path

import jieba
import jieba.analyse
import numpy as np

# Using default dictionary of jieba-zh_TW.
# The file is located relative to the installed jieba package instead of a hard-coded
# Anaconda install path, so the cell also runs on other machines.
jieba.set_dictionary(str(Path(jieba.__file__).resolve().parent / 'dict.txt'))
# Alternative dictionary provided in jieba extra_dict:
# jieba.set_dictionary('dict.txt.big.txt')
# Add custom dictionary
jieba.load_userdict('trade.txt')
# Custom stop words
# jieba.analyse.set_stop_words(file_name)
# Custom Idf
jieba.analyse.set_idf_path('idf.txt')

# Fixed seed so the same ten rows are sampled on every run.
np.random.seed(7)
for ind in np.random.randint(0, len(df), 10):
    text = df['status'][ind]
    # Segment first, then extract the top-10 keywords, matching the original
    # evaluation order.
    segmented = '|'.join(jieba.cut(text))
    keywords = ', '.join(jieba.analyse.extract_tags(text, 10, withWeight=False))
    print(('Trade barrier document {}:\n\n'.format(ind)
           + segmented + '\n\n'
           + '*Keywords: {}\n\n--\n'.format(keywords)))

Trade barrier document 175:

1|.|泰國|於|1999|年|開始|實施|外商|法後|，|外商|禁止|經營|外商|法|附錄|中第|一類|所列|9|大|產業|；|惟|經|取得|內閣|允許|或|特許|執照|，|得|經營|第|二|、|三類|所列|各|產業|。|2|.|外商|法|對於|外國|法人|之|定義|及|對|外商|營業|之|限制|，|對|外商|在泰|投資|經營|發展|不利|。|(|註|：|依據|泰國|外商|法|，|泰國|限制|外商|經營|之|服務業|項目|如下|：|第|一|類|：|因|特殊|理由|禁止|外國人|經營|的|業別|包含|：|1|.|報業|、|廣播|電台|、|電視台|；|2|.|農業|、|灌溉|或|園藝|；|3|.|動物|育種|；|4|.|林業|及|木材|保育|；|5|.|漁業|；|6|.|泰|草本|粹取|；|7|.|泰國|古董|貿易|：|8|.|製造|佛像|及缽|；|9|.|土地|販售|。|第|二類|：|基於|國家|安全|、|維護|風土|民俗|、|保障|當地|製造|及|保護|天然|資源|等|原因|，|禁止|外國人|經營業|，|需|內閣|同意|後|才|允許|經營|相關|產業|。|第|三類|：|泰國人|對|外國人|未|具|競爭力|之|業務|，|惟|需|先|取得|特許|執照|：|碾米|與|麵粉|製造|、|包括|魚類|等|水產|養殖|、|植木|、|會計|服務業|；|法律|服務業|；|建築|服務業|；|工程|服務業|；|旅店|業|、|不|含|旅店|管理|；|導遊|業|(|包括|旅行社|業務|)|；|除|部會|級|法規|規定|的|服務業|以外|的|其|他|服務業|；|仲介|代理業|，|但|不|包括|(|a|)|證券|交易|仲介|或|代理|、|農產品|期貨|交易|、|有價證券|買賣|服務|；|(|b|)|為|聯營|企業|的|生產|、|服務|需要|提供|買賣|、|採購|、|尋求|服務|的|仲介|或|代理|業務|；|(|c|)|為|外國人|投入|最|低|資本|1|億|泰銖|以上|的|行銷|國內|產品|或|進口|產品|的|國際|貿易|企業|提供|買賣|、|採購|、|推銷|、|尋求|國內外|市場|的|仲介|或|代理|業務|。|(|d|)|部級|法規|規定|的|其|他|仲介|或|代理業|)|。|3|.|倘|以|泰國人|註冊|公司|及|其|營運|項目|，