In [1]:
import pandas as pd
import re
import json
import time

# Data Read-in

In [2]:
# Display all content
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', None)

In [3]:
data = pd.read_csv('labeled_data.csv')

In [4]:
# Just a bunch of cleaning stuff for irrelevant data
data = data[['item_name', 'title', 'content_','label']]
data.columns = ['tag', 'title', 'content', 'label']

# Text Cleaning

In [5]:
# Remove Whitespaces and Newlines
pattern = r'[\n ]'
data['content'] = pd.Series([re.sub(pattern, '', text) for text in data.content])

# Split Text

In [6]:
# Split text into blocks, by 500 characters, separated by period(, or comma), or whatever
# Return a list of strings

def split_text(text, size = 500):
    
    # Argument Restrictions
    if type(text) != str:
        raise ValueError("Input text must be a string!")
    
    if size < 100:
        raise ValueError("Split size must be g.o.e 100!")
    elif size > 600:
        raise ValueError("Split size must be l.o.e 600!")
    
    # Initialize
    length = len(text)
    text_block = []
    i, j = 0, size
    
    while j < length:
        while text[j - 1] != '。':
            j -= 1
            if j == i:
                j = i + size
                while text[j - 1] != '，':
                    j -= 1
                    if j == i:
                        j = i + size
                        break
                break
        text_block.append(text[i:j])
        i = j
        j = i + size
    text_block.append(text[i:length])
    
    return text_block

In [7]:
data['split'] = pd.Series([split_text(text) for text in data.content])

In [8]:
# Remove Stopwords
stopwords = [line.strip() for line in open('stopwords.txt', encoding = 'UTF-8').readlines()]

In [9]:
def move_stopwords(wordlist, stopwords):
    output = [word for word in wordlist if word not in stopwords]
    return output

# Baidu

In [10]:
# encoding:utf-8
import requests 
from aip import AipNlp

# client_id 为官网获取的AK， client_secret 为官网获取的SK
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=WhgBKh94vkLS7IEN01GmynsA&client_secret=XWYOMfoCadaQnTEZDgCARpwoq41eNM4y'
response = requests.get(host)
if response:
    print(response.json())

client = AipNlp('23257388', 'WhgBKh94vkLS7IEN01GmynsA', 'XWYOMfoCadaQnTEZDgCARpwoq41eNM4y')

{'refresh_token': '25.23a04c9ad0f16287520e51a0653ba1ad.315360000.1924226934.282335-23257388', 'expires_in': 2592000, 'session_key': '9mzdDZRpFFifHaMyYxDbJDmRt987hjglz5aSPVyQOxFQHZ6CH2fVwNZJAw8ianer3IWhbQK5Sy7tMB5O//GIXTJL2a6kXQ==', 'access_token': '24.6b1608ffdf43807b09c707d81e7f08ad.2592000.1611458934.282335-23257388', 'scope': 'public nlp_simnet nlp_wordemb nlp_comtag nlp_dnnlm_cn brain_nlp_lexer brain_all_scope brain_nlp_comment_tag brain_nlp_dnnlm_cn brain_nlp_word_emb_vec brain_nlp_word_emb_sim brain_nlp_sentiment_classify brain_nlp_simnet brain_nlp_depparser brain_nlp_wordembedding brain_nlp_dnnlm_cn_legacy brain_nlp_simnet_legacy brain_nlp_comment_tag_legacy brain_nlp_lexer_custom brain_nlp_keyword brain_nlp_topic brain_nlp_ecnet brain_nlp_emotion brain_nlp_comment_tag_custom brain_nlp_news_summary brain_nlp_sentiment_classify_custom brain_nlp_address wise_adapt lebo_resource_base lightservice_public hetu_basic lightcms_map_poi kaidian_kaidian ApsMisTest_Test权限 vis-classify_flow

In [11]:
# Convert utf-8 to gbk

def killAnUnseen(s):
    try:
        s.encode('gbk')
        return s
    except UnicodeEncodeError as err:
        mode = re.findall(r'position ([0-9]*): illegal multibyte sequence', str(err))
        news = s.replace(s[int(mode[0])], "", 1)
        return killAnUnseen(news)

In [33]:
# Get both baidu segmentation and sentiment

def baidu(text_list):
    seg_list = []
    prob_list = []
    length = len(text_list)
    i = 0
    
    # 2 operations per loop
    while i + 1 < length:
        prob_list.append(baidu_prob(text_list[i]))
        prob_list.append(baidu_prob(text_list[i+1]))
        seg_list.append(baidu_seg(text_list[i]))
        seg_list.append(baidu_seg(text_list[i+1]))
        i += 2
        time.sleep(1)
    
    if i == length - 1:
        prob_list.append(baidu_prob(text_list[i]))
        seg_list.append(baidu_seg(text_list[i]))
        time.sleep(1)
        
    return seg_list, prob_list


def baidu_seg(text):
    result_list = []
    result = client.lexer( killAnUnseen(text))
    while len(result) == 2:
        time.sleep(1)
        result = client.lexer( killAnUnseen(text))
    for item in result['items']: 
        result_list.append(item['item'])
    result_list = move_stopwords(result_list, stopwords)
    
    return result_list


def baidu_prob(text):
    result = client.sentimentClassify( killAnUnseen(text) )
    while len(result) == 2:
        time.sleep(1)
        result = client.sentimentClassify( killAnUnseen(text) )
    return result['items'][0]['negative_prob']

In [13]:
data['baidu_seg'] = ''
data['baidu_prob'] = ''

In [14]:
data = data[['content', 'split', 'baidu_seg', 'baidu_prob', 'label']]

In [34]:
def write_value(i):
    length = len(data.index)
    
    while i < length:
        seg, prob = baidu(data.at[i, 'split'])
        data.at[i, 'baidu_seg'] = seg
        data.at[i, 'baidu_prob'] = prob
        print(str(i))
        i += 1
        
    print("Finish All!")

In [43]:
write_value(1648)

1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754


ConnectionError: HTTPSConnectionPool(host='aip.baidubce.com', port=443): Max retries exceeded with url: /rpc/2.0/nlp/v1/sentiment_classify?access_token=24.6a99f9f3809f01b6353f1eb0c28ffd9b.2592000.1611458935.282335-23257388 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022BF16B9850>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。'))

In [41]:
data.loc[34]

content                                                                                                                    昨天，在中建一局底价8.83亿摘获武清新城地块时，大张庄016号地却宣布“因故停牌”了！2019年11月29日我局发布了宗地编号为津辰张（挂）2019-016号地块国有建设用地使用权的公开挂牌出让公告，现因故停牌，特此公告。四宗地块原计划于2020年1月2日同天摘牌。结果没想到临近出让，大张庄016号地却下架了。该宗地出让土地面积39417.2平米，土地用途为城镇住宅、商服、公共管理与公共服务，国有建设用地使用权出让年限为城镇住宅70年、商服40年、公共管理与公共服务50年。
split                                                                                                                    [昨天，在中建一局底价8.83亿摘获武清新城地块时，大张庄016号地却宣布“因故停牌”了！2019年11月29日我局发布了宗地编号为津辰张（挂）2019-016号地块国有建设用地使用权的公开挂牌出让公告，现因故停牌，特此公告。四宗地块原计划于2020年1月2日同天摘牌。结果没想到临近出让，大张庄016号地却下架了。该宗地出让土地面积39417.2平米，土地用途为城镇住宅、商服、公共管理与公共服务，国有建设用地使用权出让年限为城镇住宅70年、商服40年、公共管理与公共服务50年。]
baidu_seg     [[昨天, 中建一局, 底价, 8.83亿, 摘, 获, 武清, 新城, 地块, 大张庄, 016号, 宣布, 因故, 停牌, 2019年11月29日, 局, 发布, 宗地, 编号, 津辰张, 挂, 2019, 016号, 地块, 国有建设用地使用权, 公开, 挂牌, 出让, 公告, 现, 因故, 停牌, 特此, 公告, 四宗, 地块, 原, 计划, 2020年1月2日, 同天, 摘牌, 没想到, 临近, 出让, 大张庄, 016号, 下架, 宗地, 出让, 土地, 面积, 39417.2平米, 土地, 用途,