# Play with baidu-index

In [54]:
import json
import jieba
import datetime
import requests

import pandas as pd

from gensim.models import Word2Vec

## URL template

The useable URL template is

```python
url_template = 'https://index.baidu.com/api/NewsApi/getFeedTopNewsIndex?dates[]=2023-03-01,2023-03-02,2023-03-05,2023-03-07,2023-03-13,2023-03-20,2023-03-27,2023-03-28&dates[]=2023-03-03,2023-03-07,2023-03-10,2023-03-13,2023-03-23,2023-03-28&dates[]=2023-03-01,2023-03-04,2023-03-06,2023-03-09,2023-03-11,2023-03-13,2023-03-16,2023-03-20,2023-03-23,2023-03-26&dates[]=2023-03-05,2023-03-08,2023-03-10,2023-03-13,2023-03-16,2023-03-18,2023-03-22,2023-03-27&dates[]=2023-03-02,2023-03-05,2023-03-08,2023-03-11,2023-03-14,2023-03-20,2023-03-24,2023-03-26&type=day&words=%E6%9D%A8%E7%B4%AB%E7%90%BC,%E4%B8%A4%E4%BC%9A,%E6%96%87%E5%BF%83%E4%B8%80%E8%A8%80,chatgpt,%E4%BC%8A%E6%9C%97'
```

The python codes are used to generate the url like that.

There are parameters:
- dates[]=YYYY-MM-DD, YYYY-MM-DD, ..., YYYY-MM-DD
- dates[]=YYYY-MM-DD, YYYY-MM-DD, ..., YYYY-MM-DD
- (repeat for n times)
- type=day
- words=%s,%s,...,%s (repeat for n times)
- n refers the words of interest

## Generate resources

### URL for given words and back_days

- words: The words of interest;
- back_days: The time range of interest, how many days before today.

### Headers for acquire baidu-index

- headers: The headers copied from browser with login information in baidu-index.

In [55]:
WORDS = [
    '杨紫琼',
    '两会',
    '文心一言',
    'chatgpt',
    '伊朗',
    None
]

BACK_DAYS = 100

In [56]:
def mk_url(words, days):
    '''
    Make the url based on words and days
    '''
    dates = ','.join([d.strftime('%Y-%m-%d') for d in days])
    
    query = '&'.join([
        'type=day',
        'words={}'.format(','.join(words))
    ] + [
        'dates[]=' + dates
        for _ in words
    ])
    
    url = 'https://index.baidu.com/api/NewsApi/getFeedTopNewsIndex?{}'.format(
        query
    )
    
    return url

def generate_url(words, back_days):
    '''
    Generate the url based on words and back_days
    '''
    words = [e for e in words if e]
    today = datetime.date.today()
    print(f'Query {back_days} days back from {today} for words of {words}')

    days = [today - datetime.timedelta(d) for d in range(back_days)]
    print(days)

    url = mk_url(words, days)

    return url

def mk_headers():
    '''
    Make useable headers for baidu-index
    '''
    txt = open('private/baidu-headers.txt').read()

    headers = dict()
    for h in txt.split('\n'):
        split = h.split(':', 1)
        headers[split[0]] = split[1].strip()

    return headers

In [57]:
url = generate_url(WORDS, BACK_DAYS)
url

Query 100 days back from 2023-03-31 for words of ['杨紫琼', '两会', '文心一言', 'chatgpt', '伊朗']
[datetime.date(2023, 3, 31), datetime.date(2023, 3, 30), datetime.date(2023, 3, 29), datetime.date(2023, 3, 28), datetime.date(2023, 3, 27), datetime.date(2023, 3, 26), datetime.date(2023, 3, 25), datetime.date(2023, 3, 24), datetime.date(2023, 3, 23), datetime.date(2023, 3, 22), datetime.date(2023, 3, 21), datetime.date(2023, 3, 20), datetime.date(2023, 3, 19), datetime.date(2023, 3, 18), datetime.date(2023, 3, 17), datetime.date(2023, 3, 16), datetime.date(2023, 3, 15), datetime.date(2023, 3, 14), datetime.date(2023, 3, 13), datetime.date(2023, 3, 12), datetime.date(2023, 3, 11), datetime.date(2023, 3, 10), datetime.date(2023, 3, 9), datetime.date(2023, 3, 8), datetime.date(2023, 3, 7), datetime.date(2023, 3, 6), datetime.date(2023, 3, 5), datetime.date(2023, 3, 4), datetime.date(2023, 3, 3), datetime.date(2023, 3, 2), datetime.date(2023, 3, 1), datetime.date(2023, 2, 28), datetime.date(2023, 2, 2

'https://index.baidu.com/api/NewsApi/getFeedTopNewsIndex?type=day&words=杨紫琼,两会,文心一言,chatgpt,伊朗&dates[]=2023-03-31,2023-03-30,2023-03-29,2023-03-28,2023-03-27,2023-03-26,2023-03-25,2023-03-24,2023-03-23,2023-03-22,2023-03-21,2023-03-20,2023-03-19,2023-03-18,2023-03-17,2023-03-16,2023-03-15,2023-03-14,2023-03-13,2023-03-12,2023-03-11,2023-03-10,2023-03-09,2023-03-08,2023-03-07,2023-03-06,2023-03-05,2023-03-04,2023-03-03,2023-03-02,2023-03-01,2023-02-28,2023-02-27,2023-02-26,2023-02-25,2023-02-24,2023-02-23,2023-02-22,2023-02-21,2023-02-20,2023-02-19,2023-02-18,2023-02-17,2023-02-16,2023-02-15,2023-02-14,2023-02-13,2023-02-12,2023-02-11,2023-02-10,2023-02-09,2023-02-08,2023-02-07,2023-02-06,2023-02-05,2023-02-04,2023-02-03,2023-02-02,2023-02-01,2023-01-31,2023-01-30,2023-01-29,2023-01-28,2023-01-27,2023-01-26,2023-01-25,2023-01-24,2023-01-23,2023-01-22,2023-01-21,2023-01-20,2023-01-19,2023-01-18,2023-01-17,2023-01-16,2023-01-15,2023-01-14,2023-01-13,2023-01-12,2023-01-11,2023-01-10,2023-0

In [58]:
headers = mk_headers()
headers

{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding': 'gzip, deflate, br',
 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
 'Cache-Control': 'max-age=0',
 'Connection': 'keep-alive',
 'Cookie': 'BIDUPSID=01E90F88033048EC9C995116A6BA8FEA; PSTM=1656986812; BAIDUID=01E90F88033048ECB9F5BAA12F660687:FG=1; BAIDUID_BFESS=01E90F88033048ECB9F5BAA12F660687:FG=1; ZFY=tQn1mznUiqUKNivbhuUsd6mkS2j6OKmOunjsjbJIcFM:C; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1680142268; BDUSS=R-ajA1dElhenFVbnN3UmpzRmVSbWthaDVVT0ZVWERtZG5RNkFxbFFCN0pmRXhrRVFBQUFBJCQAAAAAAAAAAAEAAAAtAFszu9jNt7zxyrG54gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMnvJGTJ7yRkRT; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04301547311x99SEbdlLy7O1Qp%2FPtbYn2kbrNan0DGWqfIkKBkaMn9Y6nVWVyUXH1DGO2QFkSVMo3RmZ2OGwtnoUtOmIen7Zc%2FDj%2BkC1a9xwjdhaqdvDvLfNnt%2BzZm1JLwPckrdBrh4ApffJbiFUX29JKHW%2BZITkc4

## Request the news with url and headers

- Request the news title;
- Build DataFrame;
- Write the news title into [.query.txt](query.txt) file.

In [59]:
resp = requests.get(url, headers=headers)
resp

<Response [200]>

In [60]:
obj = json.loads(resp.content)
obj

{'status': 0,
 'data': {'杨紫琼': [{'date': '2023-03-29',
    'news': [{'date': '2023-03-29',
      'url': 'https://mbd.baidu.com/newspage/data/dtlandingsuper?sourceFrom=share_ugc&nid=dt_4387432363610856672',
      'title': '很难想象陆仙人竟然可以跟杨紫琼合影[捂脸]',
      'same_news': 1},
     {'date': '2023-03-29',
      'url': 'https://baijiahao.baidu.com/s?id=1754597612523144197&wfr=content',
      'title': '港片：我一出手，你们都得死！',
      'same_news': 1},
     {'date': '2023-03-29',
      'url': 'https://baijiahao.baidu.com/s?id=1760550137358452776&wfr=content',
      'title': '杨紫琼牺牲20年竟遭到这样的下场',
      'same_news': 1}]},
   {'date': '2023-03-28',
    'news': [{'date': '2023-03-28',
      'url': 'https://haokan.baidu.com/v?vid=4990217563123200144',
      'title': '张丰毅刚进家门看到妻子与闺蜜们坐在沙发上，她们看到张丰毅两眼放光，张丰毅觉得浑身不自在想要上楼。老婆',
      'same_news': 1},
     {'date': '2023-03-28',
      'url': 'https://baijiahao.baidu.com/s?id=1754597612523144197&wfr=content',
      'title': '港片：我一出手，你们都得死！',
      'same_news': 1},
     {'date

In [61]:
lst = []

for word, lvl1 in obj['data'].items():
    for lvl2 in lvl1:
        for lvl3 in lvl2['news']:
            rec = lvl3
            rec['query'] = word
            lst.append(rec)

all_query = pd.DataFrame(lst)
all_query

Unnamed: 0,date,url,title,same_news,query
0,2023-03-29,https://mbd.baidu.com/newspage/data/dtlandings...,很难想象陆仙人竟然可以跟杨紫琼合影[捂脸],1,杨紫琼
1,2023-03-29,https://baijiahao.baidu.com/s?id=1754597612523...,港片：我一出手，你们都得死！,1,杨紫琼
2,2023-03-29,https://baijiahao.baidu.com/s?id=1760550137358...,杨紫琼牺牲20年竟遭到这样的下场,1,杨紫琼
3,2023-03-28,https://haokan.baidu.com/v?vid=499021756312320...,张丰毅刚进家门看到妻子与闺蜜们坐在沙发上，她们看到张丰毅两眼放光，张丰毅觉得浑身不自在想要上...,1,杨紫琼
4,2023-03-28,https://baijiahao.baidu.com/s?id=1754597612523...,港片：我一出手，你们都得死！,1,杨紫琼
...,...,...,...,...,...
1324,2022-12-23,http://baijiahao.baidu.com/s?id=17528160527072...,中方的话，伊朗还是听进去了，关键局势已到，容不得伊朗不满,2,伊朗
1325,2022-12-23,https://haokan.baidu.com/v?vid=427946632162119...,伊朗外交部警告：泽连斯基最好知道，伊朗的耐心不是无止境的,4,伊朗
1326,2022-12-22,https://haokan.baidu.com/v?vid=499710564691632...,伊朗货币值太大了，个个都是亿万富翁,2,伊朗
1327,2022-12-22,http://baijiahao.baidu.com/s?id=17528160527072...,中方的话，伊朗还是听进去了，关键局势已到，容不得伊朗不满,2,伊朗


In [62]:
for w in all_query['query'].unique():
    print(w)

杨紫琼
两会
文心一言
chatgpt
伊朗


In [63]:
group = all_query.groupby('query')

with open('query.txt', 'w') as f:
    for e in group:
        print('\n-------------------------------', file=f)
        print(e[0], file=f)
        print('\n'.join(e[1]['title']), file=f)

group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014B874AED90>

## Parse the titles and analysis

- Parse into strings with words with jieba;
- Build the word vector with gensim;
- The model is saved in [model/word2vec.model](model/word2vec.model)


In [64]:
titles = list(all_query['title'])
titles

['很难想象陆仙人竟然可以跟杨紫琼合影[捂脸]',
 '港片：我一出手，你们都得死！',
 '杨紫琼牺牲20年竟遭到这样的下场',
 '张丰毅刚进家门看到妻子与闺蜜们坐在沙发上，她们看到张丰毅两眼放光，张丰毅觉得浑身不自在想要上楼。老婆',
 '港片：我一出手，你们都得死！',
 '很难想象陆仙人竟然可以跟杨紫琼合影[捂脸]',
 '1997年的杨紫琼',
 '杨紫琼也打不动了，“打女”只是个头衔，终归要靠演技',
 '张丰毅刚进家门看到妻子与闺蜜们坐在沙发上，她们看到张丰毅两眼放光，张丰毅觉得浑身不自在想要上楼。老婆',
 '香港大尺度电影，关之琳贡献令人咋舌的表演，听说是假戏真做',
 '师姐：两位霸王花审讯方式不一样，一个暴力一个温柔，下秒精彩了',
 '杨紫琼也打不动了，“打女”只是个头衔，终归要靠演技',
 '杨紫琼，获奥斯卡首位亚裔影后，不断超越自我的女人有点酷！',
 '杨紫琼也打不动了，“打女”只是个头衔，终归要靠演技',
 '师姐：两位霸王花审讯方式不一样，一个暴力一个温柔，下秒精彩了',
 '香港这一夜，10位影帝齐现身，有人头发花白，有人脸僵认不出',
 '师姐：两位霸王花审讯方式不一样，一个暴力一个温柔，下秒精彩了',
 '1997年的杨紫琼',
 '1997年的杨紫琼',
 '香港这一夜，10位影帝齐现身，有人头发花白，有人脸僵认不出',
 '杨紫琼的脚已经严重变形了，脚趾居然外翻成这样！#杨紫琼#',
 '甄子丹家族8人盛装出席活动，大6岁岳母气质好，百亿妹夫很帅气',
 '3月19日，中国香港。甄子丹在出席某活动接受采访时，直呼杨紫琼夺得奥斯卡影后令他“眼湿湿”，同时回应',
 '杨紫琼的脚已经严重变形了，脚趾居然外翻成这样！#杨紫琼#',
 '香港这一夜，10位影帝齐现身，有人头发花白，有人脸僵认不出',
 '3月19日，中国香港。甄子丹在出席某活动接受采访时，直呼杨紫琼夺得奥斯卡影后令他“眼湿湿”，同时回应',
 '甄子丹家族8人盛装出席活动，大6岁岳母气质好，百亿妹夫很帅气',
 '香港这一夜，10位影帝齐现身，有人头发花白，有人脸僵认不出',
 '杨紫琼的脚已经严重变形了，脚趾居然外翻成这样！没想到大明星也会有这样的脚',
 '胡锡进最担心的事，还是发生了！近日，有网友抱着好奇心问了ChatGPT一个问题“如果让你扮演胡锡进，'

In [65]:
sentences = [[e for e in jieba.cut(title) if len(e.strip())>1] for title in titles]
sentences

[['很难', '想象', '仙人', '竟然', '可以', '杨紫琼', '合影', '捂脸'],
 ['港片', '出手', '你们'],
 ['杨紫琼', '牺牲', '20', '遭到', '这样', '下场'],
 ['张丰毅',
  '刚进',
  '家门',
  '看到',
  '妻子',
  '闺蜜们',
  '坐在',
  '沙发',
  '她们',
  '看到',
  '张丰毅',
  '两眼',
  '放光',
  '张丰毅',
  '觉得',
  '浑身',
  '不自在',
  '想要',
  '上楼',
  '老婆'],
 ['港片', '出手', '你们'],
 ['很难', '想象', '仙人', '竟然', '可以', '杨紫琼', '合影', '捂脸'],
 ['1997', '杨紫琼'],
 ['杨紫琼', '不动', '打女', '只是', '头衔', '终归', '演技'],
 ['张丰毅',
  '刚进',
  '家门',
  '看到',
  '妻子',
  '闺蜜们',
  '坐在',
  '沙发',
  '她们',
  '看到',
  '张丰毅',
  '两眼',
  '放光',
  '张丰毅',
  '觉得',
  '浑身',
  '不自在',
  '想要',
  '上楼',
  '老婆'],
 ['香港', '尺度', '电影', '关之琳', '贡献', '令人咋舌', '表演', '听说', '假戏真做'],
 ['师姐', '两位', '霸王花', '审讯', '方式', '一样', '一个', '暴力', '一个', '温柔', '精彩'],
 ['杨紫琼', '不动', '打女', '只是', '头衔', '终归', '演技'],
 ['杨紫琼', '奥斯卡', '首位', '亚裔', '影后', '不断', '超越自我', '女人', '有点'],
 ['杨紫琼', '不动', '打女', '只是', '头衔', '终归', '演技'],
 ['师姐', '两位', '霸王花', '审讯', '方式', '一样', '一个', '暴力', '一个', '温柔', '精彩'],
 ['香港', '一夜', '10', '影帝', '现身', '有人', '头发', '花白', '有人', '认不出'],

In [66]:
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
model.save("model/word2vec.model")
model

<gensim.models.word2vec.Word2Vec at 0x14b874ae9d0>

In [67]:
model.wv.most_similar('放光')

[('张丰毅', 0.6416546106338501),
 ('看到', 0.5240835547447205),
 ('两眼', 0.4912470281124115),
 ('她们', 0.4846631586551666),
 ('觉得', 0.47903570532798767),
 ('浑身', 0.4774070978164673),
 ('2023', 0.42714330554008484),
 ('随时', 0.42680275440216064),
 ('两会', 0.42227301001548767),
 ('开始', 0.4071129262447357)]