# 1、离线数据获取

## 1.1数据清洗

In [64]:
sid_message_list_last = []
invalid_sid = []
for sid_message in sid_message_list:
    if len(sid_message.waitersend.value_counts().keys()) == 2:
        sid_message_list_last.append(sid_message)
    else:
        invalid_sid.append(sid_message)
print(len(sid_message_list))
print(len(sid_message_list_last))

25538
24795


In [71]:
len(invalid_sid)

743

# 2、ES数据存储/删除/查询

In [36]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import sys
import time
from datetime import datetime,timedelta
import json

## 2.1 ES配置

In [37]:
# 集群名称
user = "jiesi-jdos-cs-rec"
# 集群密码
pss = "227DB7650798709B"
# 链接信息
hostArr = ["prod-4-40000-jiesi-jdos-cs-rec.jd.com:40000","prod-1-40000-jiesi-jdos-cs-rec.jd.com:40000","prod-2-40000-jiesi-jdos-cs-rec.jd.com:40000"]

In [38]:
es = Elasticsearch(
    hostArr,
    http_auth=(user, pss)
)

## 2.2 创建索引

In [303]:
# 索引名称
index_name='speechcraft_recommend'

In [304]:
mapping = '''
{
  "settings": {
        "index": {
            "number_of_shards": "4",
            "analysis": {
                "analyzer": {
                    "semi_analyzer": {
                        "pattern": ";",
                        "type": "pattern"
                    },
                    "comma_analyzer": {
                        "pattern": "@",
                        "type": "pattern"
                    }
                }
            },
            "number_of_replicas": "1"
        }
    },
  "mappings": {
    "properties": {
        "preMessage":{"type":"text","analyzer": "comma_analyzer"},
        "lastText":{"type":"text","analyzer": "comma_analyzer"},
        "preMessageString":{"type":"text"},
        "lastTextString":{"type":"text"},
        "nextTextString":{"type":"text"},
        "waiter":{"type":"text"}
    }
  }
}
'''
es.indices.create(index=index_name, ignore=400, body=mapping)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'speechcraft_recommend'}

In [305]:
# 分片名称
alias_name = 'cs_speechcraft_recommend'

In [306]:
es.indices.put_alias(index=index_name, name=alias_name)

{'acknowledged': True}

## 2.3 删除索引

In [302]:
es.indices.delete(index=index_name, ignore=[400, 404])

{'acknowledged': True}

## 2.4 单条写入数据

In [234]:
import jieba
import warnings
warnings.filterwarnings("ignore")

In [235]:
def process_data_to_db(df):
    content_list = list(df.content)
    waitersend_list = list(df.waitersend)
    sid_len = len(waitersend_list)
    pre_list = []
    cur_list = []
    pre_string_list = []
    cur_string_list = []
    next_string_list = []
    
    for i in range(1,sid_len):
        if waitersend_list[i] == 'WAITER' and waitersend_list[i-1] != 'WAITER':
            pre_list.append('@'.join(list(jieba.cut(''.join(content_list[:i-1])))))
            cur_list.append('@'.join(list(jieba.cut(content_list[i-1]))))
            pre_string_list.append(';'.join(content_list[:i-1]))
            cur_string_list.append(content_list[i-1])
            next_string_list.append(content_list[i])
    
    df_new = pd.DataFrame()
    df_new['preMessage'] = pre_list
    df_new['lastText'] = cur_list
    df_new['preMessageString'] = pre_string_list
    df_new['lastTextString'] = cur_string_list
    df_new['nextTextString'] = next_string_list
    df_new['waiter'] = list(df.waiter)[0]
    return df_new

In [236]:
len(sid_message_list_last)

24795

In [307]:
df_db_list = []
for sid_message in sid_message_list_last:
    df_db_list.append(process_data_to_db(sid_message))

In [308]:
df_db = pd.concat(df_db_list,axis = 0)
df_db.shape

(121411, 6)

In [311]:
df_db = df_db.reset_index(drop=True)

In [242]:
# from decimal import *
for index, row in df_db.iterrows():
    doc = {
        "preMessage" :row["preMessage"],
        "lastText" :row["lastText"],
        "preMessageString" :row["preMessageString"],
        "lastTextString":row["lastTextString"],
        "nextTextString" :row["nextTextString"],
        "waiter" :row["waiter"]
    }
    res = es.index(index=index_name, id=f"{index}", body=doc)

In [313]:
df_db.shape

(121411, 6)

## 2.5 批量写入数据

In [321]:
import json
from decimal import *
i = 0
k = 0
actions = []
for item in (df_db.iterrows()):
    data = {}
    i += 1
    for col in df_db.columns:
        data[col] = item[1][col]
    action = {
                "_index": index_name,
                "_type": "_doc",
                "_id": item[0],
                "_source": data
              }
    
    actions.append(action)
    if i == 3000:
        try:
            blk=helpers.bulk(es, actions,request_timeout=20)
            i = 0
            actions = []
            k += 1
        except:
            traceback.print_exc()
            print(action)
            pass
try:
    blk=helpers.bulk(es, actions,request_timeout=20)
    k += 1
except:
    traceback.print_exc()
print((k-1)*3000 + i)

41


## 2.6 读取数据

In [None]:
# term:明确的值
# match:知道分词器的存在,模糊查询，只能指定一个字段查询
# multi_match：可以指定多个查询字段，同match一样，query会对field进行分词操作，然后再查询
# query:
# "_source":["name","interest"],  //查询结果只展示name和interest字段
#   "_source":{
#     "includes":["name","interest"],  // 包含哪些字段
#     "excludes":["company"]} //排除哪些字段，优先级高于includes

# must:必须满足
# must_not:必须不满足 不计算相关度分数
# should:可能满足

In [385]:
query = {
    "query":
    {
        "bool":
        {
            "must":[],
            "must_not":[],
            "should":
            [
                {
                    "match":
                    {
                        "lastText":
                        {
                            "analyzer": "comma_analyzer",
                            "query": "转@人工",
                            "operator": "OR",
                            "zero_terms_query": "NONE",
                            "minimum_should_match":1,
                            "boost": 5
                        }
                    }
                },
                {
                    "match":
                    {
                        "preMessage":
                        {
                            "analyzer": "comma_analyzer",
                            "query": "我要@转@人工",
                            "operator": "OR",
                            "zero_terms_query": "NONE",
                            "minimum_should_match":1,
                            "boost": 2
                        }
                    }
                }
            ]
        }
    },
    "from":0,
    "size":1,
    "sort":[],
    "aggs":{},
    "_source":['nextTextString','waiter']
    #"explain": True
}

In [386]:
#es查询
res = es.search(index=index_name,body=query,search_type='dfs_query_then_fetch')

## 2.7 读取超过10000的数据量

In [356]:
from elasticsearch import Elasticsearch, helpers
import time
query = {"query":{"bool":{"must":[{"match_all":{}}],"must_not":[],"should":[]}},"from":0,"size":100000,"sort":[],"aggs":{}}
start_time = time.time()
 
# helpers.scan()生成的是一个迭代器
res = helpers.scan(es, index=index_name, scroll='2m', query=query)
save_data = []
count = 0
for data in res:
    save_data.append(data)
    count += 1
print(len(save_data))
print(count)
end_time = time.time()

print(f"耗时：{end_time - start_time}")

121411
121411
耗时：3.5662176609039307
