In [16]:
import pandas as pd
from datetime import datetime, timedelta
import time
from urllib import parse
import requests
from urllib.request import urlretrieve
import json
import re

from Models import ElasticSearchClass
import importlib
importlib.reload(ElasticSearchClass)

# Show up to 512 characters per cell so long query/body strings are readable
# when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 512)

# Elasticsearch instances to process, keyed by instance name. Each entry holds
# the host URL used for requests and the index the percolator queries are
# loaded from. The host values look like redacted placeholders -- replace them
# with real endpoints before running.
INSTANCES = {
    "ins1": {
        "host": "http://user:pwd@xxx.xxx.xxx.xxx:port",
        "index": "index1",
    },
    "ins2": {
        "host": "http://user:pwd@xxx.xxx.xxx.xxx:port",
        "index": "index2",
    },
}

# Newest day to search for logs, formatted as %Y%m%d (the format
# retrieveFromES parses its startDate with).
STARTDATE = "20180320"

In [17]:
def acceptQuery(query, instance):
    """Decide whether a percolator query should be kept.

    Args:
        query: Query string to inspect; may be None.
        instance: Instance key. When it is None no filtering is applied.

    Returns:
        True when ``instance`` is None. Otherwise False when ``query`` is
        None, when it has no standalone ``and`` operator (case-insensitive),
        or when it contains the text ``test`` (case-insensitive); True in
        every other case.
    """
    if instance is None:
        return True
    if query is None:
        return False

    # 1. Drop queries that have only one condition: require a standalone
    #    boolean "and". The word boundaries stop terms such as "brand" or
    #    "android" from being mistaken for the operator.
    if re.search(r"\band\b", query, re.IGNORECASE) is None:
        return False

    # 2. Drop queries that mention "test" anywhere (substring match).
    if re.search("test", query, re.IGNORECASE) is not None:
        return False

    return True
        
def exists(data, record):
    """Report whether ``record`` is contained in ``data``.

    Args:
        data: Container to search; None is treated as "nothing stored".
        record: Item to look for.

    Returns:
        False when ``data`` is None, otherwise the result of
        ``record in data``.

    Raises:
        ValueError: If ``data`` is given but ``record`` is None.
    """
    if data is not None:
        if record is None:
            raise ValueError("record None")
        return record in data
    return False


def formatResult(doc):
    """Return the ``_id`` and ``body`` values of ``doc['_source']`` as a two-item list."""
    source = doc['_source']
    return [source[field] for field in ('_id', 'body')]

In [18]:
def loadPercolatorQueries(indexName, instance, hosts, maxCount=-1):
    """Collect the accepted percolator queries stored in an index.

    Args:
        indexName: Index passed to ``scrollSearch``.
        instance: Instance key forwarded to ``acceptQuery``.
        hosts: Host specification used to build the ElasticSearchClass client.
        maxCount: Upper bound on the number of returned rows; values <= 0
            mean no limit.

    Returns:
        A list of ``[doc id, query string, str(name)]`` rows, one per
        document whose query passes ``acceptQuery``.

    Raises:
        ValueError: If ``indexName`` or ``hosts`` is None.
    """
    if hosts is None or indexName is None:
        raise ValueError("indexName, hosts invalid.")

    client = ElasticSearchClass.ElasticSearchClass(hosts)
    rows = []
    for hit in client.scrollSearch(indexName=indexName):
        # Stop as soon as the requested number of rows has been gathered.
        if maxCount > 0 and len(rows) >= maxCount:
            break

        source = hit['_source']
        queryText = str(source['query']['query_string']['query'])
        if not acceptQuery(queryText, instance):
            continue
        rows.append([hit['_id'], queryText, str(source['name'])])
    return rows

In [19]:
def retrieveFromES(queryString, hosts, objname,
                   maxCount=200, startDate=None, toleranceDays=30):
    """Fetch documents matching a query from the daily ``log-<date>-<objname>`` aliases.

    Starting at ``startDate`` and walking back one day at a time for
    ``toleranceDays`` days, a ``_search`` request is issued against each
    day's alias. Every hit that ``isNcrazerHandled`` does not flag is
    converted with ``formatResult`` and collected.

    Args:
        queryString: Query passed (URL-quoted) as the ``q`` parameter.
        hosts: Base URL of the Elasticsearch host.
        objname: Object name used as the alias suffix.
        maxCount: ``size`` of each request and, when > 0, the total number
            of results after which the search stops.
        startDate: First (most recent) day as a ``%Y%m%d`` string; defaults
            to today.
        toleranceDays: Number of days to look back, including the start day.

    Returns:
        A list of ``formatResult`` outputs.

    Raises:
        ValueError: If ``hosts`` or ``objname`` is None, or ``startDate``
            does not match ``%Y%m%d``.
    """
    if hosts is None or objname is None:
        raise ValueError("hosts, objname invalid.")

    dateformat = "%Y%m%d"
    # `datetime` here is the class (from datetime import datetime), so the
    # current time is datetime.now(), not datetime.datetime.now().
    if startDate is None:
        firstDay = datetime.now()
    else:
        firstDay = datetime.strptime(startDate, dateformat)

    # Loop-invariant pieces of the request URL.
    quotedQuery = parse.quote(queryString)
    sizeParam = "&size=" + str(maxCount)

    data_retrieved = []
    for i in range(toleranceDays):
        dataDate = firstDay - timedelta(days=i)
        aliasName = "log-" + dataDate.strftime(dateformat) + "-" + objname
        requestString = hosts + "/" + aliasName + "/_search?q=" + quotedQuery + sizeParam
        response = requests.get(requestString)
        results = json.loads(response.text)

        try:
            hits = results['hits']['hits']
        except (KeyError, TypeError):
            # No hits structure in the response (presumably an error payload,
            # e.g. no alias for that day -- TODO confirm); try the next day.
            continue

        try:
            for doc in hits:
                if isNcrazerHandled(doc):
                    continue
                data_retrieved.append(formatResult(doc))
                if maxCount > 0 and len(data_retrieved) >= maxCount:
                    return data_retrieved
        except Exception as e:
            # Stay best-effort (move on to the next day) but report the
            # problem instead of hiding it.
            print("retrieveFromES: skipped rest of {} after error: {!r}".format(aliasName, e))

    return data_retrieved

In [None]:
if __name__ == "__main__":
    # For every configured instance: export its accepted percolator queries
    # to "<index>-<instance>.csv", then export the log documents retrieved
    # for each query to "<index>-<instance>-logs.csv".
    for instance, settings in INSTANCES.items():
        index_name = settings['index']
        host = settings['host']

        queries = loadPercolatorQueries(index_name, instance, host)
        df_queries = pd.DataFrame(queries, columns=['id', 'query', 'objname'])
        df_queries.to_csv("{}-{}.csv".format(index_name, instance), sep=',', encoding='utf-8', index=False)

        # Gather plain rows and build the frame once. DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        log_rows = []
        for row in df_queries.itertuples(index=False):
            # objname holds str(name) from loadPercolatorQueries, presumably
            # the repr of a one-element list such as "['x']" -- strip the
            # surrounding bracket/quote characters.
            objname = row.objname.lstrip("['").rstrip("']")
            data_tmp = retrieveFromES(row.query, host, objname, 100, STARTDATE)
            # Each record is the two-item formatResult output; tag it with
            # the id of the query that produced it.
            log_rows.extend([*record, row.id] for record in data_tmp)
            print("{} query done, records {}".format(row.id, len(data_tmp)))

        df_logs = pd.DataFrame(log_rows, columns=['objname', 'body', 'id'])
        df_logs.to_csv("{}-{}-logs.csv".format(index_name, instance), sep=",", encoding="utf-8", index=False)