# Getting results from ElasticSearch
# Data cleaning and concatenating using pandas

We recommend installing the following libraries before running this notebook:
- pandas, NumPy, matplotlib (all included with Anaconda3)
- elasticsearch (the Python client for Elasticsearch)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch

In [2]:
# Index that the queries in this notebook run against.
es_index = "fscrawler-mount"

# Client for the Elasticsearch node at the URL below.
es = Elasticsearch(
    ['http://eep16.fcr-it.top:9200'],
)

In [3]:
# Free-text query; used as the `like` text of the more_like_this search.
search_term = 'hsbc exchange rate forecast g10'

In [51]:
# NOTE: highlighting does not work well with this query yet;
# still under investigation.

# Match documents whose `content` field resembles the search term.
mlt_query = {
    "more_like_this": {
        "fields": ["content"],
        "like": search_term,
        "min_term_freq": 1,
        "max_query_terms": 20,
    }
}

# Request highlighted fragments (wrapped in <b>...</b>) for every field.
highlight_options = {
    "type": "unified",
    "number_of_fragments": 3,
    "require_field_match": False,
    "pre_tags": ["<b>"],
    "post_tags": ["</b>"],
    "fields": {"*": {}},
}

# Run the search, asking for at most 20 hits.
res = es.search(
    index=es_index,
    size=20,
    body={"query": mlt_query, "highlight": highlight_options},
)

In [52]:
# Inspect the raw list of hits exactly as Elasticsearch returned it
res["hits"]["hits"]

[{'_id': 'eb6d8d2be131e9fbbc27e386847ec56c',
  '_index': 'fscrawler-mount',
  '_score': 12.431005,
  '_source': {'content': '\n \n\nDisclosures & Disclaimer \n\nThis report must be read with the disclosures and the analyst certifications in \n\nthe Disclosure appendix, and with the Disclaimer, which forms part of it. \n\n  \n\n \n\nIssuer of report: HSBC Securities and Capital \nMarkets (India) Private Limited \n\nView HSBC Global Research at: \n\nhttps://www.research.hsbc.com \n\n\uf061\uf062\uf063\uf020\n\uf020\n\n \n\n\uf034 First half results were strong, but performance in Egypt was \n\nstronger than the headline numbers indicate  \n\n\uf034 Although small, new investments – entry into Nigeria and \n\nplanned entry into Egyptian radiology are positive  \n\n\uf034 Maintain target price of USD5.60 and Buy rating \n\nStronger than it looks: IDH reported revenue growth of 26%, 60bps margin gains and \n\n39% earnings growth in H1 18. However, excluding the impact of recently acquired \

In [48]:
# Convert the raw hits into a DataFrame: one row per hit.
df = pd.DataFrame(res['hits']['hits'])

# Expand the `_source` dicts into top-level columns (content, meta, file, path, ...).
# Building one frame from the list of dicts replaces the slower row-by-row
# `.apply(pd.Series)`. The guard keeps the cell from raising KeyError when the
# search returned no hits (the frame then has no `_source` column at all).
if '_source' in df.columns:
    source_df = pd.DataFrame(df['_source'].tolist(), index=df.index)
    df = pd.concat([df.drop(['_source'], axis=1), source_df], axis=1)

# `meta`, `path` and `file` still hold nested dicts; they can be expanded
# the same way if their individual fields are needed.
df

Unnamed: 0,_id,_index,_score,_type,content,meta,file,path
0,eb6d8d2be131e9fbbc27e386847ec56c,fscrawler-mount,12.431005,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-08-20T01:02:55.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
1,efa78c91cd9e8c2aeb2fda3faec54bf,fscrawler-mount,12.37296,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-10-24T07:31:57.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
2,562a54f521ea6b5b43ea3f4492e7330,fscrawler-mount,11.687666,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-08-20T01:02:54.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
3,fc562f9510a6aaabbc343b7e312ffc78,fscrawler-mount,10.996032,_doc,\nTrade recommendation update\nTake profit on ...,"{'date': '2018-07-03T07:42:24.000+0000', 'form...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
4,f721ede84e1c6e90934ce7aa1b6f1ae,fscrawler-mount,1.88374,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-10-24T07:32:42.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
5,c5d06796845f6f6a20eb495345ce4a46,fscrawler-mount,1.843428,_doc,\n \n\nDisclaimer & Disclosures \n\nThis repor...,"{'date': '2018-10-24T07:36:34.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
6,5ef1a33cbb553f46c6a7e3a7a47290c7,fscrawler-mount,1.708918,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-10-24T07:33:06.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
7,e3c3a2d1164b1e367ab724ddb26e71a,fscrawler-mount,1.424201,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-10-24T07:31:34.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
8,d12f116ef936fd1c4c812516630dfab,fscrawler-mount,1.304784,_doc,\n \n\nDisclosures & Disclaimer \n\nThis repor...,"{'date': '2018-10-24T07:34:10.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."
9,b5686cc9e9895b35a9d95455ef48a7,fscrawler-mount,0.450699,_doc,\n \n\nDisclaimer & Disclosures \n\nThis repor...,"{'date': '2018-10-24T07:38:07.000+0000', 'lang...","{'extension': 'pdf', 'content_type': 'applicat...","{'root': '7c31416ded04ecb6137ddc062de7dcb', 'v..."


In [None]:
# Check whether the `content` column contains a specific string.
# Possible enhancement: also capture the ~100 characters that follow the match,
# so each row shows a short snippet in the UI instead of the full content.
# (Use df.content.str.contains(pattern) instead for a plain boolean mask.)

# Reference: http://pandas.pydata.org/pandas-docs/stable/text.html#extracting-substrings

pattern = 'Disclaimer'

# str.find gives the character offset of the first occurrence (-1 when absent).
match_position = df.content.str.find(pattern)

# str.extract treats the pattern as a regular expression and requires at least
# one capture group (it raises ValueError otherwise), so wrap it in parentheses.
matched_text = df.content.str.extract('(' + pattern + ')', expand=False)

# Show both results side by side; previously the `find` result was discarded.
pd.DataFrame({'position': match_position, 'match': matched_text})