## Sample Python / ElasticSearch integration
Source: http://blog.qbox.io/building-an-elasticsearch-index-with-python

Basic variables/settings

In [1]:
# CSV file that is downloaded and indexed.
FILE_URL = 'http://apps.sloanahrens.com/qbox-blog-resources/kaggle-titanic-data/test.csv'

# Connection settings handed to the Elasticsearch client.
ES_HOST = dict(host="localhost", port=9200)

# Target index and document type for every indexed row.
INDEX_NAME = "titanic"
TYPE_NAME = "passenger"

# Lower-cased CSV column whose value is used as the document id.
ID_FIELD = "passengerid"

Load the data

In [2]:
import csv
import urllib2

# Open the remote CSV and wrap the HTTP response in a csv reader.
# The reader is lazy: only the header row is consumed here, the data rows
# are read when csv_file_object is iterated later on, so the response is
# deliberately left open.
response = urllib2.urlopen(FILE_URL)
csv_file_object = csv.reader(response)

# The first row holds the column names. Use the builtin next() rather than
# the Python-2-only .next() method, and lower-case the names so they can be
# looked up with lower-case keys such as ID_FIELD.
header = [item.lower() for item in next(csv_file_object)]

Build the bulk payload: one action dictionary plus one document dictionary per CSV row

In [3]:
# Flat list of alternating entries: an "index" action dictionary followed by
# the document dictionary it applies to.
bulk_data = []

for row in csv_file_object:
    # csv.reader yields an empty list for a blank line; such a row has no
    # ID_FIELD value, so skip it instead of failing with a KeyError below.
    if not row:
        continue

    # Pair every cell with its column name by position. Indexing header
    # directly keeps the IndexError for rows with more cells than columns.
    data_dict = {header[i]: value for i, value in enumerate(row)}

    op_dict = {
        "index": {
            "_index": INDEX_NAME,
            "_type": TYPE_NAME,
            "_id": data_dict[ID_FIELD]
        }
    }
    bulk_data.append(op_dict)
    bulk_data.append(data_dict)

Create the index using the Python ES client (an existing index with the same name is deleted first)

In [4]:
from elasticsearch import Elasticsearch

# create ES client, create index
es = Elasticsearch(hosts = [ES_HOST])

# Drop an index left over from an earlier run so the create call below
# starts from a clean slate. The index name is passed by keyword, like in
# every other client call here.
if es.indices.exists(index = INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index = INDEX_NAME)
    print(" response: '%s'" % (res))

# since we are running locally, use one shard and no replicas
request_body = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}

print("creating '%s' index..." % (INDEX_NAME))
res = es.indices.create(index = INDEX_NAME, body = request_body)
print(" response: '%s'" % (res))



creating 'titanic' index...
 response: '{u'acknowledged': True}'


perform a bulk index on the data

In [5]:
# bulk index the data
print("bulk indexing...")
# refresh = True asks Elasticsearch to refresh the index as part of the
# request, so the documents are searchable by the following cells.
res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True)

# The bulk API reports per-document failures inside the response body
# ("errors" flag plus an "error" entry on the affected items) instead of
# raising, so check it explicitly rather than ignoring the response.
if res.get("errors"):
    failed_count = sum(
        1
        for item in res.get("items", [])
        for result in item.values()
        if "error" in result
    )
    print(" WARNING: %d bulk item(s) failed to index" % (failed_count,))

bulk indexing...


run a simple match_all query to ensure that everything is in order

In [6]:
# Sanity check: ask for two documents with a query that matches everything
# and dump the raw response.
match_all_query = {"query": {"match_all": {}}}
res = es.search(index=INDEX_NAME, size=2, body=match_all_query)
print(" response: '%s'" % (res,))

 response: '{u'hits': {u'hits': [{u'_score': 1.0, u'_type': u'passenger', u'_id': u'892', u'_source': {u'fare': u'7.8292', u'name': u'Kelly, Mr. James', u'embarked': u'Q', u'age': u'34.5', u'parch': u'0', u'pclass': u'3', u'sex': u'male', u'sibsp': u'0', u'passengerid': u'892', u'ticket': u'330911', u'cabin': u''}, u'_index': u'titanic'}, {u'_score': 1.0, u'_type': u'passenger', u'_id': u'893', u'_source': {u'fare': u'7', u'name': u'Wilkes, Mrs. James (Ellen Needs)', u'embarked': u'S', u'age': u'47', u'parch': u'0', u'pclass': u'3', u'sex': u'female', u'sibsp': u'1', u'passengerid': u'893', u'ticket': u'363272', u'cabin': u''}, u'_index': u'titanic'}], u'total': 418, u'max_score': 1.0}, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}, u'took': 6, u'timed_out': False}'


Print only the `_source` document of each hit

In [7]:
# Show just the stored document of every hit from the last search.
print("results:")
matched_docs = res['hits']['hits']
for doc in matched_docs:
    print(doc["_source"])

results:
{u'fare': u'7.8292', u'name': u'Kelly, Mr. James', u'embarked': u'Q', u'age': u'34.5', u'parch': u'0', u'pclass': u'3', u'sex': u'male', u'sibsp': u'0', u'passengerid': u'892', u'ticket': u'330911', u'cabin': u''}
{u'fare': u'7', u'name': u'Wilkes, Mrs. James (Ellen Needs)', u'embarked': u'S', u'age': u'47', u'parch': u'0', u'pclass': u'3', u'sex': u'female', u'sibsp': u'1', u'passengerid': u'893', u'ticket': u'363272', u'cabin': u''}
