In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import configparser

# Load the Elastic Cloud connection details from a local INI file so that no
# credentials are hardcoded in the notebook.
config = configparser.ConfigParser()
# read_file() on an explicitly opened file raises FileNotFoundError when the
# INI file is missing. config.read() would skip a missing file silently and
# the section lookups below would then fail with a confusing KeyError.
with open('foobar.ini') as config_file:
    config.read_file(config_file)
cloud_id = config["cloud-connection"]["cloud_id"]
user = config["cloud-connection"]["user"]  # by default user = "elastic"
password = config["cloud-connection"]["password"]

client = Elasticsearch(
    cloud_id=cloud_id,  # cloud id can be found under deployment management
    basic_auth=(user, password),  # your username and password for connecting to elastic, found under Deployments - Security
)

# Query the cluster once so a bad cloud id or bad credentials fail here,
# not in a later cell; the response is displayed as the cell output.
client.info()

ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': 'fdcc4e10e5a34385884a3eda9350099a', 'cluster_uuid': '1v8os-EZTPmrZoF6uXeWKA', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [27]:
import re
import pandas as pd 

# Load the second movie script (semicolon-separated) and clean every cell:
# each value is stringified, stripped of surrounding whitespace, and every
# character that is not a space, a word character or a literal '+' is removed.
hp_script = (
    pd.read_csv("data/Harry_Potter_2.csv", sep=";")
    .applymap(lambda cell: re.sub(r'[^ \w+]', '', str(cell).strip()))
)
# Keep the original row position as an explicit column so it is indexed too.
hp_script["Line_number"] = hp_script.index
hp_script.head()

Unnamed: 0,Character,Sentence,Line_number
0,HARRY,I cant let you out Hedwig,0
1,HARRY,Im not allowed to use magic outside of school,1
2,HARRY,Besides if Uncle Vernon,2
3,VERNON,Harry Potter,3
4,HARRY,Now youve done it,4


In [30]:
# Index definition for the second movie script.
index = "hp_script_2"
settings = {}
mappings = {
    "_meta": {
        "created_by": "Iulia Feroli"
    },
    "properties": {
        "Line_number": {
            "type": "long"
        },
        # A Python dict literal cannot hold two "type" keys: the later one
        # silently wins, so the previous `"type": "keyword", "type": "text"`
        # produced a plain text field. A multi-field keeps full-text search on
        # `Character` and adds exact matching / aggregations on
        # `Character.keyword`.
        "Character": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword"
                }
            }
        },
        "Sentence": {
            "type": "text"
        }
    }
}

# Creates the index; the API response is displayed as the cell output.
client.indices.create(index=index, settings=settings, mappings=mappings)

In [3]:
from json import loads
# Serialise the cleaned frame to a JSON string (one object per row), then
# parse it back into a list of plain dicts to use as bulk actions.
docs = hp_script.to_json(orient="records")
hp_script_docs_2 = loads(docs)

# Preview the first five documents.
hp_script_docs_2[:5]

[{'Character': 'HARRY',
  'Sentence': 'I cant let you out Hedwig',
  'Line_number': 0},
 {'Character': 'HARRY',
  'Sentence': 'Im not allowed to use magic outside of school',
  'Line_number': 1},
 {'Character': 'HARRY',
  'Sentence': 'Besides if Uncle Vernon',
  'Line_number': 2},
 {'Character': 'VERNON', 'Sentence': 'Harry Potter', 'Line_number': 3},
 {'Character': 'HARRY', 'Sentence': 'Now youve done it', 'Line_number': 4}]

In [8]:
# Bulk-index the movie 2 documents into the current `index`.
# The documents built above are named `hp_script_docs_2`; the previous
# `hp_script_docs` was never defined and raised NameError on a fresh kernel.
# With stats_only=True the helper reports success/failure counts rather than
# a list of per-document errors.
response = bulk(client=client, index=index, actions=iter(hp_script_docs_2), stats_only=True)

In [5]:
import re
# Load the third movie script and apply the same per-cell cleaning as for the
# other scripts: stringify, strip, and drop every character that is not a
# space, a word character or a literal '+'.
hp_script = (
    pd.read_csv("data/Harry_Potter_3.csv", sep=";")
    .applymap(lambda cell: re.sub(r'[^ \w+]', '', str(cell).strip()))
)
hp_script["Line_number"] = hp_script.index
# This file uses upper-case headers; align them with the mapping's field names.
hp_script = hp_script.rename(columns={'CHARACTER': 'Character', 'SENTENCE': 'Sentence'})

index = "hp_script_3"
# NOTE(review): creating a dedicated index for this script is currently disabled.
#client.indices.create(index=index, settings=settings, mappings=mappings)

# Round-trip through JSON to obtain a list of plain dicts (one per row).
docs = hp_script.to_json(orient="records")
hp_script_docs_3 = loads(docs)
hp_script_docs_3[:5]

# NOTE(review): bulk indexing into the dedicated index is currently disabled.
#response = bulk(client = client, index = index, actions = iter(hp_script_docs_3), stats_only = True )

[{'Character': 'HARRY', 'Sentence': 'Lumos Maxima', 'Line_number': 0},
 {'Character': 'HARRY', 'Sentence': 'Lumos Maxima', 'Line_number': 1},
 {'Character': 'HARRY', 'Sentence': 'Lumos Maxima', 'Line_number': 2},
 {'Character': 'HARRY', 'Sentence': 'Lumos MAXIMA', 'Line_number': 3},
 {'Character': 'AUNT PETUNIA', 'Sentence': 'Harry Harry', 'Line_number': 4}]

In [6]:
# Load the first movie script with the same per-cell cleaning as the others:
# stringify, strip, and drop every character that is not a space, a word
# character or a literal '+'.
hp_script = (
    pd.read_csv("data/Harry_Potter_1.csv", sep=";")
    .applymap(lambda cell: re.sub(r'[^ \w+]', '', str(cell).strip()))
)
hp_script["Line_number"] = hp_script.index

# List of plain dicts for movie 1; the starting point of the combined list.
docs = hp_script.to_json(orient="records")
hp_scripts = loads(docs)

In [7]:
# Combine the documents of all three movies into one list.
# NOTE(review): this cell is not idempotent - running it a second time appends
# the movie 2 and movie 3 documents again.
hp_scripts = [*hp_scripts, *hp_script_docs_2, *hp_script_docs_3]

In [34]:
# Create one combined index for all movie scripts, reusing the `settings` and
# `mappings` currently defined in the kernel, then bulk-load every document.
index = "hp_scripts"
client.indices.create(index=index, settings=settings, mappings=mappings)
response = bulk(client, iter(hp_scripts), index=index, stats_only=True)

In [10]:
# Preview only the first documents instead of dumping the entire combined
# list, which floods the cell output and bloats the saved notebook.
hp_scripts[0:10]

[{'Character': 'Dumbledore',
  'Sentence': 'I shouldve known that you would be here Professor McGonagall',
  'Line_number': 0},
 {'Character': 'McGonagall',
  'Sentence': 'Good evening Professor Dumbledore',
  'Line_number': 1},
 {'Character': 'McGonagall',
  'Sentence': 'Are the rumors true Albus',
  'Line_number': 2},
 {'Character': 'Dumbledore',
  'Sentence': 'Im afraid so professor',
  'Line_number': 3},
 {'Character': 'Dumbledore',
  'Sentence': 'The good and the bad',
  'Line_number': 4},
 {'Character': 'McGonagall', 'Sentence': 'And the boy', 'Line_number': 5},
 {'Character': 'Dumbledore',
  'Sentence': 'Hagrid is bringing him',
  'Line_number': 6},
 {'Character': 'McGonagall',
  'Sentence': 'Do you think it wise to trust Hagrid with something as important as this',
  'Line_number': 7},
 {'Character': 'Dumbledore',
  'Sentence': 'Ah Professor I would trust Hagrid with my life',
  'Line_number': 8},
 {'Character': 'Hagrid',
  'Sentence': 'Professor Dumbledore sir',
  'Line_number

## Importing the books

In [34]:
# Parse the preprocessed book text with '.' as the field separator and no
# header row, then transpose so the '.'-separated fields become rows.
# NOTE(review): presumably the file is a single line of text, which would
# yield one sentence per row - confirm against the data file.
hp_books = pd.read_csv(
    "data/Harry_Potter_all_books_preprocessed.txt",
    sep=".",
    header=None,
).T

In [36]:
# Name the single column 'Sentence' (matching the index mapping) and
# serialise the frame to a JSON string with one object per row.
docs = hp_books.rename(columns={0: 'Sentence'}).to_json(orient="records")

# NOTE(review): `hp_books` is rebound from a DataFrame to a list of dicts here,
# so this cell cannot be re-run without re-running the load cell first.
hp_books = loads(docs)

hp_books[:5]

[{'Sentence': 'THE BOY WHO LIVED Mr and Mrs Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much '},
 {'Sentence': 'They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense '},
 {'Sentence': 'Mr Dursley was the director of a firm called Grunnings which made drills '},
 {'Sentence': 'He was a big beefy man with hardly any neck although he did have a very large mustache '},
 {'Sentence': 'Mrs Dursley was thin and blonde and had nearly twice the usual amount of neck which came in very useful as she spent so much of her time craning over garden fences spying on the neighbors '}]

In [37]:
# Index definition for the books: a single full-text `Sentence` field.
index = "hp_books"
settings = {}
mappings = dict(
    _meta=dict(created_by="Iulia Feroli"),
    properties=dict(
        Sentence=dict(type="text"),
    ),
)

# Creates the index; the API response is displayed as the cell output.
client.indices.create(index=index, settings=settings, mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'hp_books'})

In [38]:
# Bulk-load every book sentence into the current `index`; with stats_only=True
# the helper reports counts rather than per-document error details.
response = bulk(client, iter(hp_books), index=index, stats_only=True)