## Elasticsearch Indexing  

This notebook shows you how to create an Elasticsearch connection, how to create an index, and then how to index your data.

### 1. Install Elasticsearch library

In [1]:
!pip3 install elasticsearch===7.10.1

Looking in indexes: https://abhilasha.mangal%40ibm.com:****@na.artifactory.swg-devops.com/artifactory/api/pypi/wcp-nlp-pypi-virtual/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


<span style="color:blueviolet">Step 1. Provide Elastic user name and password</span>

In [13]:
import os

# Credentials for the Elasticsearch deployment. Set the ELASTIC_USERNAME and
# ELASTIC_PASSWORD environment variables so that real secrets are never saved inside
# the notebook; the placeholders below are used only when the variables are not set.
username = os.environ.get('ELASTIC_USERNAME', 'Add user name')
password = os.environ.get('ELASTIC_PASSWORD', 'Add password here')

### 2. Import libraries

In [14]:
import os
import tika
tika.initVM()
from tika import parser  
import re
from datetime import date
import pandas as pd
import json
from datetime import datetime
import requests
import PyPDF2

from pathlib import Path
from dateutil.parser import parse

### 3. Elasticsearch Connection
<span style="color:blueviolet">Step 2. Provide user name and password and Elasticsearch url for connection</span>

In [15]:
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError

# Connection settings -- edit these two values for your own deployment.
ES_URL = 'https://3d862675-f715-499c-b9e4-ffba4d8321a0.2adb0220806343e3ae11df79c89b377f.databases.appdomain.cloud:32062'
# CA certificate used to verify the cluster's TLS certificate.
ES_CA_CERT_PATH = '/Users/abhilashamangal/Downloads/5cb6eb86-ae1c-11e9-99c9-6a007ab2fc0b'

# Create an instance of Elasticsearch with TLS options.
# The credentials come from the `username` / `password` variables set in Step 1 and are
# passed through `http_auth` instead of being embedded in the URL, so characters such as
# '@' or ':' in a password do not need URL-encoding.
es_client = Elasticsearch(
    ES_URL,
    http_auth=(username, password),
    ca_certs=ES_CA_CERT_PATH,
)
info = es_client.info()
print(info)

# Index written to by this notebook, and the explicit field mapping for its documents.
index_name = 'superknowa'
index_mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "text"},
            "published_source": {"type": "text"},
            "content": {"type": "text"},
            "url": {"type": "text"},
            "keywords": {"type": "text"},
            "categories": {"type": "text"},
            "publish_date": {"type": "date"},
            "last_update_date": {"type": "date"},
            "indexing_date": {"type": "date"},
        }
    }
}
try:
    # The mapping has to be sent as the request body; without it the index is created
    # empty and the field types above are never applied.
    es_client.indices.create(index=index_name, body=index_mapping)
    print(f"Index '{index_name}' created successfully.")
except RequestError as e:
    # An index that already exists is reported but is not treated as a failure.
    if e.error == 'resource_already_exists_exception':
        print(f"Index '{index_name}' already exists.")
    else:
        print(f"An error occurred while creating index '{index_name}': {e}")


<span style="color:blueviolet">Step 3. Checking Elasticsearch Configuration</span>

In [5]:
# Ask the cluster for its basic metadata (node name, cluster name, version) and print it;
# a successful response confirms that the connection and credentials work.
info = es_client.info()
print(info)


{'name': 'm-2.3d862675-f715-499c-b9e4-ffba4d8321a0.d5c42fad68fd498ba08f6af6107b71cd.2adb0220806343e3ae11df79c89b377f.databases.appdomain.cloud', 'cluster_name': '3d862675-f715-499c-b9e4-ffba4d8321a0', 'cluster_uuid': 'UsM9ak-LRYajVAwt5yeQxw', 'version': {'number': '7.10.2', 'build_flavor': 'oss', 'build_type': 'tar', 'build_hash': '747e1cc71def077253878a59143c1f785afa92b9', 'build_date': '2021-01-13T00:42:12.435326Z', 'build_snapshot': False, 'lucene_version': '8.7.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


<span style="color:blueviolet">Step 4. Providing Elasticsearch index name and index mapping</span>

In [6]:
# Name of the index that the indexing cells in this notebook write to.
index_name = 'superknowa'
# Explicit field mapping for the documents: the free-text fields are declared as `text`
# and the three date fields as `date`.
index_mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "text"},
            "published_source": {"type": "text"},
            "content": {"type": "text"},
            "url": {"type": "text"},
            "keywords": {"type": "text"},
            "categories": {"type": "text"},
            "publish_date": {"type": "date"},
            "last_update_date": {"type": "date"},
            "indexing_date": {"type": "date"}
            
        }
    }
}

<span style="color:blueviolet">Step 5. Creating Elasticsearch index.</span>

In [7]:
# Test the connection and create an index
try:
    # Send `index_mapping` as the request body; without it the index is created empty
    # and the declared text/date field types are never applied.
    es_client.indices.create(index=index_name, body=index_mapping)
    print(f"Index '{index_name}' created successfully.")
except RequestError as e:
    # An index that already exists is reported but is not treated as a failure.
    if e.error == 'resource_already_exists_exception':
        print(f"Index '{index_name}' already exists.")
    else:
        print(f"An error occurred while creating index '{index_name}': {e}")


Index 'superknowa' already exists.


<span style="color:blueviolet">Step 6. Processing data</span>

In [7]:
## Extracting data
# Date of this run; the cells below stamp it on documents as `indexing_date` and use it
# as a fallback when a source has no usable date of its own.
today = date.today()
print("Today's date:", today)

def get_all_files(folder_name):
    """Return the paths of the ``.txt`` files directly inside ``folder_name``.

    Args:
        folder_name: Directory to scan (sub-directories are not searched).

    Returns:
        list[str]: One ``"{folder_name}/{file}"`` entry per ``.txt`` file, in the
        order reported by ``os.listdir``.
    """
    # List the folder directly instead of os.chdir()-ing into it: changing the working
    # directory is a process-wide side effect that breaks every relative path used
    # afterwards in the notebook.
    return [
        f"{folder_name}/{file}"
        for file in os.listdir(folder_name)
        # Match on the extension, not on ".txt" appearing anywhere in the name.
        if file.endswith(".txt")
    ]

Today's date: 2023-07-11


In [8]:
def pre_processingtext(text_data):
    """Remove HTML markup and a few leftover fragments from ``text_data``.

    The patterns below are deleted one after another, each pass working on the
    result of the previous one.

    Args:
        text_data: Raw text, possibly containing HTML tags.

    Returns:
        str: The text with every match of every pattern removed.
    """
    patterns = (
        "</?p[^>]*>",
        "</?a[^>]*>",
        # `h*` means "zero or more h", so this pattern matches any `<...>` tag.
        "</?h*[^>]*>",
        "</?em*[^>]*>",
        "</?img*[^>]*>",
        "&amp;",
        "id=*>;",
    )
    cleaned = text_data
    for pattern in patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

In [11]:
def readdata_frompdf(file_name):
    """Extract the text of every page of a PDF file.

    Args:
        file_name: Path of the PDF to read.

    Returns:
        str: The page texts concatenated, each preceded by a single space. When the
        file cannot be opened or parsed, the text collected before the failure
        (possibly the empty string) is returned instead of raising.
    """
    content = ''
    try:
        # The context manager closes the file even when parsing fails part-way.
        with open(file_name, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                # `or ""` guards against a page that yields no text at all.
                content = content + " " + (page.extract_text() or "")
    except Exception as e:
        # Best effort: report the real cause (missing file, corrupt PDF, ...) and fall
        # through so the caller still receives whatever was extracted.
        print(f"Could not read PDF '{file_name}': {e}")
    return content

In [16]:
### Developer data

In [9]:
# Locations of the scraped input data; change these to match your own directories.
# Folder listed for the IBM Developer `.txt` files.
ibm_docs_dir = 'Scraper/scrape_data/ibm_developer_metadata/'
# Folder with the white-paper JSON metadata and the attachment files.
white_paper_docs_dir ='Scraper/scrape_data/white_paper_metadata/data-2/'
# Folder with the Redbooks metadata files and the PDFs they reference.
redbooks_data_dir ='Scraper/scrape_data/redbooks_data/'
# NOTE(review): not referenced by any cell visible in this notebook.
ibm_cloud_docs_dir ='Scraper/scrape_data/ibm_cloud_docs_process_metdata_new/'
# Metadata file read line by line in the "IBM Developer docs" section.
ibm_cloud_metadta_file ='Scraper/scrape_data/ibm_cloud_docs_metadata5.txt'
# Medium blog: article text files and the CSV files describing them.
ibm_medium_blog ='Scraper/scrape_data/Medium/text'
ibm_medium_blog_csv ='Scraper/scrape_data/Medium/csv'

## 1. IBM Developer

In [10]:
def index_ibm_developerdata(file_path,esdocs):
    """Parse one scraped IBM Developer ``.txt`` file and append it to ``esdocs``.

    The file must contain these labelled fields in this order: ``URL:``,
    ``updated_date:``, ``publish_date:``, ``title:``, ``sub_title:`` (optional),
    ``categories:`` and finally ``content:``. The fields are peeled off from the
    end of the file towards the start.

    Args:
        file_path: Path of the file; paths that do not contain ``.txt`` are ignored.
        esdocs: List that receives the new document (mutated in place).

    Returns:
        list: The same ``esdocs`` list.

    Raises:
        IndexError: If a required field label is missing from the file.
        ValueError: If a date is not in ``%Y-%m-%dT%H:%M:%S`` format.
    """
    if ".txt" not in file_path:
        return esdocs

    with open(file_path, 'r', encoding="latin1") as file:
        raw = file.read()
    print(len(raw), file_path)

    # maxsplit=1: the article body may itself contain "content:", and must not be
    # truncated at that point.
    content_value = raw.split("content:", 1)
    content = pre_processingtext(content_value[1])

    categories_val = content_value[0].split("categories:")
    categories = categories_val[1].strip()

    sub_title_parts = categories_val[0].split("sub_title:")
    # Store only the sub-title text as keywords (previously the repr of the whole
    # split list, i.e. every header field, was stored).
    keywords = sub_title_parts[1].strip() if len(sub_title_parts) > 1 else ""

    title_val = sub_title_parts[0].split("title:")
    title = title_val[1].strip()

    pd_val = title_val[0].split("publish_date:")
    ld_val = pd_val[0].split("updated_date:")
    urls = ld_val[0].split("URL:")
    # Strip the slug: whitespace or a trailing newline would produce an invalid URL.
    url = "https://developer.ibm.com/blogs/" + urls[1].strip()

    # Reduce both timestamps to plain dates.
    publish_date = datetime.strptime(
        pd_val[1].replace("\n", "").strip(), "%Y-%m-%dT%H:%M:%S"
    ).date()
    updated_date = datetime.strptime(
        ld_val[1].replace("\n", "").strip(), "%Y-%m-%dT%H:%M:%S"
    ).date()

    esdocs.append({
        "id": title,
        "published_source": "IBM Developer",
        "publish_date": str(publish_date),
        "last_update_date": str(updated_date),
        "indexing_date": str(today),
        "content": content,
        "url": url,
        "keywords": keywords,
        "categories": categories,
    })
    return esdocs

## 2. Redbooks

In [None]:
### Red Books data
esdocs =[]
# Directory entries before this position are skipped. NOTE(review): presumably a resume
# point left over from an interrupted run -- set it to 0 to index everything.
REDBOOKS_START_INDEX = 1200

for i, filename in enumerate(os.listdir(redbooks_data_dir)):
    print("processing i---",i)
    if i < REDBOOKS_START_INDEX or not filename.endswith(".json"):
        continue

    # Each metadata file holds four lines: URL, updated_date, publish_date, file_name.
    with open(os.path.join(redbooks_data_dir, filename), 'r', encoding="latin1") as file:
        content_list = file.read().split("\n")
    if len(content_list) < 4:
        print("skipping malformed metadata file", filename)
        continue

    source ="Redbooks"
    url = content_list[0].replace("URL: ","")
    updated_date = content_list[1].replace("updated_date: ","").strip()
    publish_date = content_list[2].replace("publish_date: ","").strip()
    file_name = content_list[3].replace("file_name: ","").strip()

    # An empty field becomes None so that the fallbacks below actually apply (an
    # empty string would otherwise be sent to Elasticsearch as a date).
    publish_date = datetime.strptime(publish_date,"%Y-%m-%d").date() if publish_date else None
    updated_date = datetime.strptime(updated_date,"%Y-%m-%d").date() if updated_date else None
    if updated_date is None:
        updated_date = publish_date
    if publish_date is None:
        updated_date = today
        publish_date = today
    indexing_date = today

    print(file_name)
    extracted_text = readdata_frompdf(redbooks_data_dir+file_name)
    if not extracted_text:
        # Nothing could be extracted from the PDF: do not index an empty document.
        print("no text extracted, skipping", file_name)
        continue

    document = {
        "id": file_name.replace(".pdf",""),
        "published_source": source,
        "publish_date": str(publish_date),
        # The update date from the metadata (previously the indexing date was stored).
        "last_update_date": str(updated_date),
        "indexing_date": str(indexing_date),
        "content": extracted_text,
        "url": url,
        "keywords": "",
        "categories": "",
    }
    response = es_client.index(index=index_name, body=document)
    print(response)

## 3. White Paper

In [45]:
### White Paper Indexing 
esdocs =[]

def _load_whitepaper_attachment(att):
    """Return the text of one attachment, or None when the file cannot be obtained.

    The file is looked up in ``white_paper_docs_dir`` by the basename of ``att``;
    when it is not there, a download is attempted first.
    """
    file_name = os.path.basename(att)
    print(file_name)
    file_path = white_paper_docs_dir+file_name
    if not Path(file_path).is_file():
        print("file_name",file_name)
        try:
            # NOTE(review): downloadFile is not defined in any cell of this notebook;
            # it must be provided elsewhere, otherwise this raises NameError and the
            # attachment is reported as unavailable.
            downloadFile(att, file_path)
        except Exception:
            print("file in not available")
            return None
        if not Path(file_path).is_file():
            return ''
    return readdata_frompdf(file_path)


def indexwhitepaerdata(esdocs):
    """Build one document per white-paper attachment and append them to ``esdocs``.

    Every ``.json`` file in ``white_paper_docs_dir`` is read as metadata with the
    keys ``publish_date``, ``url``, ``attachment`` and ``title``. Attachments for
    which no text could be extracted are skipped.

    Args:
        esdocs: List that receives the documents (mutated in place).

    Returns:
        list: The same ``esdocs`` list.
    """
    source ="IBM White Paper"
    for i, filename in enumerate(os.listdir(white_paper_docs_dir)):
        print("processing i---",i)
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(white_paper_docs_dir, filename), 'r', encoding="latin1") as file:
            json_object = json.load(file)

        publish_date = json_object['publish_date']
        # Keep the parsed date (it used to be parsed and then overwritten with today).
        publish_date = parse(publish_date).date() if publish_date is not None else today
        updated_date = publish_date
        indexing_date = today
        url = json_object['url']
        # assumes 'attachment' is a list of attachment URLs -- TODO confirm
        attachment = json_object['attachment']
        title = json_object['title']

        for j, att in enumerate(attachment, start=1):
            print("Value",att)
            extracted_text = _load_whitepaper_attachment(att)
            if extracted_text is None:
                # Same as before: give up on this paper's remaining attachments.
                break
            if not extracted_text:
                continue
            # A single attachment keeps the plain title; several attachments get a
            # per-attachment suffix so that their ids differ.
            doc_id = title if len(attachment) == 1 else f"{title}_{j}"
            esdocs.append({
                "id": doc_id,
                "published_source": source,
                "publish_date": str(publish_date),
                "last_update_date": str(updated_date),
                "indexing_date": str(indexing_date),
                "content": extracted_text,
                "url": url,
                "keywords": "",
                "categories": "",
            })
    return esdocs

In [None]:
# Build the white-paper documents; the function appends to and returns the list it is given.
esdocs =indexwhitepaerdata(esdocs)

In [48]:
# Number of documents prepared for indexing.
len(esdocs)

496

In [None]:
# Documents at positions 0..RESUME_AFTER are not sent. NOTE(review): presumably a resume
# point left over from an interrupted run -- set it to -1 to index every document.
RESUME_AFTER = 226
for i, document in enumerate(esdocs):
    # Add the document to the index
    print("index data----",i)
    if i <= RESUME_AFTER:
        continue
    try:
        response = es_client.index(index=index_name, body=document)
        print(response)
    except Exception as e:
        # Report the failure and carry on, so one rejected document does not abort
        # the whole run (same policy as the other indexing loops in this notebook).
        print(e)
   

In [12]:
### Index ibm developer data 
esdocs =[]
file_path_list = get_all_files(ibm_docs_dir)
# Parse every listed file into a document. Without this step `esdocs` stays empty on a
# fresh "Restart & Run All" and the indexing cell below has nothing to send.
for file_path in file_path_list:
    esdocs = index_ibm_developerdata(file_path, esdocs)

In [13]:
# Number of IBM Developer files found.
print(len(file_path_list))

1472


In [16]:
# Number of documents prepared for indexing.
print(len(esdocs))

1472


### Indexing elastic documents

In [None]:
for document in esdocs:
    # Add the document to the index
    try:
        response = es_client.index(index=index_name, body=document)
        print(response)
    except Exception as e:
        # Report the failure and carry on, so one rejected document does not abort
        # the whole run (same policy as the other indexing loops in this notebook).
        print(e)

## 4. IBM Developer docs

In [None]:
esdocs =[]
source ="IBM Developer docs"

# The metadata file has one record per line; each record names a text file, its URL
# and a metadata string that may contain "lastupdated:" and "keywords:" entries.
with open(ibm_cloud_metadta_file, 'r', encoding="latin1") as file:
    content_list = file.read().split("\n")
print(len(content_list))

for i, line in enumerate(content_list):
    print("process---",i)
    file_names = line.split("file name")
    if len(file_names) <= 1:
        continue

    url_data = file_names[1].split("Url")
    if len(url_data) < 2:
        print("skipping record without Url", i)
        continue
    file_name = url_data[0].replace("\": ","").replace(",\"","").replace("\"","")

    metadata = url_data[1].split("metadata")
    if len(metadata) < 2:
        print("skipping record without metadata", i)
        continue
    url = metadata[0].replace("\": ","").replace(", \"","")

    # Both values default to "empty", so a record without "lastupdated:" neither
    # raises NameError nor reuses the values of the previous record.
    keywords = ""
    last_updated_text = ""
    lastupdated = metadata[1].split("lastupdated:")
    if len(lastupdated) > 1:
        last_updated_data = lastupdated[1].split("keywords:")
        last_updated_text = last_updated_data[0]
        if len(last_updated_data) > 1:
            keywords = last_updated_data[1].replace("\"}","")

    last_updated = last_updated_text.replace("\"","").strip().replace(" subcollection: assistant }","").split(" ")[0]
    if len(last_updated) < 6:
        # Too short to be a date: fall back to the date of this run.
        last_updated = today
    else:
        if '/' in last_updated:
            last_updated = last_updated.replace("/","-")
        # NOTE(review): the intent of this rewrite is unclear from the notebook
        # (possibly a b'...' bytes repr); it is kept exactly as it was.
        if 'b' in last_updated:
            last_updated = last_updated.replace("b","-")
            last_updated = last_updated.replace("'","-")

    print("filename",file_name)
    try:
        with open(file_name, 'r', encoding="latin1") as doc_file:
            extracted_text = doc_file.read()
    except OSError as e:
        # A missing or unreadable text file skips this record, not the whole run.
        print(e)
        continue

    file_name_val = file_name.replace("ibm_cloud_docs_process_metdata_new/","")
    esdocs.append({
        "id": file_name_val.replace(".txt",""),
        "published_source": source,
        "publish_date": str(last_updated),
        "last_update_date": str(last_updated),
        "indexing_date": str(today),
        "content": extracted_text,
        "url": url,
        "keywords": keywords,
        "categories": "",
    })
            

In [57]:
# Number of documents prepared for indexing.
print(len(esdocs))

7740


In [None]:
# Send every prepared document to the index; a failure is printed and the loop moves
# on to the next document.
for document in esdocs:
    try:
        response = es_client.index(index=index_name, body=document)
    except Exception as err:
        print(err)
    else:
        print(response)

## 5. IBM Medium Blog

In [None]:
esdocs =[]
file_path_list_medium = get_all_files(ibm_medium_blog)
print(type(file_path_list_medium))
# Set of the available article files, for constant-time membership tests.
medium_text_files = set(file_path_list_medium)
source = "Medium"
categories=''

for filename in os.listdir(ibm_medium_blog_csv):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(ibm_medium_blog_csv, filename)
    # The files are tab-separated; pandas opens the file itself.
    df = pd.read_csv(file_path, sep='\t')
    for ind in df.index:
        title = df['title'][ind]
        if pd.isna(title):
            # Without a title there is neither an id nor an article file to look up.
            continue
        title = str(title)
        sub_title = df['subtitle'][ind]
        url = df['story_url'][ind]
        url = "" if pd.isna(url) else str(url)
        publish_date = today
        updated_date = today
        indexing_date = today

        # Reset for every row so that an article without a text file does not
        # inherit the body of the previous article.
        content = ''
        file_name = str(ibm_medium_blog+"/"+title+".txt")
        print(file_name)
        if file_name in medium_text_files:
            with open(file_name, 'r', encoding="utf-8") as file1:
                content = file1.read()

        esdocs.append({
            "id": title,
            "published_source": source,
            "publish_date": str(publish_date),
            "last_update_date": str(updated_date),
            "indexing_date": str(indexing_date),
            "content": content,
            "url": url,
            "keywords": str(sub_title),
            "categories": str(categories),
        })
        # Running count of the prepared documents (`solrdocs` does not exist here).
        print(len(esdocs))

In [None]:
for document in esdocs:
    # Add the document to the index
    try:
        response = es_client.index(index=index_name, body=document)
        print(response)
    except Exception as e:
        # Report the failure and carry on, so one rejected document does not abort
        # the whole run (same policy as the other indexing loops in this notebook).
        print(e)
    

## 6. IBM Product Docs

In [7]:
# Folder holding the scraped IBM product documentation files.
ibm_product_file_folder= "/new-scrape_metadata/"
# NOTE(review): same value as above and not referenced by any cell visible in this notebook.
ibm_product_file_folder_inner= "/new-scrape_metadata/"
# Folder used to build the second ("new") path list returned by extarct_all_files.
corpus_file_folder_output ="/new-scrape_metadata1/"
import os

In [8]:
def extarct_all_files(corpus_file_folder,corpus_file_folder_output):
    """List every entry of ``corpus_file_folder`` as an input path and an output path.

    Args:
        corpus_file_folder: Directory to list (not recursive, no filtering).
        corpus_file_folder_output: Directory prefix for the second list.

    Returns:
        tuple[list[str], list[str]]: ``"{corpus_file_folder}/{name}"`` and
        ``"{corpus_file_folder_output}/{name}"`` for each entry, in matching order.
    """
    file_path_list = []
    file_path_list_new = []
    # List the folder directly instead of os.chdir()-ing into it: changing the working
    # directory is a process-wide side effect that breaks relative paths used later.
    for file in os.listdir(corpus_file_folder):
        print(file)
        file_path_list.append(f"{corpus_file_folder}/{file}")
        file_path_list_new.append(f"{corpus_file_folder_output}/{file}")
    return file_path_list, file_path_list_new

In [None]:
# List the product-doc folder: input paths and the matching paths under the output folder.
file_path_list ,file_path_list_new =extarct_all_files(ibm_product_file_folder,corpus_file_folder_output)

In [10]:
def product_clean_text(text_data_array):
    """Join the useful lines of a scraped page into one text.

    A line is dropped when it is empty, when it is shorter than 10 characters and
    starts with an upper-case letter, or when it contains ``URL``, ``Copyright`` or
    ``Reference``. Links (``http...``) are removed from the lines that are kept.

    Args:
        text_data_array: The page as a list of lines.

    Returns:
        str: The kept lines, each one preceded by a newline.
    """
    def _is_noise(line):
        # True for the lines the cleaner throws away (the line is never empty here).
        return (
            (len(line) < 10 and line[0].isupper())
            or 'URL' in line
            or 'Copyright' in line
            or 'Reference' in line
        )

    kept = [
        "\n" + re.sub(r"http\S+", "", line)
        for line in text_data_array
        if len(line) > 0 and not _is_noise(line)
    ]
    return "".join(kept)
            

In [11]:
from datetime import datetime
from datetime import date
## Extracting data
# Date of this run; the product-doc cells below use it as `indexing_date` and as a
# fallback date.
today = date.today()
print("Today's date:", today)

Today's date: 2023-07-25


In [12]:
def index_ibm_product_data(file_path_list):
    """Build one Elasticsearch document per IBM product-doc text file.

    The first line of a file is parsed as ``<url> <title>Last Updated:<date>``; when
    the ``Last Updated:`` marker is missing, the url and title stay empty and the
    dates fall back to today.

    Args:
        file_path_list: Paths of the files to read; directories are skipped.

    Returns:
        list[dict]: The prepared documents.
    """
    source = "IBM Product Doc whole"
    categories = ''
    esdocs = []
    for i, file_name in enumerate(file_path_list):
        # Skip the HTML sub-folder (handled by the inner-pages cells). The literal is
        # kept for backward compatibility; the isdir() test also covers the path
        # spellings produced by extarct_all_files.
        if file_name == 'new-scrape_metadata/HTML' or os.path.isdir(file_name):
            continue
        print("processing -----",i)
        # NOTE(review): read_text_file is not defined in any cell of this notebook; it
        # must be provided elsewhere before this function is called.
        text_data = read_text_file(file_name)
        text_data_array = text_data.split("\n")

        url = ""
        title = ""
        last_updated = ""
        data = text_data_array[0].split("Last Updated:")
        if len(data) > 1:
            url_data = data[0].split(" ",1)
            last_updated = data[1].strip()
            url = url_data[0]
            # The line may hold a URL only, without a title after it.
            title = url_data[1] if len(url_data) > 1 else ""

        content = product_clean_text(text_data_array)
        indexing_date = today
        if len(last_updated) > 1:
            publish_date = last_updated
        else:
            publish_date = today
            last_updated = today

        esdocs.append({
            "id": title,
            "published_source": source,
            # The publish date derived above (previously the indexing date was stored
            # and the computed value was thrown away).
            "publish_date": str(publish_date),
            "last_update_date": str(last_updated),
            "indexing_date": str(indexing_date),
            "content": content,
            "url": url,
            "keywords": str(title),
            "categories": str(categories),
        })
    return esdocs

In [None]:
# Build the product-doc documents from the file list created above.
esdocs = index_ibm_product_data(file_path_list)

## 7. IBM Product Docs Inner pages (HTML)

In [14]:
def remove_extra_lines(data):
    """Collapse each run of blank (whitespace-only) lines into a single newline.

    Args:
        data: Text to clean.

    Returns:
        str: ``data`` with every ``newline, optional whitespace, newline`` sequence
        replaced by one newline.
    """
    blank_run = re.compile(r'\n\s*\n', flags=re.MULTILINE)
    return blank_run.sub('\n', data)

def pre_processingtext(text_data):
    """Remove HTML markup from ``text_data``.

    This redefines the function of the same name from an earlier cell for the
    product-doc HTML pages: it additionally removes ``div`` tags and has no
    ``id=`` clean-up step.

    Args:
        text_data: Raw text, possibly containing HTML tags.

    Returns:
        str: The text with every match of the patterns below removed.
    """
    replaced = re.sub("</?p[^>]*>", "", text_data)
    # Continue from `replaced`: substituting on `text_data` again would throw away
    # the result of the previous step.
    replaced = re.sub("</?div[^>]*>", "", replaced)
    replaced = re.sub("</?a[^>]*>", "", replaced)
    # `h*` means "zero or more h", so this pattern matches any `<...>` tag.
    replaced = re.sub("</?h*[^>]*>", "", replaced)
    replaced = re.sub("</?em*[^>]*>", "", replaced)
    replaced = re.sub("</?img*[^>]*>", "", replaced)
    replaced = re.sub("&amp;", "", replaced)
    return replaced

def pre_processing_html(html_data):
    """Strip the markup from ``html_data`` and then collapse the blank lines.

    Args:
        html_data: Raw HTML text.

    Returns:
        str: The result of ``pre_processingtext`` followed by ``remove_extra_lines``.
    """
    return remove_extra_lines(pre_processingtext(html_data))

In [15]:
def index_ibm_product_data_inner(file_path_list):
    """Build one Elasticsearch document per inner (HTML) product-doc page.

    For each file, the URL is the first line with ``URL:`` removed, the title is
    the file path without the ``new-scrape_metadata/HTML`` prefix and the ``.html``
    suffix, and all three dates are today's date.

    Args:
        file_path_list: Paths of the HTML page files to read.

    Returns:
        list[dict]: The prepared documents.
    """
    esdocs = []
    for i, file_name in enumerate(file_path_list):
        print("processing -----",i)
        # NOTE(review): read_text_file is not defined in any cell of this notebook.
        page_lines = read_text_file(file_name).split("\n")
        page_url = page_lines[0].replace("URL:","")
        page_title = file_name.replace("new-scrape_metadata/HTML","").replace(".html","")
        # Drop the boilerplate lines first, then strip the markup and blank lines.
        page_body = pre_processing_html(product_clean_text(page_lines))
        run_date = str(today)
        esdocs.append({
            "id": page_title,
            "published_source": "IBM Product Doc Inner",
            "publish_date": run_date,
            "last_update_date": run_date,
            "indexing_date": run_date,
            "content": page_body,
            "url": page_url,
            "keywords": str(page_title),
            "categories": "",
        })
    return esdocs

In [None]:
# Build the inner-page documents. NOTE(review): `file_path_list` still holds the listing
# created for the product-doc folder above -- confirm it points at the HTML pages.
esdocs = index_ibm_product_data_inner(file_path_list)

In [None]:
# Send every prepared document to the index; a failure is printed and the loop moves
# on to the next document.
for document in esdocs:
    try:
        response = es_client.index(index=index_name, body=document)
    except Exception as err:
        print(err)
    else:
        print(response)

## Delete the index

In [14]:
# Delete the index
# WARNING: this removes the index named by `index_name` together with every document
# stored in it. Run this cell only when you want to start over.
response = es_client.indices.delete(index=index_name)

In [15]:
# Show the cluster's answer to the delete request.
print(response)

{'acknowledged': True}
