In [2]:
import os.path  
import re 
from typing import List, Dict, Tuple
import json
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
import hashlib

import requests
import numpy as np

import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# TASK 3: Merging team indexes

In [26]:
cwd = os.getcwd() # get the current working directory                 
PATH_SCRIPT = os.path.abspath(cwd) 
PATH_DIR_RESULTS = os.path.join(PATH_SCRIPT, '..', 'Results', )
PATH_DIR_CRAWLED_DATA = os.path.join(PATH_DIR_RESULTS, "data")
PATH_INLINKS = os.path.join(PATH_DIR_RESULTS, 'links', 'inlinks.json')
PATH_OUTLINKS = os.path.join(PATH_DIR_RESULTS, 'links', 'outlinks.json')
PATH_INLINKS_COUNTS = os.path.join(PATH_DIR_RESULTS, 'links', 'inlinks_counts.json')

In [11]:
# Open the file and read the JSON data into a dictionary
with open(PATH_INLINKS, 'r') as inlink_file:
    inlinks_dict = json.load(inlink_file)

with open(PATH_OUTLINKS, 'r') as outlink_file:
    outlinks_dict = json.load(outlink_file)
    
with open(PATH_INLINKS_COUNTS, 'r') as inlink_count_file:
    inlinks_count_dict = json.load(inlink_count_file)
    

### 1. Save data to docs_mp => Parsing: read all files and parse the doc ID and TEXT

In [12]:
docs_map = {}       # dictionary to store the key: docno, value: parsed text
num_docs = 0        # total number of docs in the docs_map

In [13]:
def extract_title(doc_content: str) -> str:
    head_segments = re.findall(r"<HEAD>\s*(.*?)\s*</HEAD>", doc_content, re.DOTALL)
    return head_segments

def extract_content(doc_content: str) -> str:
    text_segments = re.findall(r"<TEXT>\s*(.*?)\s*</TEXT>", doc_content, re.DOTALL) # re module return a list of matches
    return text_segments
        

""" Open, parse a file and extract only the DOCNO and TEXT information into a map: docs_map.
"""
def parse_file(file_path: str):
    try:
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            content = file.read()
            matches = re.findall(r"<DOCNO>\s*(.*?)\s*</DOCNO>(.*?)</DOC>", content, re.DOTALL) # matches will be a list of tuples
            for docno, doc_content in matches:
                if docno in docs_map:
                    print(f"Duplicate or overwritten DOCNO: {docno}")             
                title = ' '.join(extract_title(doc_content))
                text = ' '.join(extract_content(doc_content))

                # Initialize a dictionary for the current docno if it doesn't exist
                if docno not in docs_map:
                    docs_map[docno] = {'title':'',
                                       'content': '', 
                                       'inlinks': [], 
                                       'outlinks': [],
                                       'inlink_no': 0,
                                       'outlink_no': 0,
                                       'author': ['Jianqing']
                                      }
            
                docs_map[docno]['title'] = title
                docs_map[docno]['content'] = text

                if docno in inlinks_dict:
                    docs_map[docno]['inlinks'] = inlinks_dict[docno]
                    docs_map[docno]['inlink_no'] = inlinks_count_dict[docno]
                
                if docno in outlinks_dict:
                    docs_map[docno]['outlinks'] = outlinks_dict[docno]
                    docs_map[docno]['outlink_no'] = len(outlinks_dict[docno])

    except FileNotFoundError:
        print("Error when loading the data file")

In [14]:
# Get the list of all files and directories
for filename in os.listdir(PATH_DIR_CRAWLED_DATA):
    if filename == ".ipynb_checkpoints":
        continue
    file_path = os.path.join(PATH_DIR_CRAWLED_DATA, filename) # construct unique filepath for each doc in the data dir
    parse_file(file_path) 
print('Parsing completed. Stored in docs_map.\n')

Parsing completed. Stored in docs_map.



In [27]:
# testing: get total num of docs 
num_docs = len(docs_map)
print(f"Expected total number of docs: 30000 \nActual: {num_docs}")

Expected total number of docs: 30000 
Actual: 30000


In [28]:
# testing
print(docs_map['https://en.wikipedia.org/wiki/UEMOA_Tournament'])

# testing
for doc_info in docs_map['https://en.wikipedia.org/wiki/List_of_epidemics']:
    print(f"title: {doc_info['title']}")
    print(f"content: {doc_info['content']}")
    print(f"inlinks: {doc_info['inlinks']}")
    print(f"outlinks: {doc_info['outlinks']}")
    print(f"inlink_no: {doc_info['inlink_no']}")
    print(f"outlink_no: {doc_info['outlink_no']}")

{'title': 'UEMOA Tournament  Wikipedia', 'content': "TheUEMOA Tournamentis a football (soccer) tournament held between nations who are a member of theWest African Economic and Monetary Union(UEMOA). The tournament was first played in2007. It is also calledCoupe de l'intÃ©gration ouest africaine. The teams are made up of players in national leagues of the organisation's member countries, in an effort to promote local talents. The participating nations are: The 2014 edition was cancelled because of an ebola outbreak. The next edition then was only played in 2016.[1][2]", 'inlinks': ['https://en.wikipedia.org/wiki/West_Africa'], 'outlinks': ['https://en.wikipedia.org/wiki/Special:MyContributions', 'https://en.wikipedia.org/wiki/FIFA_Men%27s_World_Ranking', 'https://en.wikipedia.org/wiki/Niger', 'https://en.wikipedia.org/wiki/South_American_Board_of_New_Football_Federations', 'https://en.wikipedia.org/wiki/Am%C3%ADlcar_Cabral_Cup', 'https://en.wikipedia.org/wiki/Help:Introduction', 'https:

TypeError: string indices must be integers, not 'str'

### 2. Connect to ES

In [3]:
INDEX_NAME = 'crawler' 
CLOUD_ID = "6200:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvOjQ0MyRiZTllZjE5NDRkNTg0MDE3YTU0NDg0MzcwYjk5MjQzMSQ2Zjg1ODJhNWRjMGY0NDBhODU1Njk1MDQ4NzMyNmU2Yg=="                  
es = Elasticsearch(request_timeout = 10000, 
                    cloud_id = CLOUD_ID,
                    http_auth = ('elastic', 'fwOhKti7myB3PKFHQavQBhcr'))

print(es.ping())

True


### 2.4 Index Creation -- Already created by teammates

In [19]:
configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "max_result_window": 1000000,
        "analysis": {
            "filter": {},
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text"
            },
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            },
            "inlinks": {
                "type": "keyword"
            },
            "outlinks": {
                "type": "keyword"
            },
            "inlink_no": {
                "type": "integer"
            },
            "outlink_no": {
                "type": "integer"
            },
            "author":{
                "type": "keyword"
            }
        }
    }
}

es.indices.create(index=INDEX_NAME, body=json.dumps(configurations))

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'crawler'}

### 2.5 Merge and Add with Shared Index

In [20]:
# 1. Remove all documents where "Jianqing" is the only author
delete_query = {
    "query": {
        "bool": {
            "must": [
                {
                    "term": {
                        "author": "Jianqing"
                    }
                }
            ],
            "must_not": [
                {
                    "script": {
                        "script": "doc['author'].length > 1"
                    }
                }
            ]
        }
    }
}

es.delete_by_query(index=INDEX_NAME, body=delete_query)

# 2. Remove "Jianqing" from the author field in documents where "Jianqing" is one of the authors
update_query = {
    "script": {
        "source": """
        if (ctx._source.author.contains(params.author)) {
          ctx._source.author.remove(ctx._source.author.indexOf(params.author));
        }
        """,
        "lang": "painless",
        "params": {
            "author": "Jianqing"
        }
    },
    "query": {
        "term": {
            "author": "Jianqing"
        }
    }
}

es.update_by_query(index=INDEX_NAME, body=update_query)


{'took': 3,
 'timed_out': False,
 'total': 0,
 'updated': 0,
 'deleted': 0,
 'batches': 0,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [29]:

# hash the url if too long
def hash_id(long_id):
    return hashlib.sha256(long_id.encode('utf-8')).hexdigest()

def merge(_id, title, content, inlink_no, outlink_no, inlinks, outlinks, author):
    # Check if the _id is too long and hash it if necessary
    if len(_id.encode('utf-8')) > 512:
        _id = hash_id(_id)

    # check if the id already exists
    try:
        doc_exists = es.exists(index=INDEX_NAME, id=_id)
    except Exception as e:
        print(f"An error occurred while checking if the document exists: {e}")
        return
    
    if doc_exists:
        # get the current document
        current_doc = es.get(index=INDEX_NAME, id=_id)['_source']
        
        # merge inlinks and calculate new inlink_no
        current_inlinks = set(current_doc.get('inlinks', []))
        updated_inlinks = current_inlinks.union(set(inlinks))
        
        updated_inlink_no = len(updated_inlinks)
        
        # merge authors
        authors = current_doc.get('author', [])
        if not isinstance(authors, list):
            authors = [authors]
        if author not in authors:
            authors.append(author)
            
        # update the document
        try:
            es.update(index=INDEX_NAME, id=_id, body={
                'doc': {
                    'author': authors,
                    'inlinks': list(updated_inlinks),
                    'inlink_no': updated_inlink_no
                }
            })
        except Exception as e:
            print(f"An error occurred while updating the document: {e}")   
    else:
        # create a new document 
        try:
            es.index(index=INDEX_NAME, 
                     id=_id, 
                     body={
                         'author': [author],
                         'title': title,
                         'content': content,
                         'inlinks': list(inlinks),
                         'outlinks': list(outlinks),
                         'inlink_no': int(inlink_no),
                         'outlink_no': int(outlink_no)
                     })
        except Exception as e:
            print(f"An error occurred while creating the document: {e}")

In [30]:
for docno, doc_info in docs_map.items():
    merge(_id=docno,
          title=doc_info['title'],
          content=doc_info['content'],
          inlink_no=doc_info['inlink_no'],
          outlink_no=doc_info['outlink_no'],  
          inlinks=doc_info['inlinks'],
          outlinks=doc_info['outlinks'],
          author='Jianqing')

In [4]:
try:
    search_results = es.search(
        index=INDEX_NAME,
        body={
            "query": {
                "match": {
                    "author": "Jianqing"
                }
            },
            "track_total_hits": True
        },
        size=100 
    )

    print(f"Total hits: {search_results['hits']['total']['value']}")
    # for doc in search_results['hits']['hits']:
        # print(f"ID: {doc['_id']}, Score: {doc['_score']}")
        # print(doc['_source'])
except Exception as e:
    print(f"An error occurred while searching the documents: {e}")

Total hits: 30000


In [7]:

try:
    # URL must be properly encoded to match the ID in the index.
    url_id = "https%3A%2F%2Fja.wikipedia.org%2Fwiki%2F%25E3%2583%258E%25E3%2583%25BC%25E3%2583%2599%25E3%2583%25AB%25E6%2596%2587%25E5%25AD%25A6%25E8%25B3%259E"

    search_results = es.search(
        index=INDEX_NAME,
        body={
            "query": {
                "terms": {
                    "_id": [url_id]
                }
            },
            "track_total_hits": True
        },
        size=100 
    )

    print(f"Total hits: {search_results['hits']['total']['value']}")
    # Uncomment the following lines to print each document's ID and score.
    # for doc in search_results['hits']['hits']:
        # print(f"ID: {doc['_id']}, Score: {doc['_score']}")
        # print(doc['_source'])

except Exception as e:
    print(f"An error occurred while searching the documents: {e}")

Total hits: 0


In [6]:
search_results = es.search(
    index=INDEX_NAME,
    body={
        "query": {
            "bool": {
                "must": [
                    { "term": { "author": "Anson" } },
                    { "term": { "author": "Rohith" } },
                    #{ "term": { "author": "Jianqing" } }
                ]
            }
        },
        "track_total_hits": True  
    },
    size=10  
)
print(f"Total hits: {search_results['hits']['total']['value']}")
for doc in search_results['hits']['hits']:
    print(f"ID: {doc['_id']}")
    print(f"Content: {doc['_source']['title']}")

Total hits: 697
ID: https://en.wikipedia.org/wiki/Kurdish_language
Content: Kurdish language - Wikipedia
ID: https://en.wikipedia.org/wiki/Latin_language
Content: Latin - Wikipedia
ID: https://www.nlm.nih.gov/web_policies.html
Content: NLM Web Policies
ID: https://en.wikipedia.org/wiki/Arabic_language
Content: Arabic - Wikipedia
ID: https://en.wikipedia.org/wiki/History
Content: History - Wikipedia
ID: https://en.wikipedia.org/wiki/Integrated_Authority_File
Content: Integrated Authority File - Wikipedia
ID: https://en.wikipedia.org/wiki/Biblioth%C3%A8que_nationale_de_France
Content: Bibliothèque nationale de France - Wikipedia
ID: https://wikimediafoundation.org/news/category/topics/legal/
Content: Legal – Wikimedia Foundation
ID: https://lists.wikimedia.org/postorius/lists/commons-l.lists.wikimedia.org/
Content: 
Info | commons-l@lists.wikimedia.org - lists.wikimedia.org

ID: https://wikimediafoundation.org/news/category/audiences/community/
Content: Community – Wikimedia Foundation


In [None]:
# https://en.wikipedia.org/wiki/France

In [None]:
es = Elasticsearch(cloud_id= "0feeb24636464a578a9c7a1ce9739181:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvOjQ0MyQyMzcyNjZmYzcwMzg0ZTA2OTM1MTJkZGIxMDgzYTRmMyQ1N2RhZjIzZTNiMWM0MjAwYjBhMDQ0MGY1ZTEyZTc2Yw==",
                http_auth=("elastic", "pETnMazDlmfyCT2rZ2NAWh2V"))
response = es.search(
    index=INDEX,
    body={
    "query": {
        "multi_match": {
        "query": text_query,
        "fields": ["title", "content"]
        }
    }
    }, size=25
)
        

In [7]:

try:
    url_id = "https://en.wikipedia.org/wiki/France"

    response = es.search(
        index=INDEX_NAME,
        body={
            "query": {
                "terms": {
                    "_id": [url_id]
                }
            },
            "track_total_hits": True
        },
        size=100 
    )

    # print(f"{response['hits']['hits']}")
    print(f"{response['hits']}")
    # Uncomment the following lines to print each document's ID and score.
    # for doc in search_results['hits']['hits']:
        # print(f"ID: {doc['_id']}, Score: {doc['_score']}")
        # print(doc['_source'])

except Exception as e:
    print(f"An error occurred while searching the documents: {e}")

An error occurred while searching the documents: TransportError(502, '{"ok":false,"message":"The instance rejected the connection."}\n')
