In [1]:
from elasticsearch import Elasticsearch
import os
from dotenv import load_dotenv
import json 
import warnings

# Suppress every Python warning for the rest of the kernel session.
warnings.simplefilter("ignore")

# Load the variables from the .env file; kept as the last expression so the
# cell displays the call's return value.
load_dotenv()

True

### Load ENV variables

In [2]:
# Credentials come from the process environment; each value is None when
# the corresponding variable is not set.
USERNAME = os.environ.get('USERNAME')
PASSWORD = os.environ.get('PASSWORD')

### Initialize Elasticsearch

In [3]:
# Client for the node at https://localhost:9200, authenticated with the
# USERNAME/PASSWORD pair. Certificate verification is switched off here,
# so the server's TLS certificate is not validated.
es = Elasticsearch(
    hosts=[dict(host='localhost', port=9200, scheme='https')],
    basic_auth=(USERNAME, PASSWORD),
    verify_certs=False,
)

# Check status
print('running: ', es.ping())

running:  True


### Create Index

Create the index with an Elasticsearch REST request:
```
PUT /cs_courses
```

Check that the index was created by listing all indices:
```
GET _cat/indices
```


#### Example

In [14]:
from datetime import datetime

# Index used only by this walkthrough.
EXAMPLE_INDEX = "my-index"

# Sample document; the timestamp is taken at the moment the cell runs.
doc = dict(
    author='kimchy',
    text='Elasticsearch: cool. bonsai cool.',
    timestamp=datetime.now(),
)

# Store the document under a fixed id and report the result of the call.
resp = es.index(index=EXAMPLE_INDEX, id=1, document=doc)
print(resp['result'])

# Read the same document back by id.
resp = es.get(index=EXAMPLE_INDEX, id=1)
print(resp['_source'])

# Refresh the index before searching it.
es.indices.refresh(index=EXAMPLE_INDEX)

# Match every document in the index and print each hit's source fields.
resp = es.search(index=EXAMPLE_INDEX, query={"match_all": {}})
found = resp['hits']
print("Got %d Hits:" % found['total']['value'])
for entry in found['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % entry["_source"])

updated
{'author': 'kimchy', 'text': 'Elasticsearch: cool. bonsai cool.', 'timestamp': '2023-11-18T12:01:33.751967'}
Got 1 Hits:
2023-11-18T12:01:33.751967 kimchy: Elasticsearch: cool. bonsai cool.


### Load CS Courses Data

In [16]:
# Read the crawled course list from disk. The `with` block closes the file
# when it exits, so no explicit close() call is needed afterwards.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('crawler/cs_courses.json', encoding='utf-8') as f:
    cs_courses = json.load(f)

# Load one course
cs_courses[0]

{'code': 'CS100',
 'title': 'Introduction to the Profession',
 'link': 'http://bulletin.iit.edu/search/?P=CS%20100',
 'description': 'An introduction to science and engineering as a profession. Examines the problem-solving process used in engineering and science. Emphasizes the interdisciplinary and international nature of problem-solving and the need to evaluate solutions in terms of a variety of constraints: computational, financial, and social.',
 'credits': ' 2',
 'prerequisites:': []}

### Index CS Courses Data

In [17]:
# Index every course into "cs_courses", using its position in the list as
# the document id. The loop variable is called `course_id` so that the
# built-in `id()` is not shadowed for the rest of the kernel session.
for course_id, course in enumerate(cs_courses):
    resp = es.index(index="cs_courses", id=course_id, document=course)
    print(resp['result'], ' course: ', course['code'])

# Kept as the last expression so the refresh response is displayed.
es.indices.refresh(index="cs_courses")

updated  course:  CS100
updated  course:  CS104
updated  course:  CS105
updated  course:  CS110
updated  course:  CS115
updated  course:  CS116
updated  course:  CS201
updated  course:  CS330
updated  course:  CS331
updated  course:  CS340
updated  course:  CS350
updated  course:  CS351
updated  course:  CS401
updated  course:  CS402
updated  course:  CS403
updated  course:  CS406
updated  course:  CS411
updated  course:  CS422
updated  course:  CS425
updated  course:  CS429
updated  course:  CS430
updated  course:  CS440
updated  course:  CS442
updated  course:  CS443
updated  course:  CS445
updated  course:  CS447
updated  course:  CS450
updated  course:  CS451
updated  course:  CS455
updated  course:  CS456
updated  course:  CS458
updated  course:  CS470
updated  course:  CS480
updated  course:  CS481
updated  course:  CS482
updated  course:  CS484
updated  course:  CS485
updated  course:  CS487
updated  course:  CS492
updated  course:  CS511
updated  course:  CS512
updated  course:

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

### Example Query

Match all documents (top 5)

In [18]:
# Query that matches every document in the index.
resp = es.search(index="cs_courses", query={"match_all": {}})
all_hits = resp['hits']
print("Got %d Hits:" % all_hits['total']['value'])

# Print at most the first five hits of the response.
for course_hit in all_hits['hits'][:5]:
    print("%(code)s %(title)s: %(description)s" % course_hit["_source"])

Got 103 Hits:
CS100 Introduction to the Profession: An introduction to science and engineering as a profession. Examines the problem-solving process used in engineering and science. Emphasizes the interdisciplinary and international nature of problem-solving and the need to evaluate solutions in terms of a variety of constraints: computational, financial, and social.
CS104 Introduction to Computer Programming for Engineers: Introduces the use of high-level programming language as a problem-solving tool in engineering including basic data structures and algorithms, structured programming techniques, and software documentation.  Designed for students who have had little or no prior experience with computer programming.  Students should only take one of these courses (
CS105 Introduction to Computer Programming: Introduces the use of high-level programming language as a problem-solving tool, including basic data structures and algorithms, structured programming techniques, and software do

Match documents on the `description` field (top 5)

In [23]:
# Text to look for in each course's `description` field.
description = 'Operating Systems'

# `match` query body targeting the `description` field.
query_body = dict(match=dict(description=description))

In [24]:
# Run the `match` query built in the previous cell.
resp = es.search(index="cs_courses", query=query_body)
matched = resp['hits']
print("Got %d Hits:" % matched['total']['value'])

# Print at most the first five hits of the response.
for course_hit in matched['hits'][:5]:
    print("%(code)s %(title)s: %(description)s" % course_hit["_source"])

Got 32 Hits:
CS550 Advanced Operating Systems: Advanced operating system design concepts such as interprocess communication, distributed processing, replication and consistency, fault tolerance, synchronization, file systems.  Study of systems highlighting these concepts.
CS555 Analytic Models and Simulation of Computer Systems: Analytic and simulation techniques for the performance analysis of computer architecture, operating systems and communication networks. Rigorous development of queuing models. Study of simulation languages and models.
CS458 Introduction to Information Security: An introduction to the fundamentals of computer and information security. This course focuses on algorithms and techniques used to defend against malicious software. Topics include an introduction to encryption systems, operating system security, database security, network security, system threats, and risk avoidance procedures.
CS450 Operating Systems: Introduction to operating system concepts-including