# Web Scraping part

In [99]:
import json
import re 
import urllib.request 
from bs4 import BeautifulSoup 

def get_all_web_data(url):
    try:
        html = urllib.request.urlopen(url) 
    except:
        return None
    soup = BeautifulSoup(html) 
    try:
    #find where the metadata is
        content = soup.findAll('script',type='application/ld+json')
        formatted_paragraph = content[0].string.replace('\n','').replace('&apos;','\'').replace('&nbsp; ','')
        doc_data = json.loads(formatted_paragraph)
    
    #store article metadata
        start_word = doc_data['articleBody'][:doc_data['articleBody'].find(' ')]
        author = doc_data['author']['name']
        headline = doc_data['headline']
        date = doc_data['datePublished']
        original_url = doc_data['url'] #re.search("https://www.livescience.com/\S*\.html",doc_data['url']).group()
    except:
        return None
    
    #find and store the full article
    full_content = soup.findAll('p')
    full_paragraph = list()
    for element in full_content[:-6]:
        full_paragraph.append(str(element))
    full_long_paragraph = ''.join([element for element in full_paragraph])
    cleantext = BeautifulSoup(full_long_paragraph, 'lxml').text
    completed_article = cleantext[cleantext.find(start_word):].replace('\xa0','')
    
    return {'headline':headline, 'author':author, 'date':date[:10], 'link':original_url, 'content':completed_article}

In [215]:
#for test
test = get_all_web_data('https://www.livescience.com/57965-fossils-of-extinct-giant-rodents-found.html')
test

{'headline': "Extinct Giant Rodents' Family Tree Rewritten by New Fossil Finds",
 'author': 'Mindy Weisberger',
 'date': '2017-02-22',
 'link': 'https://www.livescience.com/57965-fossils-of-extinct-giant-rodents-found.html',
 'content': 'Scientists have found a near-complete skull and a jaw from a pair of giant rodents belonging to a group that lived millions of years ago in South America, and they say the fossils show that the extinct creatures weighed as much as 1 ton when fully grown.These are the best-preserved fossils to date of this extinct group, which was previously known only by skull fragments and individual teeth, the scientists reported in a new study.The new fossils of the two rodents — an adult and a juvenile — paint a more complete picture of the extinct and massive rat-like animals, the researchers said. For instance, the finds raise questions about how these giant rodents were classified within their genus, and hint that several species that were thought to be related 

In [79]:
#Get all links from the main browse page of the website
def get_all_links_current_version(main_url):
    try:
        html = urllib.request.urlopen(main_url) 
    except:
        return None
    soup = BeautifulSoup(html)
    
    #Get Links from the current version of the website
    find_article_links = soup.findAll('a',class_='article-link')
    all_links = re.findall("https://www.livescience.com/\S*\.html",str(find_article_links))
    return all_links

In [80]:
def get_all_links_older_version(main_url):
    try:
        html = urllib.request.urlopen(main_url) 
    except:
        return None
    soup = BeautifulSoup(html)
    
    #Get links from the older version of the website (same method)
    #Start working since May 5, 2016 to august 7, 2019
    find_article_links = soup.findAll('a',class_='read-url')
    all_links = re.findall("https://www.livescience.com/\S*\.html",str(find_article_links))
    return all_links

In [111]:
#for test
test = get_all_links_current_version('https://web.archive.org/web/20200320223035/https://www.livescience.com/animals/')
test

['https://www.livescience.com/ancient-fish-fingers.html',
 'https://www.livescience.com/macaque-fight-thailand-temple-coronavirus.html',
 'https://www.livescience.com/earth-shorter-days-millions-years-ago.html',
 'https://www.livescience.com/white-giraffes-slaughtered-by-poachers.html',
 'https://www.livescience.com/smallest-dinosaur-of-mesozoic.html',
 'https://www.livescience.com/swamp-wallaby-always-pregnant.html',
 'https://www.livescience.com/oldest-cave-dwelling-animal-cockroaches.html',
 'https://www.livescience.com/deep-sea-sponges-sneeze-underwater.html',
 'https://www.livescience.com/llm-podcast-8-dinosaurs.html',
 'https://www.livescience.com/why-cats-have-white-socks-on-paws.html',
 'https://www.livescience.com/polar-bears-photos.html',
 'https://www.livescience.com/coconut-crab-clicking.html',
 'https://www.livescience.com/parasitic-worms-in-lizard-embryos.html',
 'https://www.livescience.com/first-non-breathing-animal.html',
 'https://www.livescience.com/ice-age-bird-perm

In [85]:
test2 = get_all_links_older_version('https://web.archive.org/web/20180102215036/https://www.livescience.com/animals/')
test2

['https://www.livescience.com/61308-do-animals-get-jealous.html',
 'https://www.livescience.com/61299-sea-stars-making-comeback.html',
 'https://www.livescience.com/61287-new-book-asks-does-it-fart.html',
 'https://www.livescience.com/61292-does-it-fart-10-fascinating-facts-about-animal-toots.html',
 'https://www.livescience.com/61272-new-worm-species-no-anus-discovery.html',
 'https://www.livescience.com/61269-cambrian-sea-monster.html',
 'https://www.livescience.com/32115-bison-vs-buffalo-whats-the-difference.html',
 'https://www.livescience.com/61248-dog-chocolate-poisonings-spike-at-christmas.html',
 'https://www.livescience.com/61241-how-often-do-dogs-maul-owners.html',
 'https://www.livescience.com/61238-spiders-build-sandcastles-underground.html']

# Search Engine (Main part)

In [1]:
from elasticsearch import Elasticsearch

In [118]:
#Use when everything is finished
all_article_links = list()
main_links = {'https://www.livescience.com/animals/',
              'https://web.archive.org/web/20200320223035/https://www.livescience.com/animals/', #Current #March 20, 2020
              'https://web.archive.org/web/20200216223856/https://www.livescience.com/animals/', #Current #Feb 16, 2020
              'https://web.archive.org/web/20200127213043/https://www.livescience.com/animals/', #Current #Jan 27, 2020
              'https://web.archive.org/web/20200127213045/https://www.livescience.com/animals/2', #Current #Jan 27, 2020
              'https://web.archive.org/web/20200127214447/https://www.livescience.com/animals/3', #Current #Jan 27, 2020
              'https://web.archive.org/web/20200104041205/https://www.livescience.com/animals/4', #Current #Jan 04, 2020
              'https://web.archive.org/web/20190731024418/https://www.livescience.com/animals/', #Older #July 31, 2019
              'https://web.archive.org/web/20190723220159if_/https://www.livescience.com/animals/2', #Older #July 31, 2019
              'https://web.archive.org/web/20190709215801/https://www.livescience.com/animals/', #Older #Jul 09, 2019
              'https://web.archive.org/web/20190709203939/https://www.livescience.com/animals/2', #Older #Jul 09, 2019
              'https://web.archive.org/web/20190617213319/https://www.livescience.com/animals/', #Older #June 17, 2019
              'https://web.archive.org/web/20190617213320if_/https://www.livescience.com/animals/2', #Older #June 17, 2019
              'https://web.archive.org/web/20190525180136/https://www.livescience.com/animals/', #Older #May 25, 2019
              'https://web.archive.org/web/20190513020621/https://www.livescience.com/animals/', #Older #May 13, 2019
              'https://web.archive.org/web/20190513020623/https://www.livescience.com/animals/2', #Older #May 13, 2019
              'https://web.archive.org/web/20190418160125/https://www.livescience.com/animals/', #Older #Apr 18, 2019
              'https://web.archive.org/web/20190402194936/https://www.livescience.com/animals/2', #Older #Apr 02, 2019
              'https://web.archive.org/web/20190319212821/https://www.livescience.com/animals/', #Older #March 19, 2019
              'https://web.archive.org/web/20190312075436/https://www.livescience.com/animals/', #Older #March 12, 2019
              'https://web.archive.org/web/20190301174738/https://www.livescience.com/animals/', #Older #March 01, 2019
              'https://web.archive.org/web/20190221182843/https://www.livescience.com/animals/', #Older #Feb 21, 2019
              'https://web.archive.org/web/20190212030534/https://www.livescience.com/animals/', #Older #Feb 12, 2019
              'https://web.archive.org/web/20190126204430/https://www.livescience.com/animals/', #Older #Jan 26, 2019
              'https://web.archive.org/web/20190114124031/https://www.livescience.com/animals/', #Older #Jan 14, 2019
              'https://web.archive.org/web/20190112194350/https://www.livescience.com/animals/2', #Older #Jan 12, 2019
              'https://web.archive.org/web/20181215173831/https://www.livescience.com/animals/', #Older #Dec 15, 2018
              'https://web.archive.org/web/20181210000847/https://www.livescience.com/animals/2', #Older #Dec 10, 2018
              'https://web.archive.org/web/20181109002340/https://www.livescience.com/animals', #Older #Nov 9, 2018
              'https://web.archive.org/web/20181109002342if_/https://www.livescience.com/animals/2', #Older #Nov 9, 2018
              'https://web.archive.org/web/20181002213341/https://www.livescience.com/animals/', #Older #Oct 02, 2018
              'https://web.archive.org/web/20180909231408/https://www.livescience.com/animals', #Older #Sep 09, 2018
              'https://web.archive.org/web/20180823022930/https://www.livescience.com/animals/', #Older #Aug 23, 2018
              'https://web.archive.org/web/20180823022932/https://www.livescience.com/animals/2', #Older #Aug 23, 2018
              'https://web.archive.org/web/20180726004832/https://www.livescience.com/animals/', #Older #Jul 26, 2018
              'https://web.archive.org/web/20180702202725/https://www.livescience.com/animals', #Older #Jul 02, 2018
              'https://web.archive.org/web/20180702202727if_/https://www.livescience.com/animals/2', #Older #Jul 02, 2018
              'https://web.archive.org/web/20180614044854/https://www.livescience.com/animals/', #Older #June 14, 2018
              'https://web.archive.org/web/20180607183812/https://www.livescience.com/animals/', #Older #June 07, 2018
              'https://web.archive.org/web/20180528133652/https://www.livescience.com/animals/', #Older #May 28, 2018
              'https://web.archive.org/web/20180429130220/https://www.livescience.com/animals/', #Older #Apr 29, 2018
              'https://web.archive.org/web/20180402211016/https://www.livescience.com/animals', #Older #Apr 02, 2018
              'https://web.archive.org/web/20180319215338/https://www.livescience.com/animals/', #Older #March 19, 2018
              'https://web.archive.org/web/20180309211545/https://www.livescience.com/animals/', #Older #March 09, 2018
              'https://web.archive.org/web/20180221233300/https://www.livescience.com/animals', #Older #Feb 21, 2018
              'https://web.archive.org/web/20180207193444/https://www.livescience.com/animals/', #Older #Feb 07, 2018
              'https://web.archive.org/web/20180109225659/https://www.livescience.com/animals/', #Older #Jan 09, 2018
              'https://web.archive.org/web/20180102215036/https://www.livescience.com/animals/', #Older #Jan 02, 2018
              'https://web.archive.org/web/20171219204609/https://www.livescience.com/animals/', #Older #Dec 19, 2017
              'https://web.archive.org/web/20171205195858/https://www.livescience.com/animals/', #Older #Dec 05, 2017
              'https://web.archive.org/web/20171124030644/https://www.livescience.com/animals/', #Older #Nov 24, 2017
              'https://web.archive.org/web/20171112043830/https://www.livescience.com/animals/', #Older #Nov 12, 2017
              'https://web.archive.org/web/20171108201231/https://www.livescience.com/animals', #Older #Nov 08, 2017
              'https://web.archive.org/web/20171024234930/https://www.livescience.com/animals/', #Older #Oct 24, 2017
              'https://web.archive.org/web/20171017231940/https://www.livescience.com/animals/', #Older #Oct 17, 2017
              'https://web.archive.org/web/20170830200008/https://www.livescience.com/animals', #Older #Aug 30, 2017
              'https://web.archive.org/web/20170821193401/https://www.livescience.com/animals/', #Older #Aug 21, 2017 ##800 Links
              'https://web.archive.org/web/20170722060301/https://www.livescience.com/animals/', #Older #Jul 22, 2017
              'https://web.archive.org/web/20170715053204/https://www.livescience.com/animals/', #Older #Jul 15, 2017
              'https://web.archive.org/web/20170708050112/https://www.livescience.com/animals/', #Older #Jul 08, 2017
              'https://web.archive.org/web/20170701044926/https://www.livescience.com/animals/', #Older #Jul 01, 2017
              'https://web.archive.org/web/20170624043338/https://www.livescience.com/animals/', #Older #June 24, 2017
              'https://web.archive.org/web/20170617040652/https://www.livescience.com/animals', #Older #June 17, 2017
              'https://web.archive.org/web/20170610032615/https://www.livescience.com/animals', #Older #June 10, 2017
              'https://web.archive.org/web/20170603030053/https://www.livescience.com/animals', #Older #June 03, 2017
              'https://web.archive.org/web/20170520021443/https://www.livescience.com/animals', #Older #May 20, 2017
              'https://web.archive.org/web/20170513015100/https://www.livescience.com/animals', #Older #May 13, 2017 ##900 Links
              'https://web.archive.org/web/20170508234355/https://www.livescience.com/animals', #Older #May 08, 2017
              'https://web.archive.org/web/20170428233828/https://www.livescience.com/animals', #Older #Apr 28, 2017
              'https://web.archive.org/web/20170421054020/https://www.livescience.com/animals', #Older #Apr 21, 2017 to Apr 18
              'https://web.archive.org/web/20170414050620/https://www.livescience.com/animals', #Older #Apr 14, 2017
              'https://web.archive.org/web/20170407033827/https://www.livescience.com/animals', #Older #Apr 07, 2017
              'https://web.archive.org/web/20170402212849/https://www.livescience.com/animals', #Older #Apr 02, 2017
              'https://web.archive.org/web/20170328143256/https://www.livescience.com/animals', #Older #Mar 28, 2017
              'https://web.archive.org/web/20170320155539/https://www.livescience.com/animals', #Older #Mar 20, 2017
              'https://web.archive.org/web/20170312043329/https://www.livescience.com/animals', #Older #Mar 12, 2017
              'https://web.archive.org/web/20170303011443/https://www.livescience.com/animals', #Older #Mar 03, 2017 to Feb 28 ##1,000 links
             }

In [119]:
main_links_amount = len(main_links)
for i,main_link in enumerate(main_links):
    if i <= 6:
        if main_link.startswith('https://www.livescience.com'):
            for j in range(1,10):
                links_in_web = get_all_links_current_version(main_link+str(j))
                if links_in_web != None:
                    all_article_links.extend(links_in_web)
            print("Done", i, "out of", main_links_amount)
        else:
            links_in_web = get_all_links_current_version(main_link)
            if links_in_web != None:
                all_article_links.extend(links_in_web)
                print("Done", i, "out of", main_links_amount)
    else:
        links_in_web = get_all_links_older_version(main_link)
        if links_in_web != None:
            all_article_links.extend(links_in_web)
            print("Done", i, "out of", main_links_amount)
all_article_links = set(all_article_links)
len(all_article_links)

Done 0 out of 77
Done 1 out of 77
Done 2 out of 77
Done 3 out of 77
Done 4 out of 77
Done 5 out of 77
Done 6 out of 77
Done 7 out of 77
Done 8 out of 77
Done 9 out of 77
Done 10 out of 77
Done 11 out of 77
Done 12 out of 77
Done 13 out of 77
Done 14 out of 77
Done 15 out of 77
Done 16 out of 77
Done 17 out of 77
Done 18 out of 77
Done 19 out of 77
Done 20 out of 77
Done 21 out of 77
Done 22 out of 77
Done 23 out of 77
Done 24 out of 77
Done 25 out of 77
Done 26 out of 77
Done 27 out of 77
Done 28 out of 77
Done 29 out of 77
Done 30 out of 77
Done 31 out of 77
Done 32 out of 77
Done 33 out of 77
Done 34 out of 77
Done 35 out of 77
Done 36 out of 77
Done 37 out of 77
Done 38 out of 77
Done 39 out of 77
Done 40 out of 77
Done 41 out of 77
Done 42 out of 77
Done 43 out of 77
Done 44 out of 77
Done 45 out of 77
Done 46 out of 77
Done 47 out of 77
Done 48 out of 77
Done 49 out of 77
Done 50 out of 77
Done 51 out of 77
Done 52 out of 77
Done 53 out of 77
Done 54 out of 77
Done 55 out of 77
Do

685

In [120]:
#Add 9 pages of current website to the list
current_version_links = {
    'https://www.livescience.com/animals/',
    'https://web.archive.org/web/20200320223035/https://www.livescience.com/animals/', #Current #March 20, 2020
    'https://web.archive.org/web/20200216223856/https://www.livescience.com/animals/', #Current #Feb 16, 2020
    'https://web.archive.org/web/20200127213043/https://www.livescience.com/animals/', #Current #Jan 27, 2020
    'https://web.archive.org/web/20200127213045/https://www.livescience.com/animals/2', #Current #Jan 27, 2020
    'https://web.archive.org/web/20200127214447/https://www.livescience.com/animals/3', #Current #Jan 27, 2020
    'https://web.archive.org/web/20200104041205/https://www.livescience.com/animals/4'
}
current_version_link_amount = len(current_version_links)
all_article_links = list(all_article_links)
for i,current_version_link in enumerate(current_version_links):
    if i <= 6:
        if current_version_link.startswith('https://www.livescience.com'):
            for i in range(1,10):
                links_in_web = get_all_links_current_version(current_version_link+str(i))
                if links_in_web != None:
                    all_article_links.extend(links_in_web)
                    print("Done", i, "out of", current_version_link_amount)
        else:
            links_in_web = get_all_links_current_version(current_version_link)
            if links_in_web != None:
                all_article_links.extend(links_in_web)
                print("Done", i, "out of", current_version_link_amount)
all_article_links = set(all_article_links)
len(all_article_links)

Done 0 out of 7
Done 1 out of 7
Done 2 out of 7
Done 3 out of 7
Done 4 out of 7
Done 5 out of 7
Done 6 out of 7
Done 7 out of 7
Done 8 out of 7
Done 9 out of 7
Done 2 out of 7
Done 3 out of 7
Done 4 out of 7
Done 5 out of 7
Done 6 out of 7


944

In [121]:
article_data_list = list()
all_article_links_amount = len(all_article_links)
for i,link in enumerate(all_article_links):
    return_data = get_all_web_data(link)
    if return_data != None:
        article_data_list.append(return_data)
        print("Done", i, "out of", all_article_links_amount)

Done 0 out of 944
Done 1 out of 944
Done 2 out of 944
Done 3 out of 944
Done 4 out of 944
Done 5 out of 944
Done 6 out of 944
Done 7 out of 944
Done 8 out of 944
Done 9 out of 944
Done 10 out of 944
Done 11 out of 944
Done 12 out of 944
Done 13 out of 944
Done 14 out of 944
Done 15 out of 944
Done 16 out of 944
Done 17 out of 944
Done 18 out of 944
Done 20 out of 944
Done 21 out of 944
Done 22 out of 944
Done 23 out of 944
Done 24 out of 944
Done 25 out of 944
Done 26 out of 944
Done 27 out of 944
Done 28 out of 944
Done 29 out of 944
Done 30 out of 944
Done 31 out of 944
Done 32 out of 944
Done 33 out of 944
Done 34 out of 944
Done 35 out of 944
Done 37 out of 944
Done 38 out of 944
Done 39 out of 944
Done 40 out of 944
Done 41 out of 944
Done 42 out of 944
Done 43 out of 944
Done 44 out of 944
Done 45 out of 944
Done 46 out of 944
Done 48 out of 944
Done 49 out of 944
Done 51 out of 944
Done 52 out of 944
Done 54 out of 944
Done 55 out of 944
Done 56 out of 944
Done 57 out of 944
Don

Done 433 out of 944
Done 434 out of 944
Done 435 out of 944
Done 436 out of 944
Done 437 out of 944
Done 438 out of 944
Done 439 out of 944
Done 440 out of 944
Done 441 out of 944
Done 442 out of 944
Done 443 out of 944
Done 444 out of 944
Done 445 out of 944
Done 446 out of 944
Done 447 out of 944
Done 448 out of 944
Done 449 out of 944
Done 450 out of 944
Done 451 out of 944
Done 452 out of 944
Done 453 out of 944
Done 454 out of 944
Done 455 out of 944
Done 456 out of 944
Done 457 out of 944
Done 458 out of 944
Done 459 out of 944
Done 460 out of 944
Done 461 out of 944
Done 462 out of 944
Done 463 out of 944
Done 464 out of 944
Done 465 out of 944
Done 466 out of 944
Done 467 out of 944
Done 468 out of 944
Done 469 out of 944
Done 470 out of 944
Done 471 out of 944
Done 472 out of 944
Done 474 out of 944
Done 475 out of 944
Done 476 out of 944
Done 477 out of 944
Done 478 out of 944
Done 479 out of 944
Done 480 out of 944
Done 481 out of 944
Done 482 out of 944
Done 483 out of 944


Done 862 out of 944
Done 863 out of 944
Done 864 out of 944
Done 865 out of 944
Done 866 out of 944
Done 867 out of 944
Done 868 out of 944
Done 870 out of 944
Done 871 out of 944
Done 872 out of 944
Done 873 out of 944
Done 874 out of 944
Done 875 out of 944
Done 876 out of 944
Done 877 out of 944
Done 878 out of 944
Done 879 out of 944
Done 880 out of 944
Done 881 out of 944
Done 882 out of 944
Done 883 out of 944
Done 884 out of 944
Done 885 out of 944
Done 886 out of 944
Done 887 out of 944
Done 888 out of 944
Done 889 out of 944
Done 890 out of 944
Done 891 out of 944
Done 892 out of 944
Done 893 out of 944
Done 894 out of 944
Done 895 out of 944
Done 896 out of 944
Done 897 out of 944
Done 898 out of 944
Done 899 out of 944
Done 900 out of 944
Done 901 out of 944
Done 902 out of 944
Done 903 out of 944
Done 904 out of 944
Done 905 out of 944
Done 906 out of 944
Done 907 out of 944
Done 908 out of 944
Done 909 out of 944
Done 910 out of 944
Done 911 out of 944
Done 912 out of 944


In [122]:
len(article_data_list)

904

In [2]:
es = Elasticsearch()

In [124]:
all_usable_article_data_amount = len(article_data_list)
for i,article_data in enumerate(article_data_list):
    es.index(index='article',id=i,body=article_data)
    print("Done", i, "out of", all_usable_article_data_amount)

Done 0 out of 904
Done 1 out of 904
Done 2 out of 904
Done 3 out of 904
Done 4 out of 904
Done 5 out of 904
Done 6 out of 904
Done 7 out of 904
Done 8 out of 904
Done 9 out of 904
Done 10 out of 904
Done 11 out of 904
Done 12 out of 904
Done 13 out of 904
Done 14 out of 904
Done 15 out of 904
Done 16 out of 904
Done 17 out of 904
Done 18 out of 904
Done 19 out of 904
Done 20 out of 904
Done 21 out of 904
Done 22 out of 904
Done 23 out of 904
Done 24 out of 904
Done 25 out of 904
Done 26 out of 904
Done 27 out of 904
Done 28 out of 904
Done 29 out of 904
Done 30 out of 904
Done 31 out of 904
Done 32 out of 904
Done 33 out of 904
Done 34 out of 904
Done 35 out of 904
Done 36 out of 904
Done 37 out of 904
Done 38 out of 904
Done 39 out of 904
Done 40 out of 904
Done 41 out of 904
Done 42 out of 904
Done 43 out of 904
Done 44 out of 904
Done 45 out of 904
Done 46 out of 904
Done 47 out of 904
Done 48 out of 904
Done 49 out of 904
Done 50 out of 904
Done 51 out of 904
Done 52 out of 904
Don

Done 424 out of 904
Done 425 out of 904
Done 426 out of 904
Done 427 out of 904
Done 428 out of 904
Done 429 out of 904
Done 430 out of 904
Done 431 out of 904
Done 432 out of 904
Done 433 out of 904
Done 434 out of 904
Done 435 out of 904
Done 436 out of 904
Done 437 out of 904
Done 438 out of 904
Done 439 out of 904
Done 440 out of 904
Done 441 out of 904
Done 442 out of 904
Done 443 out of 904
Done 444 out of 904
Done 445 out of 904
Done 446 out of 904
Done 447 out of 904
Done 448 out of 904
Done 449 out of 904
Done 450 out of 904
Done 451 out of 904
Done 452 out of 904
Done 453 out of 904
Done 454 out of 904
Done 455 out of 904
Done 456 out of 904
Done 457 out of 904
Done 458 out of 904
Done 459 out of 904
Done 460 out of 904
Done 461 out of 904
Done 462 out of 904
Done 463 out of 904
Done 464 out of 904
Done 465 out of 904
Done 466 out of 904
Done 467 out of 904
Done 468 out of 904
Done 469 out of 904
Done 470 out of 904
Done 471 out of 904
Done 472 out of 904
Done 473 out of 904


Done 862 out of 904
Done 863 out of 904
Done 864 out of 904
Done 865 out of 904
Done 866 out of 904
Done 867 out of 904
Done 868 out of 904
Done 869 out of 904
Done 870 out of 904
Done 871 out of 904
Done 872 out of 904
Done 873 out of 904
Done 874 out of 904
Done 875 out of 904
Done 876 out of 904
Done 877 out of 904
Done 878 out of 904
Done 879 out of 904
Done 880 out of 904
Done 881 out of 904
Done 882 out of 904
Done 883 out of 904
Done 884 out of 904
Done 885 out of 904
Done 886 out of 904
Done 887 out of 904
Done 888 out of 904
Done 889 out of 904
Done 890 out of 904
Done 891 out of 904
Done 892 out of 904
Done 893 out of 904
Done 894 out of 904
Done 895 out of 904
Done 896 out of 904
Done 897 out of 904
Done 898 out of 904
Done 899 out of 904
Done 900 out of 904
Done 901 out of 904
Done 902 out of 904
Done 903 out of 904


In [5]:
search_query = input('Search: ')
size = input('Number of docs: ')
not_include = input('Words not to include (optional): ')
body = {
    "from":0,
    "size":int(size),
    "query": {
        "bool":{
            "should":[
                { "match": {"content":{"query": search_query}} },
                { "match": {"content":{ "query": search_query, "operator": "and" }} },
                { "match_phrase": {"content":{"query": search_query, "boost": 2}} }
            ],
            "must_not":[
                { "match": {"content":{"query": not_include}}}
            ]
        }
    }
}

res = es.search(index="article", body=body)
res
#res['hits'].keys()
#print(f"Number to show {size} \nMatched Query: {res['hits']['hits'][0]['_source'].get('content')}")

Search: cat
Number of docs: 10
Words not to include (optional): 


{'took': 10,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 61, 'relation': 'eq'},
  'max_score': 21.536486,
  'hits': [{'_index': 'article',
    '_type': '_doc',
    '_id': '114',
    '_score': 21.536486,
    '_source': {'headline': 'Meet the Cat-Fox, an Oddball Feline Roaming Around a French Island',
     'author': 'Yasemin Saplakoglu',
     'date': '2019-06-19',
     'link': 'https://www.livescience.com/65744-cat-fox-new-species.html',
     'content': 'A bizarre-looking cat that roams the remote forests of the French island of Corsica may be a new species, according to local news reports.These felines are known to locals as "cat-foxes," and wildlife rangers in Corsica think that they might be a new, undocumented species, according to the Agence France-Presse(AFP).These cat-foxes earned their name because of their size (slightly larger than an average domestic cat) and their tail decorations — most of them have 

In [12]:
#print matched results
for i,result in enumerate(res['hits']['hits']):  
    print("Ranking:",i+1)
    print("Score:", result['_score'])
    print(f"Headline: {result['_source']['headline']}")
    print(f"Author: {result['_source']['author']} || Date: {result['_source']['date']}")
    print("\n\t",result['_source']['content'])
    print("\nOriginal article:",result['_source']['link'])
    print("------------------------------------------------------------------------------------------------------------------")

Ranking: 1
Score: 0.6904231
Headline: Why do so many cats have white 'socks' on their paws?
Author: Grant Currin || Date: 2020-02-29

	 If you see a house cat, the odds are high that it will have white paws, a look that many owners affectionately call "socks." But socks are rarely seen in wildcats, the elusive and undomesticated cousin of the house cat, so why do so many pet cats sport furry white feet?As it turns out, this story started about 10,000 years ago, when humans and cats decided life was better together.This domestication eventually led to über-prevalent socks on cats, as well as other well-known coat patterns, said Leslie Lyons, professor emerita and head of the Feline Genetics Laboratory at the University of Missouri College of Veterinary Medicine.Related: Why do cats wiggle their butts before they pounce?"As humans became farmers and started staying in one place, they had grain stores and refuse piles" that attracted rodents, Lyons said. It was a mutually beneficial arran

# Experiment part
Nothing affects the search system  
No need to care

In [77]:
import ast

for_test = Elasticsearch()
rating_string = ''.join(['{ "_index": "article", "_id": "%s", "rating": %s }, ' % (element['_id'],element['_score']) for element in res['hits']['hits']])[:-2]
rating_list = ast.literal_eval(rating_string)
test_eval = {
  "requests": [
    {
      "id": "cat_eval_query",                                  
      "request": {                                              
          "query": { "match": { "content": search_query } }
      },
      "ratings": [                                              
            rating_string
#           { '_index': 'article', '_id': '1', 'rating': 0.5 }
#           { '_index': 'article', '_id': '0', 'rating': 0.6 }
      ]
    }
  ],
    "metric": {
    "precision": {
      "k": 5,
      "relevant_rating_threshold": 1,
      #"ignore_unlabeled": False
    }
  }
}

#testx = for_test.rank_eval(body=test_eval, index="article")
#testx
ast.literal_eval(rating_string)

SyntaxError: invalid syntax (<ipython-input-77-a32cf35f5021>, line 14)

In [5]:
all_body = {
    "query": {
        "match_all":{
        }
    }
}

all_res = es.search(index="article", body=all_body)
all_res

{'took': 65,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 904, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'article',
    '_type': '_doc',
    '_id': '0',
    '_score': 1.0,
    '_source': {'headline': 'Photos: These Mammal Ancestors Glided from Jurassic Trees',
     'author': 'Laura Geggel',
     'date': '2017-08-09',
     'link': 'https://www.livescience.com/60092-photos-first-gliding-mammals.html',
     'content': '.'}},
   {'_index': 'article',
    '_type': '_doc',
    '_id': '1',
    '_score': 1.0,
    '_source': {'headline': 'Strange, spiral bee combs look like fantastical crystal palaces. Now we know why.',
     'author': 'Brandon Specktor',
     'date': '2020-07-22',
     'link': 'https://www.livescience.com/tetragonula-spiral-bee-comb-grow-like-crystals.html',
     'content': 'In a world of bland hexagonal honeycombs, a small group of rebellious Australian bees has chosen to build spira

In [30]:
for element in res['hits']['hits']:
    print("'{ '_index': 'article', '_id': %s, 'rating': %s }," % (element['_id'],element['_score']))

'{ '_index': 'article', '_id': 4, 'rating': 0.6904231 },
'{ '_index': 'article', '_id': 0, 'rating': 0.68682206 },
'{ '_index': 'article', '_id': 1, 'rating': 0.6264848 },
'{ '_index': 'article', '_id': 2, 'rating': 0.6028628 },
'{ '_index': 'article', '_id': 3, 'rating': 0.47649592 },


In [4]:
es.indices.delete(index='bank')

{'acknowledged': True}