# Search Engine for Docs

## Import Corpus

In [2]:
import pandas as pd
# Absolute location of the article corpus CSV on the author's machine.
corpus_csv = 'D://NLP//Frame_NLP//jan_july_200_article.csv'
df = pd.read_csv(corpus_csv, encoding='utf-8')
# Preview the first rows; the `title` and `body` columns are what gets indexed later.
df.head()

Unnamed: 0.1,Unnamed: 0,title,body,keywords,words_count
0,0,British Prime Minister Boris Johnson Hospitali...,"On Sunday, British Prime Minister Boris Johnso...","['Johnson', 'Hospitalized', 'Boris', 'Minister...",218
1,1,NSW coronavirus death toll hits 18 as cases ri...,NSW has now recorded 18 COVID-19 deaths as the...,"['cases', 'toll', 'death', 'coronavirus', 'COV...",278
2,2,Industry in Chandigarh will need major impetus...,ChandigarhWith shops and manufacturing units c...,"['government', 'impetus', 'major', 'post', 'Ch...",570
3,3,"Coronavirus in Chandigarh: Follow advisories, ...","Chandigarh The 23-year-old man, discharged fro...","['careful', 'advisories', '23-year-old', 'Chan...",348
4,4,Crackers sound jarring note as Chandigarh tric...,CHANDIGARH The stillness which had become so m...,"['Chandigarh', 'tricity', 'lights', 'note', 'j...",377


## Import Whoosh and Index the Corpus

In [3]:
from whoosh.fields import Schema, TEXT, ID
from whoosh import index
import os, os.path
from whoosh import index
from whoosh import qparser
from whoosh.qparser import QueryParser

# Index layout: two text fields (title, content) plus an ID field (path).
# Every field is declared with stored=True so search hits can be read back by field name.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# Make sure the index directory exists. exist_ok=True replaces the separate
# exists()-then-mkdir() check, so the cell is safe to re-run and has no
# window in which the directory can appear between the check and the create.
os.makedirs("index_dir", exist_ok=True)

In [4]:
# Create the index inside "index_dir" using the schema defined above.
# NOTE(review): create_in presumably starts from an empty index every time this
# cell runs, discarding anything indexed before — confirm against the Whoosh docs.
ix = index.create_in("index_dir", schema)
# Writer used by the next cell to add the documents; it is committed there.
writer = ix.writer()

In [5]:
# Add one document per corpus row: the title and body become searchable text and
# the row position (as a string) becomes the document's `path` identifier.
# str() is applied to every value, so a missing (NaN) title or body is indexed
# as the literal text "nan".
#
# The writer comes from the previous cell and is finished after this cell runs
# (committed or cancelled); re-run that cell to get a new writer before
# re-running this one.
try:
    for row_number, (title, body) in enumerate(zip(df.title, df.body)):
        writer.add_document(title=str(title), content=str(body),
                            path=str(row_number))
except Exception:
    # Abandon the half-written segment and release the writer instead of
    # leaving it open after a failure.
    writer.cancel()
    raise
else:
    writer.commit()

## Search Doc and Save Title to File

In [6]:
def index_search(dirname, search_fields, search_query,
                 output_field='title',
                 output_path='D://NLP//Frame_NLP//archive//coronavirus_title.txt',
                 limit=10000):
    """Search a Whoosh index and save one stored field of every hit to a text file.

    Parameters
    ----------
    dirname : str
        Directory containing the index; opened with ``index.open_dir``.
    search_fields : list of str
        Schema fields the query is matched against.
    search_query : str
        Query text, parsed by a ``MultifieldParser`` whose grouping is
        ``qparser.OrGroup.factory(0.9)``.
    output_field : str, optional
        Name of the stored field written for each hit. Defaults to ``'title'``.
    output_path : str, optional
        UTF-8 text file that is overwritten with one value per line.
    limit : int, optional
        Maximum number of hits requested from the searcher.

    Returns
    -------
    list
        The values written to ``output_path``, in result order.
    """
    ix = index.open_dir(dirname)

    or_group = qparser.OrGroup.factory(0.9)
    parser = qparser.MultifieldParser(search_fields, ix.schema, group=or_group)
    query = parser.parse(search_query)

    # Read the stored values while the searcher is still open.
    with ix.searcher() as searcher:
        hits = searcher.search(query, terms=True, limit=limit)
        values = [hit[output_field] for hit in hits]

    # Open the output only after the search succeeded, so a failing query does
    # not truncate an existing file, and let `with` close it on every path.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for value in values:
            print(value, file=out_file)

    return values
    
    
# Search both the title and content fields for "coronavirus"; index_search
# writes the title of every matching document to its output file.
results_dict = index_search("index_dir", ['title', 'content'], u"coronavirus")

#results_dict = index_search("index_dir", ['title'], u"isolation")

## Search Doc and Save Content to File

In [7]:
def index_search(dirname, search_fields, search_query,
                 output_field='content',
                 output_path='D://NLP//Frame_NLP//archive//coronavirus_content.txt',
                 limit=1000):
    """Search a Whoosh index and save one stored field of every hit to a text file.

    This definition replaces any ``index_search`` defined in an earlier cell;
    by default it saves the ``content`` field rather than the title.

    Parameters
    ----------
    dirname : str
        Directory containing the index; opened with ``index.open_dir``.
    search_fields : list of str
        Schema fields the query is matched against.
    search_query : str
        Query text, parsed by a ``MultifieldParser`` whose grouping is
        ``qparser.OrGroup.factory(0.9)``.
    output_field : str, optional
        Name of the stored field written for each hit. Defaults to ``'content'``.
    output_path : str, optional
        UTF-8 text file that is overwritten with one value per line.
    limit : int, optional
        Maximum number of hits requested from the searcher.

    Returns
    -------
    list
        The values written to ``output_path``, in result order.
    """
    ix = index.open_dir(dirname)

    or_group = qparser.OrGroup.factory(0.9)
    parser = qparser.MultifieldParser(search_fields, ix.schema, group=or_group)
    query = parser.parse(search_query)

    # Read the stored values while the searcher is still open.
    with ix.searcher() as searcher:
        hits = searcher.search(query, terms=True, limit=limit)
        values = [hit[output_field] for hit in hits]

    # Open the output only after the search succeeded, so a failing query does
    # not truncate an existing file, and let `with` close it on every path.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for value in values:
            print(value, file=out_file)

    return values
    
    
# Search both the title and content fields for "coronavirus"; this cell's
# index_search writes the content of every matching document to its output file.
results_dict = index_search("index_dir", ['title', 'content'], u"coronavirus")

#results_dict = index_search("index_dir", ['title'], u"isolation")