In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
import numpy as np
import shutil
'''Load models:'''
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser,OrGroup, query
from whoosh import scoring
from whoosh import highlight
import os, os.path

import clean_dataset as clean

[nltk_data] Downloading package stopwords to /Users/jonas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jonas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the taxonomy table from ../data/processed, resolved against the
# notebook's current working directory.
processed_dir = os.path.abspath(os.path.join('..', 'data/processed/'))
df = pd.read_csv(processed_dir + "/taxonomy_final.csv")

In [8]:
# Show the `all_text` value of the first row as a quick sanity check.
df["all_text"].iloc[0]

"This programme will contribute to the protection of the biological diversity of the Nimba Mountains Biosphere Reserve, including the World Heritage Site (officially on the list of Sites in Danger since 1992).  The proposed programme will rely on integrated ecosystem management through participatory approaches according to the philosophy of a biosphere reserve in which conservation of globally important biodiversity, and landscape-level sustainable use of natural resources and sustainable development are harmonised. The programme will enhance mainstreaming of biodiversity conservation into local and national level sustainable development planning. The programme will also contribute to the development of a national system of protected areas. The programme consists of (I) support to the protection of three core reserve areas in the Nimba Mountains which cover a range of ecosystem types from high-altitude savannahs to montane, mid-altitude and lowland rainforest formations plus their asso

In [4]:
# Build the document-level Whoosh index: one indexed document per row of `df`.
# The index is written to ./whoosh (relative to the working directory) and is
# rebuilt from scratch on every run, so the cell is safe to re-execute.
whoosh_dir = os.path.abspath('whoosh')

# Test the same relative path that is removed and re-created below. Checking
# '/whoosh' (filesystem root) would never match, leaving the stale index in
# place and making os.mkdir fail with FileExistsError on a re-run.
if os.path.exists(whoosh_dir):
    shutil.rmtree(whoosh_dir)
os.mkdir(whoosh_dir)

# Columns of `df` to index. Each one becomes a TEXT field that is stored, so
# its value can be read back from a search hit.
index_fields = [
    'PIMS_ID',
    'leading_country',
    'title',
    'grant_amount',
    'all_text',
    'all_text_clean',
    'all_text_clean_spacy',
]
schema = Schema(**{name: TEXT(stored=True) for name in index_fields})

ix = create_in(whoosh_dir, schema)

# Used as a context manager, the writer commits on a clean exit and cancels
# (releasing the index write lock) if adding a document raises.
with ix.writer() as writer:
    for i in df.index:
        # Every value is coerced to str because all schema fields are TEXT.
        writer.add_document(**{name: str(df.loc[i, name]) for name in index_fields})

In [5]:
from whoosh.index import open_dir
# Re-open the on-disk index stored in ./whoosh.
ix = open_dir(os.path.abspath('whoosh'))

In [6]:
# --- Search parameters ------------------------------------------------------
# Free-text query; alternative questions are kept below for quick switching.
question = 'chemical and waste'
#question = 'How much was the chemical sector in Vietnam growing for the period 1990 – 2004?'
#question = 'In what substances are higher levels of chemicals of concern found?'

# Index fields the query is matched against, and the scoring model for ranking.
fields = ['all_text_clean']
weighting_type = scoring.BM25F()
#allow_q = query.Term("Lead_Country", "Vietnam")

# Combine query terms with OR; 0.9 is the bonus scaler handed to the factory.
og = OrGroup.factory(0.9)

# --- Parse the query --------------------------------------------------------
parser = MultifieldParser(fields, ix.schema, group=og)
q = parser.parse(question)

In [8]:
# Run the parsed query, print the top hits, and collect their IDs and titles.
pims, title = [], []
rule = '--------------------------'

with ix.searcher(weighting=weighting_type) as searcher:
    results = searcher.search(q, limit=10)
    # Highlighting configuration: sentence-sized fragments, matches upper-cased.
    results.fragmenter = highlight.SentenceFragmenter()
    results.formatter = highlight.UppercaseFormatter()

    print(rule)
    print('Question:', question)
    print(rule)
    print(rule)
    print("Search Results for most relevant document - searched in", fields, ":")

    for hit in results:
        pims.append(hit['PIMS_ID'])
        title.append(hit['title'])
        print('-------------')
        print('document:', hit['PIMS_ID'], hit['title'])

# Going through a dict keeps one title per PIMS_ID before building the table.
summary = pd.DataFrame(dict(zip(pims, title)).items(), columns=['PIMS_ID', 'title'])

--------------------------
Question: chemical and waste
--------------------------
--------------------------
Search Results for most relevant document - searched in ['all_text_clean'] :
-------------
document: 4905 Elimination of Obsolete Pesticide Stockpiles and addressing POPs Contaminated Sites within a Sound Chemicals Management Framework in Armenia
-------------
document: 5706 National Programme for the environmental Sound Management and Life cycle management of Chemical substances
-------------
document: 5361 Sound Chemicals Management Mainstreaming and UPOPs reduction in Kenya
-------------
document: 5481 Reducing UPOPs and mercury releases from healthcare waste management, e-waste treatment, scrap processing and biomass burning.
-------------
document: 5188 Asuncion Green City of the Americas  Pathways to Sustainability
-------------
document: 4600 Comprehensive Reduction And Elimination Of  Persistent  Organic  Pollutants In Pakistan
-------------
document: 4833 POPs Legacy 

## Build the split (chunk-level) index

In [3]:
# Break the cleaned text into shorter pieces via the project helper.
# NOTE(review): presumably 500 is the maximum chunk length; the unit (words or
# characters) is defined in clean_dataset.split_at_length, so confirm there.
splitted = clean.split_at_length(
    df,
    'all_text_clean',
    500,
)

In [4]:
# Display the chunk table (pandas truncates the rendering of long frames).
splitted

Unnamed: 0,text,PIMS_ID
0,this programme will contribute to the protecti...,1584
1,harmonised the programme will enhance mainstre...,1584
2,environments ii improving agricultural intensi...,1584
3,traditional medicinal plants in the buffer zon...,1584
4,development needs ecological integrity assured...,1584
...,...,...
5578,the project will provide technical assistance ...,6520
5579,sector engagement and fice d the programme wil...,6520
5580,the objective of the partnership initiative is...,6532
5581,the period of,6532


In [11]:
# Build the chunk-level Whoosh index: one indexed document per row of
# `splitted`. The index is written to ./split (relative to the working
# directory) and is rebuilt from scratch on every run.
# NOTE: this rebinds `schema` and `ix`, so any search run afterwards targets
# the chunk index (fields: PIMS_ID, text) rather than the document-level one.
split_dir = os.path.abspath('split')

# Test the same relative path that is removed and re-created below. Checking
# '/split' (filesystem root) would never match, leaving the stale index in
# place and making os.mkdir fail with FileExistsError on a re-run.
if os.path.exists(split_dir):
    shutil.rmtree(split_dir)
os.mkdir(split_dir)

# Both fields are stored so a hit can return the chunk text and its project ID.
schema = Schema(PIMS_ID=TEXT(stored=True),
                text=TEXT(stored=True))

ix = create_in(split_dir, schema)

# Used as a context manager, the writer commits on a clean exit and cancels
# (releasing the index write lock) if adding a document raises.
with ix.writer() as writer:
    # Iterate over the chunk table's own index. Looping over df.index would
    # only cover the first len(df) chunks and silently drop the rest.
    for i in splitted.index:
        writer.add_document(PIMS_ID=str(splitted.loc[i, "PIMS_ID"]),
                            text=str(splitted.loc[i, "text"]))