In [2]:
import pandas as pd
import spacy
import pickle
import numpy as np

from tfidf_search import *

In [3]:
def clean_up_corpus(mycorpus):
    """Return copies of the corpus documents with a harmonised set of columns.

    The value of the "document type" column in a document's first row
    decides how that document is treated:

    * ``"html"``: the columns "subsubsection" (filled with 0),
      "subsubsection title" (filled with "") and "page_no" (filled with 1)
      are inserted at positions 4, 5 and 6 when they are not present yet.
    * ``"pdf"``: missing "subsection" / "subsubsection" columns are added
      (filled with 0), the frame is restricted to a fixed list of columns,
      and empty "section title", "subsection title" and
      "subsubsection title" columns are inserted at positions 1, 3 and 5.
    * any other value: the document is copied unchanged.

    Parameters
    ----------
    mycorpus : iterable of pandas.DataFrame
        One frame per document; each must have a "document type" column.

    Returns
    -------
    list of pandas.DataFrame
        Cleaned copies, in input order. The input frames are not modified.

    Raises
    ------
    IndexError
        If a document has no rows (its type cannot be determined).
    KeyError
        If "document type" is missing, or a pdf document lacks one of the
        columns selected for the fixed pdf layout.
    """
    # (position, column name, fill value) for html documents.
    html_defaults = (
        (4, "subsubsection", 0),
        (5, "subsubsection title", ""),
        (6, "page_no", 1),
    )
    # Column order kept for pdf documents before the title columns are added.
    pdf_columns = [
        "section", "subsection", "subsubsection", "page_no", "text",
        "document title", "document url", "document type",
        "keywords", "needed records", "references", "reference urls",
    ]
    # (position, column name) of the empty title columns added to pdf documents.
    pdf_title_columns = (
        (1, "section title"),
        (3, "subsection title"),
        (5, "subsubsection title"),
    )

    corpus = []

    for mydocument in mycorpus:
        document = mydocument.copy()

        # Positional access: a frame whose index does not contain the label 0
        # (e.g. a slice of a larger frame) would make `[0]` raise KeyError.
        doc_type = document["document type"].iloc[0]

        if doc_type == "html":
            # Only add what is missing, so cleaning an already cleaned
            # document does not fail with "column already exists".
            for position, column, default in html_defaults:
                if column not in document.columns:
                    document.insert(position, column, default)
        elif doc_type == "pdf":
            if "subsection" not in document.columns:
                document.insert(1, "subsection", 0)
            if "subsubsection" not in document.columns:
                document.insert(2, "subsubsection", 0)

            # Selecting with a list drops any other column (including title
            # columns from a previous pass), so the inserts below cannot clash.
            document = document[pdf_columns]

            for position, column in pdf_title_columns:
                document.insert(position, column, "")

        corpus.append(document)

    return corpus

In [4]:
# Load the pickled corpus from disk, then normalise every document's columns.
# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
with open('labeled_corpus.pickle', 'rb') as corpus_file:
    raw_corpus = pickle.load(corpus_file)

labeled_corpus = clean_up_corpus(raw_corpus)


In [5]:
# Display the second cleaned document (list index 1) to check its column layout.
labeled_corpus[1]

Unnamed: 0,section,subsection,text,section title,subsubsection,subsubsection title,page_no,subsection title,document title,document url,document type,keywords,needed records,references,reference urls
0,0,0,Immigration Rules Appendix 6: academic subject...,,0,,1,,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Commonwealth, 1)]","[(certificate, 2)]",[(Immigration Rules Appendix 6: academic subje...,[https://www.gov.uk/guidance/immigration-rules...
1,1,0,1. Doctorate or Masters by research:,1. Doctorate or Masters by research:,0,,1,,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
2,1,1,Subjects allied to Medicine:,1. Doctorate or Masters by research:,0,,1,Subjects allied to Medicine:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
3,1,1,\nCAH codes:\nCAH02-02-01 - Pharmacology\nCAH0...,1. Doctorate or Masters by research:,0,,1,Subjects allied to Medicine:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
4,1,2,Biological Sciences:,1. Doctorate or Masters by research:,0,,1,Biological Sciences:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
5,1,2,\nCAH codes:\nCAH03-01-02 - Biology (non-speci...,1. Doctorate or Masters by research:,0,,1,Biological Sciences:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
6,1,3,"Veterinary Sciences, Agriculture and related s...",1. Doctorate or Masters by research:,0,,1,"Veterinary Sciences, Agriculture and related s...",Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
7,1,3,\nCAH codes:\nCAH05-01-02 - Others in Veterina...,1. Doctorate or Masters by research:,0,,1,"Veterinary Sciences, Agriculture and related s...",Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
8,1,4,Physical Sciences:,1. Doctorate or Masters by research:,0,,1,Physical Sciences:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
9,1,4,\nCAH codes:\nCAH-07-02-01 - Chemistry\nCAH-07...,1. Doctorate or Masters by research:,0,,1,Physical Sciences:,Immigration Rules Appendix 2: police registration,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]


##### Keyword-based search
---

In [6]:
# Initialise the keyword search from the cleaned corpus. The three returned
# objects are handed unchanged to search_corpus_tfidf in the search cell.
# NOTE(review): presumably the per-segment TF-IDF weights, the IDF vector and
# the vocabulary — confirm against tfidf_search.tfidf_init_corpus.
corpus_tfidf,idf_vector,vocab = tfidf_init_corpus(labeled_corpus)

In [9]:
# Query the corpus at "segment" level and order the hits by relevance.
# NOTE(review): the first keyword is an empty string, kept exactly as in the
# original query — confirm whether it is intentional.
search_corpus_tfidf(
    ["", "Tier 4", "points-based", "child"],
    corpus_tfidf,
    idf_vector,
    vocab,
    sort_by_relevance=True,
    level="segment",
)

Unnamed: 0,section,section title,subsection,subsection title,subsubsection,subsubsection title,page_no,text,document title,document url,document type,keywords,needed records,references,reference urls,keywords found,relevance
0,41.0,Attributes for Tier 4 (Child) Students,1.0,Table 17,0,,1,Table 17,Immigration Rules Appendix AR: administrative ...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 2), (child, 2)]",[],[],[],"[(Tier 4, 2), (child, 2)]",0.751120
1,41.0,Attributes for Tier 4 (Child) Students,2.0,Notes,0,,1,Notes,Immigration Rules Appendix AR: administrative ...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 2), (child, 2)]",[],[],[],"[(Tier 4, 2), (child, 2)]",0.751120
2,6.0,Tier 4 (Child) Students,0.0,Tier 4 (Child) Students,0,,1,Tier 4 (Child) Students,Immigration Rules Appendix B: English language,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 4), (child, 4)]",[],[],[],"[(Tier 4, 4), (child, 4)]",0.751120
3,6.0,Tier 4 (Child) Students,1.0,Notes,0,,1,Notes,Immigration Rules Appendix B: English language,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 2), (child, 2)]",[],[],[],"[(Tier 4, 2), (child, 2)]",0.751120
4,2.0,Innovators,0.0,Innovators,0,,1,Innovators,Immigration Rules part 6A: the points-based sy...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(points-based, 3)]",[],[],[],"[(points-based, 3)]",0.748880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,13.0,"Parents, grandparents and other dependent rela...",1.0,Requirements for leave to enter or remain in t...,0,,1,\n319V. The requirements to be met by a person...,Immigration Rules part 7: other categories,https://www.gov.uk/guidance/immigration-rules/...,html,"[(parent, 14), (grandparent, 14), (refugee, 11...",[],[],[],"[(child, 2)]",0.502902
722,1.0,Family Members - Specified Evidence,2.0,Calculating Gross Annual Income under Appendix FM,0,,1,\n13. Based on evidence that meets the require...,Immigration Rules Appendix E: maintenance (fun...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(family, 5), (points, 3), (date, 35), (applic...","[(evidence, 10), (Franchise Agreement, 1), (ta...",[],[],"[(child, 2)]",0.502143
723,20.0,Money and assets: notes,0.0,Attributes for Tier 1 (Entrepreneur) Migrants,0,,1,\n60. Money is disposable in the UK if all of ...,Immigration Rules Appendix AR: administrative ...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1, 2), (exchange, 1), (date, 7), (appli...","[(evidence, 2), (certificate, 1), (Corporate/B...",[],[],"[(child, 1)]",0.501861
724,4.0,Eligibility for indefinite leave to enter or ...,0.0,Purpose,0,,1,Condition ...,Immigration Rules Appendix ECAA,https://www.gov.uk/guidance/immigration-rules/...,html,"[(EEA, 26), (family, 8), (relationship, 3), (r...","[(evidence, 1)]",[],[],"[(child, 1)]",0.501299
