In [1]:
import pandas as pd
import spacy
import pickle

In [2]:
def build_segments_df(segments,merge_section_headers = False):
    """
    Build a pandas.DataFrame from a list of text segments.
    segments : list of tuples
        each touple has the struncture (label,string),
        where label can be either "text","header","subheader","table". All other labels are treated as "text"
    merge_section_headers : bool, default False
        If true, (sub)section headers and (sub)section text are merged.
    """
    
    
    section_cntr = 0
    subsection_cntr = 0

    segments_dict = {
        "section" : [],
        "section title" : [],
        "subsection" : [],
        "subsection title" : [],
        "string" : []}
    
    section_title = ""
    subsection_title = ""
    
    was_header = False
    for (ii,segment) in enumerate(segments):

        if segment[0] == "section":
            section_cntr = section_cntr + 1
            subsection_cntr = 0
            section_title = segment[1]
            was_header = True
        elif segment[0] == "subsection":
            subsection_cntr = subsection_cntr + 1
            subsection_title = segment[1]
            was_header = True
        else:
            was_header = False
        
        
        if merge_section_headers and not was_header and ii > 0:
            segments_dict["string"][-1] += "\n" + segment[1]
        else:
            segments_dict["section"].append(section_cntr)
            segments_dict["section title"].append(section_title)
            segments_dict["subsection title"].append(subsection_title)
            segments_dict["subsection"].append(subsection_cntr)
            segments_dict["string"].append(segment[1])

    segments_df = pd.DataFrame(segments_dict)
    return segments_df


In [3]:
with open("../../data cleaning/immigration_rules_scrape.pickle",'rb') as f:
    scrape_df = pickle.load(f)

scrape_df["text_segmented"] = scrape_df["text_segmented"].map(lambda x : build_segments_df(x,merge_section_headers = True))



scrape_df.head()

Unnamed: 0,URL,media type,title,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: Index,\nImmigration Rules: Index\nThe rules are divi...,section section title subsection subsectio...,[https://www.gov.uk/guidance/immigration-rules...,2020-08-20T11:07:43+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: introduction,\nImmigration Rules: introduction\nThis contai...,section secti...,[],2020-08-20T11:07:45+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 1: leave to enter or st...,\nImmigration Rules part 1: leave to enter or ...,section ...,[],2020-08-20T11:07:46+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 2: transitional provisi...,\nImmigration Rules part 2: transitional provi...,section ...,[],2020-08-20T11:07:48+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 3: students,\nImmigration Rules part 3: students\nPersons ...,section ...,[],2020-08-20T11:07:49+00:00


In [4]:
def find_keywords(string,keywords):
    """
    Finds substrings ("keywords") within a string. 
    string : str
    keywords : list of str
    returns : list of str
        list of keywords found
    """
    keywords_found = []
    for keyword in keywords:
        if string.find(keyword) != -1:
            keywords_found.append(keyword)
    return keywords_found 

In [5]:
def search_corpus(corpus,keywords,sort_by_relevance=True):
    """
    Searches a given document corpus for keywords
    corpus : pandas.DataFrame
    keywords : list of str
    sort_by_rlevance : bool, default False
        If true, sorts the result by the number of keywords found in decending order
    returns : pandas.DataFrame
        A table that hold all the subsections of the documents in which the keywords were found
    """
    search_result = None
    
    for ii in corpus.index:
        strings = corpus["text_segmented"][ii]["string"].copy()
        
        document = corpus["text_segmented"][ii][["section title", "subsection title"]].copy()
        document.insert(0,"document title",corpus["title"][ii])
        
        #print(corpus["title"][ii])
        
        #document["keywords"] = strings.map(lambda x: find_keywords(x,keywords))
        document["keywords found"] = strings.map(lambda x: find_keywords(x,keywords)).copy()
        document = document[document["keywords found"].map(lambda x: x != [])]
        document["#keywords"] = document["keywords found"].map(lambda x: len(x)).copy()
        
        
        if search_result is None:
            search_result = document
        else:
            search_result = pd.concat([search_result,document],ignore_index = True)
        
    #sort by number of keywords found
    if sort_by_relevance:
        search_result = search_result.sort_values(by = "#keywords",ascending=False,ignore_index = True)
    
    return search_result

In [6]:
 search_corpus(scrape_df,["Tier 2","sponsor","licence","EU","points"])
#scrape_df["text_segmented"][2]["string"]

Unnamed: 0,document title,section title,subsection title,keywords found,#keywords
0,Immigration Rules: introduction,Interpretation,,"[Tier 2, sponsor, licence, EU, points]",5
1,Immigration Rules: Index,,,"[Tier 2, sponsor, EU, points]",4
2,Immigration Rules Appendix A: attributes,Attributes for Tier 4 (General) Students,Table 16,"[sponsor, licence, EU, points]",4
3,Immigration Rules part 6A: the points-based sy...,"Tier 2 (General) Migrants, Tier 2 (Minister of...",245HD. Requirements for leave to remain,"[Tier 2, sponsor, licence, points]",4
4,Immigration Rules part 6A: the points-based sy...,Tier 4 (General) Student,245ZX. Requirements for leave to remain,"[Tier 2, sponsor, licence, points]",4
...,...,...,...,...,...
240,Immigration Rules Appendix A: attributes,Attributes for Tier 1 (Investor) Migrants,Table 3 - DELETED,[points],1
241,Immigration Rules Appendix A: attributes,Money and assets: notes,Table 3 - DELETED,[points],1
242,Immigration Rules Appendix A: attributes,Source of additional money (Table 9A and Table...,Table 3 - DELETED,[points],1
243,Immigration Rules Appendix A: attributes,Tier 2 (General) limit,Table 11CA,[Tier 2],1
