In [3]:
import pandas as pd
import spacy
import pickle

In [4]:
def build_segments_df(segments,merge_section_headers = False):
    """
    Build a pandas.DataFrame from a list of text segments.
    segments : list of tuples
        each touple has the struncture (label,string),
        where label can be either "text","header","subheader","table". All other labels are treated as "text"
    merge_section_headers : bool, default False
        If true, (sub)section headers and (sub)section text are merged.
    """
    
    
    section_cntr = 0
    subsection_cntr = 0

    segments_dict = {
        "section" : [],
        "section title" : [],
        "subsection" : [],
        "subsection title" : [],
        "string" : []}
    
    section_title = ""
    subsection_title = ""
    
    was_header = False
    for (ii,segment) in enumerate(segments):

        if segment[0] == "section":
            section_cntr = section_cntr + 1
            subsection_cntr = 0
            section_title = segment[1]
            was_header = True
        elif segment[0] == "subsection":
            subsection_cntr = subsection_cntr + 1
            subsection_title = segment[1]
            was_header = True
        else:
            was_header = False
        
        
        if merge_section_headers and not was_header and ii > 0:
            segments_dict["string"][-1] += "\n" + segment[1]
        else:
            segments_dict["section"].append(section_cntr)
            segments_dict["section title"].append(section_title)
            segments_dict["subsection title"].append(subsection_title)
            segments_dict["subsection"].append(subsection_cntr)
            segments_dict["string"].append(segment[1])

    segments_df = pd.DataFrame(segments_dict)
    return segments_df


In [5]:
import sys
print(sys.version)

print(sys.getrecursionlimit())


3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
3000


In [6]:
#sys.setrecursionlimit(5000)


scrape_df = pd.read_pickle("labeled_immigration_rules.pickle")
#scrape_df = pd.read_pickle("labeled_immigration_rules_phrases10.pickle")




scrape_df.head(10)

Unnamed: 0,URL,media type,title,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: Index,\nImmigration Rules: Index\nThe rules are divi...,section section title subsection subsectio...,[https://www.gov.uk/guidance/immigration-rules...,2020-08-25T18:12:12+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: introduction,\nImmigration Rules: introduction\nThis contai...,section secti...,[],2020-08-25T18:12:16+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 1: leave to enter or st...,\nImmigration Rules part 1: leave to enter or ...,section ...,[],2020-08-25T18:12:20+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 2: transitional provisi...,\nImmigration Rules part 2: transitional provi...,section ...,[],2020-08-25T18:12:23+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 3: students,\nImmigration Rules part 3: students\nPersons ...,section ...,[],2020-08-25T18:12:27+00:00
5,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 4: work experience,\nImmigration Rules part 4: work experience\nP...,section ...,[],2020-08-25T18:12:30+00:00
6,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 5: working in the UK,\nImmigration Rules part 5: working in the UK\...,section ...,[],2020-08-25T18:12:33+00:00
7,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 6: self-employment and ...,\nImmigration Rules part 6: self-employment an...,section ...,[],2020-08-25T18:12:36+00:00
8,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 6A: the points-based sy...,\nImmigration Rules part 6A: the points-based ...,section ...,[],2020-08-25T18:12:45+00:00
9,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 7: other categories,\nImmigration Rules part 7: other categories\n...,section ...,[],2020-08-25T18:12:48+00:00


Simple keyword search of the text corpus
---

In [7]:
def find_keywords(string,keywords):
    """
    Finds substrings ("keywords") within a string. 
    string : str
    keywords : list of str
    returns : list of str
        list of keywords found
    """
    keywords_found = []
    for keyword in keywords:
        if string.find(keyword) != -1:
            keywords_found.append(keyword)
    return keywords_found 

In [8]:
def search_corpus(corpus,keywords,sort_by_relevance=True):
    """
    Searches a given document corpus for keywords
    corpus : pandas.DataFrame
    keywords : list of str
    sort_by_rlevance : bool, default False
        If true, sorts the result by the number of keywords found in decending order
    returns : pandas.DataFrame
        A table that hold all the subsections of the documents in which the keywords were found
    """
    search_result = None
    
    for ii in corpus.index:
        strings = corpus["text_segmented"][ii]["string"].copy()
        
        document = corpus["text_segmented"][ii][["section title", "subsection title"]].copy()
        document.insert(0,"document title",corpus["title"][ii])
        
        #print(corpus["title"][ii])
        
        #document["keywords"] = strings.map(lambda x: find_keywords(x,keywords))
        document["keywords found"] = strings.map(lambda x: find_keywords(x,keywords)).copy()
        document = document[document["keywords found"].map(lambda x: x != [])]
        document["#keywords"] = document["keywords found"].map(lambda x: len(x)).copy()
        
        
        if search_result is None:
            search_result = document
        else:
            search_result = pd.concat([search_result,document],ignore_index = True)
        
    #sort by number of keywords found
    if sort_by_relevance:
        search_result = search_result.sort_values(by = "#keywords",ascending=False,ignore_index = True)
    
    return search_result

In [9]:
search_corpus(scrape_df,["Tier 2","sponsor","licence","EU","points","UK"])


Unnamed: 0,document title,section title,subsection title,keywords found,#keywords
0,Immigration Rules: introduction,Interpretation,,"[Tier 2, sponsor, licence, EU, points, UK]",6
1,Immigration Rules: Index,,,"[Tier 2, sponsor, EU, points, UK]",5
2,Immigration Rules Appendix A: attributes,Attributes for Tier 4 (General) Students,Table 16,"[sponsor, licence, EU, points, UK]",5
3,Immigration Rules part 6A: the points-based sy...,"Tier 2 (General) Migrants, Tier 2 (Minister of...",245HD. Requirements for leave to remain,"[Tier 2, sponsor, licence, points, UK]",5
4,Immigration Rules part 6A: the points-based sy...,Tier 4 (General) Student,245ZX. Requirements for leave to remain,"[Tier 2, sponsor, licence, points, UK]",5
...,...,...,...,...,...
533,Immigration Rules part 13: deportation,A deportation order,Deportation and Article 8,[UK],1
534,Immigration Rules part 13: deportation,A deportation order,Rights of appeal in relation to a decision not...,[UK],1
535,Immigration Rules part 13: deportation,A deportation order,Returned deportees,[UK],1
536,Immigration Rules part 11B,Reception Conditions for non-EU asylum applicants,,[EU],1


keyword based search
---

In [10]:
def search_keyword_list(keyword_list,keywords):
    """
    keyword_list : list of (str,int)
    keywords : list of str
    returns : list of str
        list of (str,int)
    """
    keywords_found = []
    for keyword in keywords:
        for label in keyword_list:
            if label[0] == keyword:
                keywords_found.append(label)
        
    return keywords_found 

In [11]:



test_document = scrape_df.loc[1,"text_segmented"].copy()


test_document



Unnamed: 0,section,section title,subsection,subsection title,string,immigration status,region,company,company body,regulatory body,SOC,documents,relation,general,references,phrases
0,0,,0,,Immigration Rules: introduction\nThis contains...,[],[],[],[],[],[],[],[],[],"[(the Immigration Acts, 1)]",[]
1,1,Implementation and transitional provisions,0,,Implementation and transitional provisions\n\n...,[],[],[],[],[],[],[],[],[],[],[]
2,2,Application,0,,Application\n\n5.Save where expressly indicate...,[],"[(EEA, 1)]",[],[],[],[],[],[],[],[],[]
3,3,Interpretation,0,,Interpretation\n\n6.In these Rules the followi...,"[(Student, 16), (Tier 4, 12), (International G...","[(EEA, 11), (UK regions, 62), (refugee, 6), (n...","[(Charity, 2)]",[],"[(General Medical Council, 1)]","[(SOC, 2)]","[(evidence, 1), (letter, 2)]","[(partner, 17), (child, 16), (relationship, 4)...","[(sponsor, 41), (age, 9), (highly skilled, 3),...","[(the Immigration Acts, 1), (the 1993 Act, 1),...",[]


In [12]:
def update_keyword_list(keyword_list,new_keyword):
    """
    keyword_list : list of (str,int)
    new_keyword : (str,int)
    returns : list of (str,int)
    """
    
    if type(new_keyword) == str:
        new_keyword = (new_keyword,1)
        
    found_tf = False
    for (ii,keyword) in enumerate(keyword_list):
        if keyword[0] == new_keyword[0]:
            keyword_list[ii] = (keyword[0],keyword[1] + new_keyword[1])
            found_tf = True
            
    if not found_tf:
        keyword_list.append(new_keyword)
        
    return [keyword_list]

test_keywords = test_document.loc[2,"region"].copy()

print(test_keywords)
print(update_keyword_list(test_keywords,"foo"))
print(update_keyword_list(test_keywords,('Guernsey', 2)))
print(update_keyword_list(test_keywords,"Isle of Man"))

[('EEA', 1)]
[[('EEA', 1), ('foo', 1)]]
[[('EEA', 1), ('foo', 1), ('Guernsey', 2)]]
[[('EEA', 1), ('foo', 1), ('Guernsey', 2), ('Isle of Man', 1)]]


In [13]:
def merge_keyword_lists(mykeyword_lists):
    #print(type(keyword_lists))
    if len(mykeyword_lists) == 0:
        print("empty list")
        return []
    
    keyword_lists = mykeyword_lists
    #keyword_lists = mykeyword_lists.tolist()
    
    merged_list = []
    for keyword_list in keyword_lists:
        for keyword in  keyword_list:
            #print(merged_list)
            update_keyword_list(merged_list,keyword) 
            
    return merged_list

test_keywords1 = test_document.loc[0,"region"].copy()
test_keywords2 = test_document.loc[1,"region"].copy()
print(test_keywords1)
print(test_keywords2)
print(merge_keyword_lists([test_keywords1,test_keywords2,[]]))
print(merge_keyword_lists([]))

[]
[]
[]
empty list
[]


In [26]:
def aggregate_keywords(document_df):
    keyword_categories = document_df.columns.tolist()[5:]
    document_df["subsection_keywords"] = document_df[keyword_categories].sum(axis=1)
    
    
    #document_df["section_keywords"] = document_df.groupby("section")["subsection_keywords"].transform(merge_keyword_lists)
    
    #document_df["section_keywords"] = []*len(document_df)

    section_keywords = []
    for section,group in document_df.groupby("section"):
        section_keywords += len(group)*[
                merge_keyword_lists(group["subsection_keywords"].tolist())
            ]
    
    document_df["section_keywords"] = pd.Series(section_keywords)
    
    document_keywords = []
    for subscection_kw_list in document_df["subsection_keywords"]:
        for kw in subscection_kw_list:
            update_keyword_list(document_keywords,kw)
            
    document_df["document_keywords"] = [document_keywords.copy()]*len(document_df)
    
    return document_df

def aggregate_keywords_corpus(corpus):
    pass

In [27]:
test_document = scrape_df.loc[1,"text_segmented"].copy()

keyword_categories = test_document.columns.tolist()[5:]
test_document["subsection_keywords"] = test_document[keyword_categories].sum(axis=1)
test_document.groupby("section")["subsection_keywords"].transform(sum)

0                          [(the Immigration Acts, 1)]
1                                                   []
2                                           [(EEA, 1)]
3    [(Student, 16), (Tier 4, 12), (International G...
Name: subsection_keywords, dtype: object

In [28]:
test_document = scrape_df.loc[7,"text_segmented"].copy()
test_document_agg = aggregate_keywords(test_document)

test_document_agg.groupby("section").aggregate("first")

Unnamed: 0_level_0,section title,subsection,subsection title,string,immigration status,region,company,company body,regulatory body,SOC,documents,relation,general,references,phrases,subsection_keywords,section_keywords,document_keywords
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,,0,,Immigration Rules part 6: self-employment and ...,"[(Tier 1 (Investor), 1)]",[],[],[],[],[],[],[],[],[],[],"[(Tier 1 (Investor), 1)]","[(Tier 1 (Investor), 1)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."
1,Person intending to establish themselves in bu...,0,,Person intending to establish themselves in bu...,[],[],[],[],[],[],[],[],[],[],[],[],"[(Appendix F, 18)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."
2,Innovators,0,Refusal of extension of stay in order to remai...,Innovators,[],[],[],[],[],[],[],[],[],[],[],[],"[(Innovator, 6), (Appendix F, 15)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."
3,Persons intending to establish themselves in b...,0,Refusal of extension of stay as an innovator,Persons intending to establish themselves in b...,[],[],[],[],[],[],[],[],[],[],[],[],"[(Tier 1 (Investor), 3), (Appendix F, 3)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."
4,Requirements for an extension of stay as an in...,0,Refusal of leave to enter as an investor,Requirements for an extension of stay as an in...,"[(Tier 1 (Investor), 1)]",[],[],[],[],[],[],[],[],[],[],"[(Tier 1 (Investor), 1)]","[(Tier 1 (Investor), 3), (Appendix F, 8)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."
5,"Writers, composers and artists",0,Refusal of extension of stay as an investor,"Writers, composers and artists",[],[],[],[],[],[],[],[],[],[],[],[],"[(Appendix F, 6)]","[(Tier 1 (Investor), 7), (Appendix F, 50), (In..."


In [33]:
test_document.name = "foo"
test_document.name

'foo'

In [24]:
def search_corpus_keywords(corpus,keywords,level = "subsection",sort_by_relevance=True):
    """
    Searches a given document corpus for keywords
    corpus : pandas.DataFrame
    keywords : list of str
    sort_by_rlevance : bool, default False
        If true, sorts the result by the number of keywords found in decending order
    returns : pandas.DataFrame
        A table that hold all the subsections of the documents in which the keywords were found
    """
    search_result = None
    
    
    
    for ii in corpus.index:
        #text_segmented = corpus["text_segmented"][ii]["string"].copy()

        
        document = corpus.loc[ii,"text_segmented"].copy()
        document = aggregate_keywords(document)
        
        #list of all keywords
        # FIXME: Do this in a cleaner way
        #keyword_categories = document.columns.tolist()[5:]
        #document["all_keywords"] = document[keyword_categories].sum(axis=1)
        
        
        document.insert(0,"document title",corpus["title"][ii])
        document = document.drop(
            [
                "immigration status",
                "region","company",
                "company body",
                "regulatory body",
                "SOC",
                "documents",
                "relation",
                "general",
                "phrases"
            ],
            axis=1)
        
        
        if level == "subsection":
            document["keywords found"] = document["subsection_keywords"].map(lambda x: search_keyword_list(x,keywords)).copy()
        elif level == "document":
            document = document[0:1]
            document["keywords found"] = document["document_keywords"].map(lambda x: search_keyword_list(x,keywords)).copy()
            document = document.drop(["section title","subsection title"],axis=1)
        elif level == "section":
            document = document.groupby("section").first().reset_index().copy()
            document["keywords found"] = document["document_keywords"].map(lambda x: search_keyword_list(x,keywords)).copy()
            document = document.drop(["subsection title"],axis=1)
        else:
            raise NotImplementedError()
        
        document = document.drop(["section","subsection","string"],axis=1)
        document = document.drop(["subsection_keywords","section_keywords","document_keywords"],axis=1)
        
        
        document = document[document["keywords found"].map(lambda x: x != [])]
        
        document["relevance"] = document["keywords found"].map(lambda x: sum([label[1] for label in x])).copy()
        
        
        if search_result is None:
            search_result = document
        else:
            search_result = pd.concat([search_result,document],ignore_index = True)
        
    #sort by number of keywords found
    if sort_by_relevance:
        search_result = search_result.sort_values(by = "relevance",ascending=False,ignore_index = True)
        #pass
    
    return search_result

In [35]:
search_result = search_corpus_keywords(scrape_df,["Tier 2","sponsor","Student"],level="section")

search_result

Unnamed: 0,document title,section title,references,keywords found,relevance
0,Immigration Rules part 6A: the points-based sy...,"Tier 2 (General) Migrants, Tier 2 (Minister of...",[],"[(Tier 2, 84), (sponsor, 105), (Student, 90)]",279
1,Immigration Rules part 6A: the points-based sy...,Tier 1 (Investor) Migrants,[],"[(Tier 2, 84), (sponsor, 105), (Student, 90)]",279
2,Immigration Rules part 6A: the points-based sy...,Tier 2 (Intra-Company Transfer) Migrants,[],"[(Tier 2, 84), (sponsor, 105), (Student, 90)]",279
3,Immigration Rules part 6A: the points-based sy...,Tier 1 (Entrepreneur) Migrants,[],"[(Tier 2, 84), (sponsor, 105), (Student, 90)]",279
4,Immigration Rules part 6A: the points-based sy...,Tier 5 (Youth Mobility Scheme) Temporary Migrants,[],"[(Tier 2, 84), (sponsor, 105), (Student, 90)]",279
...,...,...,...,...,...
309,Immigration Rules Appendix V: visitor rules,VISITORS APPENDIX 5. PERMIT FREE FESTIVALS,[],"[(sponsor, 1)]",1
310,Immigration Rules part 11: asylum,,[],"[(sponsor, 1)]",1
311,Immigration Rules part 11: asylum,326A. Procedure,"[(Appendix FM, 1)]","[(sponsor, 1)]",1
312,Immigration Rules Appendix K: shortage occupat...,Table 1 - United Kingdom Shortage Occupation List,[],"[(sponsor, 1)]",1


In [None]:
search_result = search_corpus_keywords(scrape_df,["Tier 2","EEA","student","Ireland"],level="document")

search_result

In [None]:
pd.Series([["a","b"],[1,2]])
#sum(pd.Series([["a","b"],[1,2]]))

In [None]:
["a","b"] + [1,2]