# Topic modeling with LDA

In [1]:
#!pip install -U scikit-learn scipy matplotlib
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from topicmodeling import extract_topics
import warnings
warnings.filterwarnings('ignore')

In [2]:
# File names of the five scraped raw-text documents:
# rawtext_webscraping0.txt through rawtext_webscraping4.txt.
source_list = ["rawtext_webscraping%s.txt" % index for index in range(5)]

`extract_topics` finds different topics in each document and prints the most frequent words in each topic.

The following example looks at these 5 web pages:

    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-index",

    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system",
    
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes",
    
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds",
    
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-b-english-language"

It prints the 10 most important words of 5 topics for each website.

In [3]:
# Run the topic extraction once per scraped file, in list order.
for scraped_file in source_list[:5]:
    extract_topics(scraped_file)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
appendix      leave         appendix      tier          paragraphs    
immigration   enter         migrants      migrants      appendix      
rules         remain        tier          appendix      overseas      
deleted       united        private       documents     language      
forces        kingdom       households    scheme        english       
partners      persons       workers       students      353           
hm            paragraphs    funds         student       353b          
paragraphs    limited       academic      children      12            
civil         entry         certificate   child         354           
nationals     members       maintenance   specified     356b          


topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
appl

Here we look at the 10 most common pairs of words (bigrams) in each of 5 topics from the first website in the list,

"https://www.gov.uk/guidance/immigration-rules/immigration-rules-index".

In [4]:
# Topic extraction on the first scraped file.
# NOTE(review): the positional arguments are presumably (number of topics,
# words per topic, n-gram size) — confirm against topicmodeling.extract_topics.
extract_topics(source_list[0],5,10,2)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
enter remain  united kingdomleave enter   transitional provisionsleave enter   
remain united hm forces     united kingdomchild student retired persons
leave enter   points based  appendix deletedtier child    independent means
united kingdomindefinite leaveentry clearanceindefinite leavepersons independent
limited leave control sectionclearance leaveimmigration rulesenter remain  
persons seekingimmigration actremain united highly skilledappendix fm   
seeking enter exempt immigrationenter remain  appendix immigration357 361       
persons limitedimmigration controlfamily membersyouth mobilitypartners persons
remain paragraphssection immigrationleave remain  mobility schemekingdom retired
children personsact 1971      enter united  administrative reviewunited kingdom




In the next example, we find 5 topics in the Sponsor Guidance for Tier 2 and 5, and print the 10 most important words.

In [5]:
# Text file analysed in this section; the same name is read again when the
# document-term matrix is built for the visualization further down.
source_file = "text_extraction_from_Tier_2_5.txt"
# Called with the file name only, so extract_topics uses its own defaults here.
extract_topics(source_file)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
guidance      charge        uk            licence       tier          
sponsors      sponsor       cos           tier          page          
sponsorship   pay           remain        worker        immigration   
version       refund        leave         sponsor       general       
tiers         immigration   service       application   act           
uk            paid          guidance      temporary     migrant       
certificate   skills        date          uk            company       
information   eea           assigned      work          level         
cos           organisation  section       workers       points        
employment    duties        migrant       apply         uk            




## Visualization of topics

We give an example of a visualization of the topics from the Sponsor Guidance, with 5 topics and 10 words per topic.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np
import mglearn

In [7]:
# Fit an LDA topic model on unigram counts of `source_file` and print the
# highest-weighted words of every topic.
N_TOPICS = 5   # number of LDA components, also the number of columns printed
N_WORDS = 10   # words listed per topic

vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
with open(source_file, "r", encoding='utf-8', errors='ignore') as file:
    # Every line of the file becomes one document of the corpus.
    SOURCE_TEXT = file.readlines()
dtm = vect.fit_transform(SOURCE_TEXT)

# get_feature_names() was deprecated and then removed from scikit-learn in
# favour of get_feature_names_out(); fall back so older installs still work.
try:
    features = np.asarray(vect.get_feature_names_out())
except AttributeError:
    features = np.asarray(vect.get_feature_names())

# Dense view of the document-term matrix, one column per vocabulary entry.
dtf = pd.DataFrame(dtm.toarray(), columns=features)

# A fixed seed keeps the fitted topics identical from one run to the next.
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=0)
lda_dtf = lda.fit_transform(dtm)

# Per topic (row), vocabulary indices ordered from largest to smallest weight.
sorting = np.argsort(lda.components_)[:, ::-1]
mglearn.tools.print_topics(topics=range(N_TOPICS), feature_names=features, sorting=sorting, topics_per_chunk=N_TOPICS, n_words=N_WORDS)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
licence       tier          uk            tier          uk            
sponsor       guidance      sponsorship   cos           office        
application   sponsors      certificate   migrant       level         
immigration   version       information   pay           make          
apply         tiers         work          leave         time          
refund        worker        act           company       home          
charge        page          immigration   remain        law           
migrants      temporary     gov           charge        charge        
circumstances workers       sponsor       applying      service       
skills        sponsor       organisation  general       work          




In [8]:
# print_function is already the default behaviour on Python 3; this import
# only has an effect on Python 2.
from __future__ import  print_function
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to ship this adapter under the
# name pyLDAvis.lda_model — confirm against the installed version.
import pyLDAvis.sklearn
# presumably turns on inline rendering of pyLDAvis output in the notebook — TODO confirm
pyLDAvis.enable_notebook()

In [9]:
# Prepare the visualization data from the fitted LDA model, the document-term
# matrix and the vectorizer created earlier in the notebook.
zit=pyLDAvis.sklearn.prepare(lda,dtm,vect)

  and should_run_async(code)


In [10]:
# Show the prepared topic visualization as this cell's output.
pyLDAvis.display(zit)

  and should_run_async(code)


# Text summarization with BART

In [19]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [20]:
# Both the model weights and the tokenizer come from the same checkpoint.
BART_CHECKPOINT = 'facebook/bart-large-cnn'

model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
# NOTE(review): max_length is forwarded as an extra keyword to the tokenizer;
# confirm it actually limits the encoded length.
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT, max_length=1024)

In [21]:
# NOTE: this rebinds source_file, which earlier in the notebook pointed at the
# Tier 2/5 sponsor guidance text used for topic modeling.
source_file = "example_text_from_Tier_5_guide.txt"

In [22]:
# Only the first line of the file is summarized, so read just that line
# instead of loading every line into a list first.
with open(source_file, "r", encoding='utf-8', errors='ignore') as file:
    TEXT_TO_SUMMARIZE = file.readline()
# readline() returns '' for an empty file; fail with a clear message rather
# than passing an empty string on to the summarizer.
if not TEXT_TO_SUMMARIZE:
    raise ValueError("%s is empty: nothing to summarize" % source_file)

In [23]:
# Show the passage that is about to be summarized.
print(TEXT_TO_SUMMARIZE)

You must demonstrate that you have the required funds in the form of cash funds in the bank (this includes savings accounts and current accounts even when notice must be given), or as a loan available to you. We will not take into account other assets (for example shares, bonds, overdrafts, credit cards or pension funds) or money you have earned while you were in breach of the United Kingdom’s immigration laws as evidence of maintenance funds. You can find details of the maintenance requirement in Appendix C of the Immigration Rules.


In [25]:
# Encode the passage as a batch of one. Truncate to 1024 tokens (the limit
# requested when the tokenizer was loaded) so that a longer passage cannot
# overflow the model's input.
inputs = tokenizer([TEXT_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors='pt')

# Generate Summary
# The attention mask is passed explicitly so generate() does not have to infer it.
summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4)
# batch_decode returns one decoded string per generated sequence.
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

['You must demonstrate that you have the required funds in the bank. This includes savings accounts and current accounts even when notice must be given. We will not take into account other assets (for example shares, bonds, overdrafts, credit cards or pension funds) You can find details of the maintenance requirement in Appendix C of the Immigration Rules.']
