# Some examples for text analysis with NLP

## EXAMPLE 1: Text summarization with BART

See: https://huggingface.co/transformers/model_doc/bart.html

In [1]:
#!pip install transformers
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [2]:
# Both the tokenizer and the model come from the same pretrained checkpoint,
# so name the checkpoint once and reuse it.
BART_CHECKPOINT = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Input file for the summarisation example; the following cell opens it as UTF-8 text.
source_file = "example_text_from_Tier_2_5.txt"

In [4]:
# Only the first line of the file is summarised, so read just that line
# instead of loading every line into a list and discarding the rest.
with open(source_file, "r", encoding='utf-8') as file:
    TEXT_TO_SUMMARIZE = file.readline()

# readline() returns "" for an empty file; fail early with a clear message
# rather than passing an empty string on to the model.
if not TEXT_TO_SUMMARIZE:
    raise ValueError(f"{source_file} is empty: there is no text to summarize")

In [5]:
# Show the passage that is about to be summarised.
print(TEXT_TO_SUMMARIZE)

We may refuse your application where you have previously been named as key personnel at any sponsor organisation where an application for a licence was refused within the last 6 months or where a licence has been revoked within the last 12 months. More information is given in Annex 2 and 6 of this guidance. We also reserve the right to undertake checks on persons associated with sponsors who do not fall under the general definition of ‘you’ or ‘your’. Such persons may include, for example, employees in positions of responsibility who are not directors or key personnel and financiers involved in the running of your institution. Where appropriate, we may refuse your application or take action against your licence.


In [6]:
# Tokenise the passage as a batch of one. Truncate to the 1024-token input
# limit of facebook/bart-large-cnn so that a longer passage is shortened
# instead of crashing the model with out-of-range position indices.
inputs = tokenizer(
    [TEXT_TO_SUMMARIZE],
    max_length=1024,
    truncation=True,
    return_tensors='pt',
)

# Generate the summary with beam search (4 beams). The attention mask produced
# by the tokenizer is passed along explicitly so that padding, if this is ever
# run on a batch of several texts, is ignored by the model.
# Optional generate() arguments such as max_length=50 or early_stopping=True
# can be added here to cap the length of the summary.
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    num_beams=4,
)

# Decode each generated id sequence back to text, dropping special tokens.
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

['We reserve the right to undertake checks on persons associated with sponsors who do not fall under the general definition of ‘you’ or ‘your’ Such persons may include, for example, employees in positions of responsibility who are not directors or key personnel and financiers involved in the running of your institution. We may refuse your application or take action against your licence.']


## EXAMPLE 2: Topic modelling with LDA

Based on: https://github.com/chibueze-oguejiofor/Machine-Learning-In-Law/tree/master

In [7]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



In [8]:
# Bag-of-words vectoriser: unigrams only, with scikit-learn's built-in
# English stop-word list removed.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)

In [9]:
# Input file for the topic-modelling example. This rebinds the `source_file`
# name that the summarisation example used for a different file.
source_file = "text_extraction_from_Tier_2_5.txt"

In [10]:
# Load the corpus: each line of the file becomes one list entry
# (line endings are kept, exactly as readlines() returns them).
with open(source_file, mode="r", encoding='utf-8') as text_file:
    SOURCE_TEXT = list(text_file)

In [11]:
# Learn the vocabulary from SOURCE_TEXT (one document per line) and build the
# document-term count matrix in a single step.
dtm=vect.fit_transform(SOURCE_TEXT)

In [12]:
# CountVectorizer.get_feature_names() is gone from current scikit-learn
# releases, where get_feature_names_out() replaces it. Prefer the new method
# and fall back to the old one so the cell runs on either side of the change.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Dense view of the document-term matrix: one row per document, one column
# per vocabulary term. Left as the last expression so the notebook renders it.
pd.DataFrame(dtm.toarray(), columns=feature_names)

Unnamed: 0,able,abode,abuse,abused,academic,accepted,accepts,accordance,accordingly,account,...,worker,workers,workforce,working,works,year,years,young,youth,zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Fit a 5-topic LDA model on the document-term matrix.
# LDA starts from a random initialisation, so without a fixed seed the topics
# (and their numbering) change on every run. Later cells refer to topics by
# index, so the seed is pinned to keep those indices meaningful across reruns.
lda = LatentDirichletAllocation(n_components=5, random_state=0)

# Document-topic matrix: one row per document, one column per topic.
lda_dtf = lda.fit_transform(dtm)

In [14]:
import numpy as np

# For every topic (row of lda.components_), the vocabulary indices ordered
# from the highest-weighted word to the lowest.
sorting = np.argsort(lda.components_)[:, ::-1]

# Vocabulary terms as an array so they can be indexed with `sorting`.
# get_feature_names() is gone from current scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vect, "get_feature_names_out"):
    features = np.array(vect.get_feature_names_out())
else:
    features = np.array(vect.get_feature_names())

In [15]:
import mglearn

# Print the 10 highest-weighted words of every topic, five topics per row.
# The topic range is taken from the model itself so that it cannot drift out
# of sync with the n_components value used when the LDA model was created.
mglearn.tools.print_topics(
    topics=range(lda.n_components),
    feature_names=features,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10,
)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
tier          immigration   page          guidance      worker        
sponsor       act           application   sponsors      tier          
uk            uk            licence       version       sponsorship   
licence       test          charge        tiers         temporary     
information   market        uk            uk            licence       
general       labour        sponsor       sponsor       certificate   
migrant       resident      applying      work          cos           
company       charge        cos           licence       apply         
guidance      points        paid          duties        workers       
immigration   months        migrant       refund        pay           




In [16]:
# Document indices ordered by their weight on topic 2, strongest first.
Agreement_Topic = np.argsort(lda_dtf[:, 2])[::-1]

# Print the four documents that load most heavily on that topic.
for i in Agreement_Topic[:4]:
    # Entries of SOURCE_TEXT still carry the line ending from readlines();
    # strip it so the closing period stays on the same line as the text
    # instead of being printed on a line of its own.
    # (The former ".".join(text.split(".")) round trip returned the text
    # unchanged, so it has been dropped.)
    print(SOURCE_TEXT[i].rstrip() + ".\n")

 'Transitional arrangements for standard occupational classification code skill level',
.

 ' applying from outside the UK for entry clearance a visa to work in the UK ',
.

 'employer to a UK branch which is linked by common ownership or control ',
.

 ' Compliance officers will refer cases of illegal working for prosecution or the ',
.



In [17]:
# Document indices ordered by their weight on topic 4, strongest first.
Domain_Name_Topic = np.argsort(lda_dtf[:, 4])[::-1]

# Print the first two sentences of the four documents that load most heavily
# on that topic.
for i in Domain_Name_Topic[:4]:
    # Entries of SOURCE_TEXT still carry the line ending from readlines();
    # strip it first so the closing period stays on the same line as the text
    # instead of being printed on a line of its own.
    first_sentences = SOURCE_TEXT[i].rstrip().split(".")[:2]
    print(".".join(first_sentences) + ".\n")

 ' ICT Long term Staff subcategory The charge came into force on April ',
.

 'the basis of their gender gender identity sexual orientation marital status ',
.

 'settled or EEA worker either on a long term basis or for frequent short ',
.

 'Tier Ministers of Religion and Tier Temporary Worker Religious Workers ',
.



## Visualization of topics

In [18]:
# `print_function` only changes behaviour on Python 2. A __future__ import has
# to remain the first statement of the cell.
from __future__ import  print_function
import pyLDAvis
import pyLDAvis.sklearn
# Turn on pyLDAvis's notebook integration before any visualisation is shown.
pyLDAvis.enable_notebook()

In [19]:
# Build the pyLDAvis visualisation data from the fitted LDA model, the
# document-term matrix and the vectorizer that produced that matrix.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` — confirm against the installed version.
zit=pyLDAvis.sklearn.prepare(lda,dtm,vect)

In [20]:
# Render the prepared topic visualisation as this cell's output.
pyLDAvis.display(zit)

## EXAMPLE 3: Predicting missing words with BERT

See: https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model

In [21]:
#!pip install transformers
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import torch

In [22]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sentence containing the [MASK] token to fill in. The [CLS] / [SEP] markers
# are written out by hand because tokenize() does not add special tokens.
# Alternative example sentence:
#text = '[CLS] I want to [MASK] the car because it is cheap . [SEP]'
text = '[CLS]We may use the information that you [MASK] to us when you apply for a licence. [SEP]'
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Position of the token to predict (raises ValueError if text has no [MASK]).
masked_index = tokenized_text.index('[MASK]')

# Create the segments tensors: a single sentence, so every token is segment 0.
segments_ids = [0] * len(tokenized_text)

# Convert inputs to PyTorch tensors (each wrapped in a batch of size one)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()  # inference mode: disables dropout

# Predict all tokens
with torch.no_grad():
    # The segment ids must be passed by keyword. In transformers the second
    # positional parameter of BertForMaskedLM.forward() is `attention_mask`,
    # so passing them positionally supplied an all-zero attention mask and no
    # token_type_ids at all.
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)

# predictions[0] holds the vocabulary scores with a leading batch dimension;
# take the single batch entry, then the scores at the masked position, and
# pick the highest-scoring vocabulary id.
predicted_index = torch.argmax(predictions[0][0][masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

print(predicted_token)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


provide
