# Run Trained Model on EULA Clauses
* Load Model
* Extract clauses from pdf and docx
* save pdf and dox clauses as pandas dataframe (for visualization)
* run each clause through Model
* display prediction and confidence


In [70]:
# For Loading BERT trained model
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator


import matplotlib.pyplot as plt
# for pdf and docx extraction
import pandas as pd
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter#process_pdf
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
import docx
from tqdm import tqdm
# for stripping and preprocessing text
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
stop_words = set(stopwords.words('english'))
no_nonsense_re = re.compile(r'^[a-zA-Z^508]+$')


## Load Model

In [45]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea
def load_checkpoint(load_path, model,device):
    
    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']
def predict(model,inputs):
    #print('model(inputs): ', model(inputs))
    return model.encoder(inputs)[0]
def custom_forward(model,inputs):
    preds = predict(model,inputs)
    return torch.softmax(preds, dim = 1)[:, 0] # for negative attribution, torch.softmax(preds, dim = 1)[:, 1] <- for positive 


In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [126]:
def load_model_and_tokenizer(path_to_model, device):
    '''
    function to load pretrained model and tokenizer
    '''
    best_model = BERT().to(device)

    load_checkpoint(path_to_model + '/model.pt', best_model,device)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return best_model,tokenizer
PATH_TO_MODEL = '/Users/andrewmendez1/Documents/ai-ml-challenge-2020/data/Finetune BERT oversampling 8_16_2020/Model_1_4_0'
best_model,tokenizer = load_model_and_tokenizer(PATH_TO_MODEL,device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [49]:
# best_model.encoder

## Extract Clauses from pdf and docx

In [23]:

def strip_nonsense(doc,remove_stop_words=False,port_stem=False):
    """
    Returns stemmed lowercased alpha-only substrings from a string
    
    Parameters:
        doc (str): the text of a single FBO document.
        
    Returns:
        words (str): a string of space-delimited lower-case alpha-only words (except for `508`)
    """
    
    doc = doc.lower()
    doc = doc.split()
    words = ''
    for word in doc:
        m = re.match(no_nonsense_re, word)
        if m:
            match = m.group()
            if remove_stop_words and match in stop_words:
                continue
            else:
                if port_stem == True:
                    match_len = len(match)
                    if match_len <= 17 and match_len >= 3:
                        porter = PorterStemmer()
                        stemmed = porter.stem(match)
                        words += stemmed + ' '
                else:
                    words+= match+ ' '
    return words
def extract_clauses_from_pdf(path_to_pdf):
    '''
    Extracts clauses from pdf
    First segments PDF into pages
    Then extracts clauses from all paragraphs in page
    '''
    fp = open(path_to_pdf, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # print(type(retstr))
    codec = 'utf-8'
    laparams = LAParams(line_margin=0.1)
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_no = 0
    pages = []
    for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
        print('Processing page {} from {}'.format(page_no,path_to_pdf))
        # if pageNumber == page_no:
        interpreter.process_page(page)

        data = retstr.getvalue()
        pages.append(data)

        # with open(os.path.join('Files/Company_list/0010/text_parsed/2017AR', f'pdf page {page_no}.txt'), 'wb') as file:
        #     file.write(data.encode('utf-8'))
        data = ''
        retstr.truncate(0)
        retstr.seek(0)

        page_no += 1
    # split pages into claues
    clauses_per_page = []
    for p in pages:
        clauses_unormalized = [i.replace("\n"," ") for i in p.split("\n\n")]
        clauses_per_page.append(clauses_unormalized)
    return clauses_per_page

# the underlying XML does not make it easy to identify page breaks
def get_text_from_docx(filename):
    '''
    Function that uses python-docx to extract clauses (sentences) from docx.
    Loops through document, finds paragraphs
    '''
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        clause = para.text
        clause = strip_nonsense(clause)
        if len(clause)>2:
            fullText.append(clause)
    return fullText
def preprocess_clauses_pdf(pages):
    '''
    preprocess clauses
    '''
    clauses = []
    for p in pages:
        for clause in p:
            # do not include if length < 2
            clause_normalized = strip_nonsense(clause)
            if len(clause_normalized) >2:# append if has at least one word
                clauses.append(clause_normalized)
            # stip nonsense
    return clauses
# get_text_from_docx('/Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.docx')
# pages = extract_clauses_from_pdf('../reference/sample_eula_1.pdf')

In [21]:
pages_pdf = extract_clauses_from_pdf('/Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf')

Processing page 0 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 1 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 2 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 3 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 4 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 5 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 6 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 7 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 8 from /Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.pdf
Processing page 9 from /Users/andrewmendez1/Documents/ai-ml-challenge-202

In [22]:
clauses_docx = get_text_from_docx('/Users/andrewmendez1/Documents/ai-ml-challenge-2020/reference/sample_eula_1.docx')
clauses_pdf = preprocess_clauses_pdf(pages_pdf)

In [25]:
print(len(clauses_pdf))
print(len(clauses_docx))

493
183


## (DONT DO) Save pdf and docx as pandas data frame

## Run Each Clause through model, display prediction and confidence

In [39]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

clauses_pdf[70]

'annual basis at request including any changes that have been made to the plan since the prior company will also ensure '

In [67]:
# text = clauses_pdf[70]
text = 'company warrants that the software for a period of sixty days from the date of your perform substantially in accordance with software written materials accompanying except as just parties acknowledge that the software is provided and may not be functional on every machine or in every except as set forth company disclaims all warranties relating to the express or but not limited any warranties against infringement of third party merchantability and fitness for a particular'
def tokenize_text(text):
    text_ids = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=128)
    input_ids = [cls_token_id] + text_ids + [sep_token_id]
    input_ids = torch.tensor([input_ids], device=device)
    return input_ids
input_ids = tokenize_text(text)

In [60]:
score = predict(best_model,input_ids)

print('Sentence: ', text)
pred_label = torch.argmax(score[0]).cpu().numpy()
print('Classification: ' + str(pred_label) + \
      ', Probability Not acceptable (Positive Class): ' + str(torch.softmax(score, dim = 1)[0][1].cpu().detach().numpy()))

Sentence:  annual basis at request including any changes that have been made to the plan since the prior company will also ensure 
Classification: 0, Probability Not acceptable (Positive Class): 0.03749417


In [68]:
def get_prediction_and_confidence(model,text):
    '''
    pass text and return prediction label and confidence
    '''
    text_ids = tokenize_text(text)
    score = predict(model,text_ids)
    pred_label = torch.argmax(score[0]).cpu().numpy()
    confidence = torch.softmax(score, dim = 1)[0][pred_label].cpu().detach().numpy()
    return pred_label,confidence
pred, confidence = get_prediction_and_confidence(best_model,text)
print("Classification: {}, Confidence: {}".format(pred, confidence))



Classification: 1, Confidence: 0.9458168745040894


## Process all clauses in PDF

In [82]:
classifications_pdf = []
for clause in tqdm(clauses_pdf):
    pred,confidence = get_prediction_and_confidence(best_model,clause)
    classifications_pdf.append([clause,int(pred),float(confidence)])


100%|██████████| 493/493 [02:00<00:00,  4.09it/s]


In [79]:
classifications_pdf[0]

['company warrants that the software for a period of sixty days from the date of your perform substantially in accordance with software written materials accompanying except as just parties acknowledge that the software is provided and may not be functional on every machine or in every except as set forth company disclaims all warranties relating to the express or but not limited any warranties against infringement of third party merchantability and fitness for a particular',
 '1',
 '0.9458169']

## Process all clauses in DOCX

In [83]:
classifications_docx = []
for clause in tqdm(clauses_pdf):
    pred,confidence = get_prediction_and_confidence(best_model,clause)
    classifications_docx.append([clause,int(pred),float(confidence)])

100%|██████████| 493/493 [01:47<00:00,  4.57it/s]


In [88]:
for i in classifications_pdf:
    if i[1]==1:
        print(i)

['contract clause this agreement may only be amended in a writing signed by both ', 1, 0.5321720838546753]
['availability and use of the company will make the services available to client in accordance with each order form entered ', 1, 0.6579024791717529]
['changes to the company may make modifications and enhancements to the services from time to ', 1, 0.9722101092338562]
['presented to client for review and will not be effective unless and until both parties sign a written agreement updating these ', 1, 0.6191073656082153]
['with respect to long term company follows industry standard best practices in having monthly and seven yearly ', 1, 0.853610098361969]
['center providers utilized in the provision of the services to client acknowledges and agrees that all reviews constitute ', 1, 0.9768471121788025]
['confidential information of company recognizes that federal agencies are subject to the freedom of information 5 ', 1, 0.9160873293876648]
['order client will order services by sig

In [114]:
pd.DataFrame(np.arange(9).reshape(3,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [120]:
np.array(predictions).shape

NameError: name 'predictions' is not defined

In [123]:
import numpy as np
def save_list_as_dataframe(predictions, columns):
    '''
    Save Predictions as a Pandas Data Frame
    '''
    df = pd.DataFrame(np.array(predictions))
    df.columns = columns
    return df

df = save_list_as_dataframe(classifications_docx,['text','classification','confidence'])

Unnamed: 0,text,classification,confidence
0,master services subscription agreement,0,0.8188013434410095
1,full legal,0,0.8465394973754883
2,company,0,0.7949044108390808
3,business entity,0,0.9038620591163635
4,corporation,0,0.8643710613250732
...,...,...,...
488,company will provide client with a service cre...,1,0.8014154434204102
489,agreement has the service credit will be calcu...,0,0.8918464779853821
490,of system availability per calendar month,0,0.9904935956001282
491,service credit,0,0.8602070212364197


In [None]:
# df.save_csv('')