# Bert Test (with translation)

## Import librares for notebook.

In [3]:
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize
from googletrans import Translator
import pandas as pd
from deep_translator import GoogleTranslator
from sklearn.metrics.pairwise import cosine_similarity

## Import and translate data

In [4]:
#Import senator csv from senator_data_file
senators = pd.read_csv(r'/Users/jmlunamugica/code/jomilu93/sivico/senators_data.csv', converters={'initiative_list': pd.eval})

In [5]:
#Split the concatenated initiatives for each senator into bites smaller than 5000 to be able to translate.
senators["initiatives_summary_dummy_split"] = ""

n = 4999

for i, row in senators.iterrows():
    if not row["initiative_list"] == []:
        initiatives_split = [row["initiatives_summary_dummy"][i:i+n] for i in range(0, len(row["initiatives_summary_dummy"]), n)]
        senators.at[i, "initiatives_summary_dummy_split"] = initiatives_split
    else:
        senators.at[i, "initiatives_summary_dummy_split"] = []

In [None]:
def translate(in)

In [6]:
#Translate the concatenated, split string of initiatives per senator, store in new column.

senators["initiatives_summary_dummy_split_en"] = ""

senators["initiatives_summary_dummy_split_en"] = senators["initiatives_summary_dummy_split"].apply(lambda x:)

for i, row in senators.iterrows():
#     print(f"Working on row {i} of {len(senators)}")
    if len(row["initiatives_summary_dummy_split"]) >= 1:
        en_initiatives = GoogleTranslator(source='es', target='en').translate_batch(row["initiatives_summary_dummy_split"])
        senators.at[i, "initiatives_summary_dummy_split_en"] = en_initiatives
#         print(f"Summary {i} translated successfully. Sample: {en_initiatives[0][:20]}")
    else:
#         print(f"Senator number {i}, senator {row['senadores']} has no initiatives to translate.")
        continue

In [7]:
#Join all split, translated initiatives into one long english string per senator.

senators["initiatives_summary_dummy_en"] = senators["initiatives_summary_dummy_split_en"].apply(lambda x: "".join(x))

In [8]:
senators["initiatives_summary_dummy_en"][0]

'It intends to guarantee members of the Mexican Foreign Service, in a higher percentage than that established in current legislation, access to state representation positions such as Ambassador or Consul. It proposes to establish greater clarity in the subdivision of the ordinary commissions of Education, Culture and Science and Technology, suppressing the only current denomination of the ordinary commission of "Education, Culture, Science and Technology". It proposes establishing that the Institute of Expert Services and Forensic Sciences will be the technical support area for the Prosecutor\'s Office that will have with the objective of strengthening its institutional capacity to carry out its functions in a specialized and scientific manner. It determines the establishment of an Expert Career Service. It proposes to establish that citizen initiatives have the preferential character. It proposes to establish that the National Educational System will consider education in terms of pri

## Feed text through BERT model to create a summary.

In [9]:
#Import BERT model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
senators["initiative_summary_en"] = ""

In [14]:
#     print(f"Working on summary {i} of {len(senators)}")

sentences = sent_tokenize(senators["initiatives_summary_dummy_en"][35])
print(f"This senator has {len(sentences)} sentences.")

n = 100
sentences_split = [sentences[i:i+n] for i in range(0, len(sentences), n)]

summary = []

for sen_split in sentences_split:
    
    tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sen_split]

    max_len = 0
    for i in tokenized_sentences:
        if len(i) > max_len:
            max_len = len(i)

    padded_sentences = []
    for i in tokenized_sentences:
        while len(i) < max_len:
            i.append(0)
        padded_sentences.append(i)

    input_ids = torch.tensor(padded_sentences)

    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]

    sentence_embeddings = []
    for i in range(len(sen_split)):
        print(f"Appending sentence {i}")
        sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).numpy())

    # Compute the similarity matrix
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Generate the summary
    num_sentences = round(len(sen_split)*.25)
    summary_sentences = []
    for i in range(num_sentences):
        sentence_scores = list(enumerate(similarity_matrix[i]))

    sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    
    for i in range(num_sentences):
        summary_sentences.append(sentences_split[sentence_scores[i][0]])

    sub_summary = ' '.join(summary_sentences)
    
    summary.append(sub_summary)
    
    print("Summary batch completed.")

summary = " ".join([str(item) for item in summary])

print(f"Summary complete!")

This senator has 702 sentences.
Finished sentence tokenization
Sentences padded.
Input IDs Set
Last hidden states Set
Appending sentence 0
Appending sentence 1
Appending sentence 2
Appending sentence 3
Appending sentence 4
Appending sentence 5
Appending sentence 6
Appending sentence 7
Appending sentence 8
Appending sentence 9
Appending sentence 10
Appending sentence 11
Appending sentence 12
Appending sentence 13
Appending sentence 14
Appending sentence 15
Appending sentence 16
Appending sentence 17
Appending sentence 18
Appending sentence 19
Appending sentence 20
Appending sentence 21
Appending sentence 22
Appending sentence 23
Appending sentence 24
Appending sentence 25
Appending sentence 26
Appending sentence 27
Appending sentence 28
Appending sentence 29
Appending sentence 30
Appending sentence 31
Appending sentence 32
Appending sentence 33
Appending sentence 34
Appending sentence 35
Appending sentence 36
Appending sentence 37
Appending sentence 38
Appending sentence 39
Appending se

Last hidden states Set
Appending sentence 0
Appending sentence 1
Appending sentence 2
Appending sentence 3
Appending sentence 4
Appending sentence 5
Appending sentence 6
Appending sentence 7
Appending sentence 8
Appending sentence 9
Appending sentence 10
Appending sentence 11
Appending sentence 12
Appending sentence 13
Appending sentence 14
Appending sentence 15
Appending sentence 16
Appending sentence 17
Appending sentence 18
Appending sentence 19
Appending sentence 20
Appending sentence 21
Appending sentence 22
Appending sentence 23
Appending sentence 24
Appending sentence 25
Appending sentence 26
Appending sentence 27
Appending sentence 28
Appending sentence 29
Appending sentence 30
Appending sentence 31
Appending sentence 32
Appending sentence 33
Appending sentence 34
Appending sentence 35
Appending sentence 36
Appending sentence 37
Appending sentence 38
Appending sentence 39
Appending sentence 40
Appending sentence 41
Appending sentence 42
Appending sentence 43
Appending sentence 

In [None]:
for index, row in senators.iterrows():
    
    if len(row["initiatives_summary_dummy_en"]) >= 1:
        
        sentences = sent_tokenize(row["initiatives_summary_dummy_en"])
        
        print(f"Working on summary {index} of {len(senators)}. This senator has {len(sentences)} sentences.")

        n = 150
        sentences_split = [sentences[i:i+n] for i in range(0, len(sentences), n)]
        summary = []
        
        for sen_split in sentences_split:
            
            print(f"Procesing batch of {len(sen_split)} sentences.")
                
            tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

            max_len = 0
            for i in tokenized_sentences:
                if len(i) > max_len:
                    max_len = len(i)

            padded_sentences = []
            for i in tokenized_sentences:
                while len(i) < max_len:
                    i.append(0)
                padded_sentences.append(i)

            input_ids = torch.tensor(padded_sentences)

            with torch.no_grad():
                last_hidden_states = model(input_ids)[0]

            sentence_embeddings = []
            for i in range(len(sen_split)):
                sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).numpy())

            # Compute the similarity matrix
            similarity_matrix = cosine_similarity(sentence_embeddings)

            # Generate the summary
            num_sentences = round(len(sen_split)*.25)
            summary_sentences = []
            for i in range(num_sentences):
                sentence_scores = list(enumerate(similarity_matrix[i]))

            sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)

            for i in range(num_sentences):
                print(f"Appending summary sentence {i}")
                summary_sentences.append(sen_split[sentence_scores[i][0]])

            sub_summary = ' '.join(summary_sentences)
            
            summary.append(sub_summary)
                    
        summary = " ".join([str(item) for item in summary])

        senators.at[index, "initiative_summary_en"] = summary
        
        print(f"Summary {index} of {len(senators)} complete!")
        
    else:
        
        print(f"Senator number {index}, senator {row['senadores']} has no initiatives to summarize.")

Working on summary 0 of 126. This senator has 8 sentences.
Procesing batch of 8 sentences.
Appending summary sentence 0
Appending summary sentence 1
Summary 0 of 126 complete!
Working on summary 1 of 126. This senator has 65 sentences.
Procesing batch of 65 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Summary 1 of 126 complete!
Working on summary 2 of 126. This senator has 135 sentences.
Procesing batch of 135 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sente

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence 17
Appending summary sentence 18
Appending summary sentence 19
Appending summary sentence 20
Appending summary sentence 21
Appending summary sentence 22
Appending summary sentence 23
Appending summary sentence 24
Appending summary sentence 25
Appending summary sentence 26
Appending summary sentence 27
Appending summary sentence 28
Appending summary sentence 29
Appending summary sentence 30
Appending summary sentence 31
Appending summary sentence 32
Appending summary se

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Summary 10 of 126 complete!
Working on summary 11 of 126. This senator has 37 sentences.
Procesing batch of 37 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Summary 11 of 126 complete!
Working on summary 12 of 126. This senator has 116 sentences.
Procesing batch of 116 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9


Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Summary 16 of 126 complete!
Working on summary 17 of 126. This senator has 109 sentences.
Procesing batch of 109 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence 17
Appending summary sentence 18
Appending summary sentence 19
Appending summary sentence 20
Appending summary sentence 21
Appending summary sentence 22
Appending summary sentence 23
Appending summary sentence 24
Appending summary sentence 25
Appending summary sentence 26
Appending summary sentence 27
Appending summary sentence 28
Appending summary sentence 29
Appending summary sentence 30
Appending summary sentence 31
Appending summary sentence 32
Appending summary se

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Summary 24 of 126 complete!
Working on summary 25 of 126. This senator has 363 sentences.
Procesing batch of 150 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence 17
Appending summary sentence

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence 17
Appending summary sentence 18
Summary 28 of 126 complete!
Working on summary 29 of 126. This senator has 231 sentences.
Procesing batch of 150 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence

Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Appending summary sentence 16
Appending summary sentence 17
Appending summary sentence 18
Appending summary sentence 19
Appending summary sentence 20
Appending summary sentence 21
Summary 32 of 126 complete!
Working on summary 33 of 126. This senator has 105 sentences.
Procesing batch of 105 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sente

In [1]:
senators["initiatives_summary_dummy_en"][35]

NameError: name 'senators' is not defined