# Bert Test (with translation)

## Import librares for notebook.

In [1]:
import torch
import nltk
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize
from googletrans import Translator
import pandas as pd
import numpy as np
from deep_translator import GoogleTranslator
from sklearn.metrics.pairwise import cosine_similarity

## Import and translate data

In [2]:
#Import senator csv from senator_data_file
senators = pd.read_csv(r'senators_data.csv', converters={'initiative_list': pd.eval})

In [3]:
#Split the concatenated initiatives for each senator into bites smaller than 5000 to be able to translate.
senators["initiatives_summary_dummy_split"] = ""

n = 4999

for i, row in senators.iterrows():
    if not row["initiative_list"] == []:
        initiatives_split = [row["initiatives_summary_dummy"][i:i+n] for i in range(0, len(row["initiatives_summary_dummy"]), n)]
        senators.at[i, "initiatives_summary_dummy_split"] = initiatives_split
    else:
        senators.at[i, "initiatives_summary_dummy_split"] = []

In [4]:
#Translate the concatenated, split string of initiatives per senator, store in new column.

senators["initiatives_summary_dummy_split_en"] = ""

for i, row in senators.iterrows():
    print(f"Working on row {i} of {len(senators)}")
    if len(row["initiatives_summary_dummy_split"]) >= 1:
        en_initiatives = GoogleTranslator(source='es', target='en').translate_batch(row["initiatives_summary_dummy_split"])
        senators.at[i, "initiatives_summary_dummy_split_en"] = en_initiatives
        print(f"Summary {i} translated successfully. Sample: {en_initiatives[0][:20]}")
    else:
        print(f"Senator number {i}, senator {row['senadores']} has no initiatives to translate.")
        continue

Working on row 0 of 126
Summary 0 translated successfully. Sample: It intends to guaran
Working on row 1 of 126
Summary 1 translated successfully. Sample: It proposes that the
Working on row 2 of 126
Summary 2 translated successfully. Sample: Proposes to modify t
Working on row 3 of 126
Summary 3 translated successfully. Sample: It proposes to elimi
Working on row 4 of 126
Summary 4 translated successfully. Sample: Proposes to modify t
Working on row 5 of 126
Summary 5 translated successfully. Sample: Proposes to modify t
Working on row 6 of 126
Summary 6 translated successfully. Sample: It proposes to repea
Working on row 7 of 126
Summary 7 translated successfully. Sample: It proposes to add a
Working on row 8 of 126
Summary 8 translated successfully. Sample: It proposes to deter
Working on row 9 of 126
Summary 9 translated successfully. Sample: It proposes to estab
Working on row 10 of 126
Summary 10 translated successfully. Sample: Proposes that the Mi
Working on row 11 of 126
Summa

In [5]:
#Join all split, translated initiatives into one long english string per senator.

senators["initiatives_summary_dummy_en"] = senators["initiatives_summary_dummy_split_en"].apply(lambda x: "".join(x))

In [51]:
senators["initiatives_summary_dummy_split_en"][3]

['It proposes to eliminate the requirement that establishes that it is necessary for plant workers to have at least 15 years of seniority in employment, in order to receive a seniority premium in the event of voluntary separation from the same. It proposes to eliminate the assumption that determines that when proceeds to a commercial trial and the conventional address agreed to receive notifications does not correspond to that of the claim, notification is made by edicts without the need to obtain the report that the public authority or institution must prepare when the judge so orders. Proposes establish that independent workers who were not included in insurance under the voluntary regime or who have ceased to be, may be subject to insurance under the mandatory regime; states that, in order to enjoy benefits for unemployment of advanced age, the worker will have to have covered 750 weekly contributions. In addition, it establishes that when the mother or father of a newborn has him o

## Feed text through BERT model to create a summary.

In [10]:
#Import BERT model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [11]:
senators["initiative_summary_en"] = ""

In [17]:
for index, row in senators.iterrows():
    
    if len(row["initiatives_summary_dummy_en"]) >= 1:
        
        sentences = sent_tokenize(row["initiatives_summary_dummy_en"])
        
        print(f"Working on summary {index} of {len(senators)}. This senator has {len(sentences)} sentences.")

        n = 150
        sentences_split = [sentences[i:i+n] for i in range(0, len(sentences), n)]
        summary = []
        
        for sen_split in sentences_split:
            
            print(f"Procesing batch of {len(sen_split)} sentences.")
                
            tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

            max_len = 0
            for i in tokenized_sentences:
                if len(i) > max_len:
                    max_len = len(i)

            padded_sentences = []
            for i in tokenized_sentences:
                while len(i) < max_len:
                    i.append(0)
                padded_sentences.append(i)

            input_ids = torch.tensor(padded_sentences)

            with torch.no_grad():
                last_hidden_states = model(input_ids)[0]

            sentence_embeddings = []
            for i in range(len(sen_split)):
                sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).numpy())

            # Compute the similarity matrix
            similarity_matrix = cosine_similarity(sentence_embeddings)

            # Generate the summary
            num_sentences = round(len(sen_split)*.25)
            summary_sentences = []
            for i in range(num_sentences):
                sentence_scores = list(enumerate(similarity_matrix[i]))

            sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)

            for i in range(num_sentences):
                print(f"Appending summary sentence {i}")
                summary_sentences.append(sen_split[sentence_scores[i][0]])

            sub_summary = ' '.join(summary_sentences)
            
            summary.append(sub_summary)
                    
        summary = " ".join([str(item) for item in summary])

        senators.at[index, "initiative_summary_en"] = summary
        
        print(f"Summary {index} of {len(senators)} complete!")
        
    else:
        
        print(f"Senator number {index}, senator {row['senadores']} has no initiatives to summarize.")

Working on summary 0 of 126. This senator has 8 sentences.
Procesing batch of 8 sentences.
Appending summary sentence 0
Appending summary sentence 1
Summary 0 of 126 complete!
Working on summary 1 of 126. This senator has 65 sentences.
Procesing batch of 65 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sentence 4
Appending summary sentence 5
Appending summary sentence 6
Appending summary sentence 7
Appending summary sentence 8
Appending summary sentence 9
Appending summary sentence 10
Appending summary sentence 11
Appending summary sentence 12
Appending summary sentence 13
Appending summary sentence 14
Appending summary sentence 15
Summary 1 of 126 complete!
Working on summary 2 of 126. This senator has 135 sentences.
Procesing batch of 135 sentences.
Appending summary sentence 0
Appending summary sentence 1
Appending summary sentence 2
Appending summary sentence 3
Appending summary sente

In [19]:
senators.to_csv(r'senators_data_summarized_en.csv')

## Retranslate summaries back to spanish.

In [35]:
#Split the english summaries for each senator into bites smaller than 5000 to be able to translate back to spanish.
senators["initiatives_summary_en_split"] = ""

n = 4999

for i, row in senators.iterrows():
    if not row["initiative_list"] == []:
        summary_split_es = [row["initiative_summary_en"][i:i+n] for i in range(0, len(row["initiative_summary_en"]), n)]
        senators.at[i, "initiatives_summary_en_split"] = summary_split_es
    else:
        senators.at[i, "initiatives_summary_en_split"] = []

In [52]:
#Translate the split english summaires back to spanish, split string of initiatives per senator, store in new column.

senators["initiatives_summary_es_split"] = ""

for i, row in senators.iterrows():
    print(f"Working on row {i} of {len(senators)}")
    if len(row["initiatives_summary_dummy_split"]) >= 1:
        es_initiatives = GoogleTranslator(source='en', target='es').translate_batch(row["initiatives_summary_en_split"])
        senators.at[i, "initiatives_summary_es_split"] = es_initiatives
        print(f"Summary {i} translated successfully. Sample: {es_initiatives[0][:20]}")
    else:
        print(f"Senator number {i}, senator {row['senadores']} has no initiatives to translate.")
        continue

Working on row 0 of 126
Summary 0 translated successfully. Sample: Propone establecer u
Working on row 1 of 126
Summary 1 translated successfully. Sample: Propone que este ord
Working on row 2 of 126
Summary 2 translated successfully. Sample: Dicho informe deberá
Working on row 3 of 126
Summary 3 translated successfully. Sample: Propone incorporar l
Working on row 4 of 126
Summary 4 translated successfully. Sample: -Registro único y pú
Working on row 5 of 126
Summary 5 translated successfully. Sample: Propone considerar q
Working on row 6 of 126
Summary 6 translated successfully. Sample: Autoriza al Congreso
Working on row 7 of 126
Summary 7 translated successfully. Sample: Propone que el Ejecu
Working on row 8 of 126
Summary 8 translated successfully. Sample: Propone homologar el
Working on row 9 of 126
Summary 9 translated successfully. Sample: Propone que en los c
Working on row 10 of 126
Summary 10 translated successfully. Sample: Propone agregar los 
Working on row 11 of 126
Summa

In [53]:
#Merge translated batches into one
senators["initiative_summary_es"] = senators["initiatives_summary_es_split"].apply(lambda x: "".join(x))

In [35]:
#Drop senators without initiatives
senators = senators.dropna(subset=['initiative_summary_es'])

In [38]:
#Save file with spanish translated summary column.
senators.to_csv(r'../data/senators_data_summarized_es.csv')

In [4]:
senators = pd.read_csv(r"/Users/jmlunamugica/Downloads/senators_data_summarized_es.csv").fillna("")

In [9]:
senators = pd.read_csv(r"/Users/jmlunamugica/Downloads/senators_data_summarized_es.csv").fillna("").drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1)

In [11]:
senators.keys()

Index(['senator_id', 'Apellidos', 'Nombre', 'Fraccion', 'Legislatura',
       'Estado', 'Sexo', 'tipoEleccion', 'Suplente', 'estadoOrigen', 'correo',
       'facebook', 'twitter', 'youtube', 'instagram', 'url_sitio', 'telefono',
       'extension', 'estatus', 'direccion', 'id', 'senadores',
       'attendance_score', 'initiative_list', 'initiatives_summary_dummy',
       'initiatives_summary_dummy_split', 'initiatives_summary_dummy_split_en',
       'initiatives_summary_dummy_en', 'initiative_summary_en',
       'initiatives_summary_en_split', 'initiatives_summary_es_split',
       'initiative_summary_es'],
      dtype='object')