# BETO Matcher

In [1]:
import pandas as pd
import os
import sys

## Step 1: Imports

In [2]:
from senator_matcher.beto_matcher.embedding import generate_embeddings
from senator_matcher.beto_matcher.matching import match_senators, get_top_senators

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.we

## Step 2: Generate embeddings

In [3]:
# How many leading rows of the senators CSV are kept for this run.
SENATORS_TO_PROCESS = 3

# The data folder is located relative to the working directory: <cwd>/../data
current_path = os.getcwd()
parent_directory = os.path.dirname(current_path)
project_data_path = os.path.join(parent_directory, 'data')

# Read the CSV and keep only the first SENATORS_TO_PROCESS rows.
senators_csv_path = os.path.join(project_data_path, 'senators_data.csv')
df = pd.read_csv(senators_csv_path).head(SENATORS_TO_PROCESS)

# One generate_embeddings call per summary, preserving row order.
embeddings = list(map(generate_embeddings, df['initiatives_summary_dummy']))

## Step 3: Matching

In [4]:
# Free-text query that the senators' summaries are compared against.
user_input = "Quiero proteccion para los animales"

# Score the query against the precomputed embeddings.
# NOTE(review): the TF-IDF section further down imports another function under
# the same name `match_senators`; re-running this cell after that import would
# call the TF-IDF version instead of the BETO one.
similarity_scores = match_senators(
    user_input,
    embeddings,
)

# Build the ranked table. How many rows come back is decided inside
# get_top_senators (not visible from this notebook) — TODO confirm the default.
top_senators = get_top_senators(
    similarity_scores,
    df,
)

# Last expression of the cell: rendered as the cell output.
top_senators

Unnamed: 0,Apellidos,Nombre,Fraccion,similarity_score
0,Botello Montes,José Alfredo,PAN,0.840162
1,Rojas Loreto,Estrella,PAN,0.837799
2,Moya Clemente,Roberto Juan,PAN,0.833831


# TFIDF Matcher

In [5]:
import pandas as pd
import os
import sys

## Step 1: Imports

In [6]:
from senator_matcher.tfidf_matcher.preprocessing import preprocess_text
from senator_matcher.tfidf_matcher.vectorization import fit_vectorizer, save_vectorizer_and_matrix, load_vectorizer_and_matrix
from senator_matcher.tfidf_matcher.matching import match_senators

[nltk_data] Downloading package stopwords to /Users/luis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-07-13 10:56:06 INFO: Downloading default packages for language: es (Spanish) ...
2023-07-13 10:56:08 INFO: File exists: /Users/luis/stanza_resources/es/default.zip
2023-07-13 10:56:12 INFO: Finished downloading models and saved to /Users/luis/stanza_resources.
2023-07-13 10:56:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-07-13 10:56:13 INFO: Loading these models for language: es (Spanish):
| Processor    | Package  |
---------------------------
| tokenize     | ancora   |
| mwt          | ancora   |
| pos          | ancora   |
| lemma        | ancora   |
| constituency | combined |
| depparse     | ancora   |
| sentiment    | tass2020 |
| ner          | conll02  |

2023-07-13 10:56:13 INFO: Using device: cpu
2023-07-13 10:56:13 INFO: Loading: tokenize
2023-07-13 10:56:13 INFO: Loading: mwt
2023-07-13 10:56:13 INFO: Loading: pos
2023-07-13 10:56:14 INFO: Loading: lemma
2023-07-13 10:56:14 INFO: Loading: constituency
2023-07-13 10:56:15 INFO: Loading: depparse
2023-07-13 10:56:15 INFO: Loading: sentiment
2023-07-13 10:56:15 INFO: Loading: ner
2023-07-13 10:56:16 INFO: Done loading processors!


## Step 2: Preprocessing

In [7]:
# How many leading rows of the senators CSV are kept for this run.
SENATORS_TO_PROCESS = 3

# Locate <cwd>/../data and the senators CSV inside it.
current_path = os.getcwd()
parent_directory = os.path.split(current_path)[0]
project_data_path = os.path.join(parent_directory, 'data')
senators_csv_path = os.path.join(project_data_path, 'senators_data.csv')

# Reload the table; this rebinds `df` (and the path variables) that the BETO
# section above also defines.
df = pd.read_csv(senators_csv_path).head(SENATORS_TO_PROCESS)

# Run preprocess_text over every summary and store the result in a new column.
df['preprocessed_summary'] = df['initiatives_summary_dummy'].apply(preprocess_text)

## Step 3: Vectorization

In [8]:
# Fit the vectorizer on the preprocessed summaries.
fit_result = fit_vectorizer(df, 'preprocessed_summary')
tfidf_matrix, vectorizer = fit_result

# Persist both artifacts so they can be reused when scoring user input.
# The file names are relative; presumably they resolve against the working
# directory — verify in save_vectorizer_and_matrix.
# TODO: decide on a proper storage location before using this in production.
matrix_file = 'tfidf_matrix.pkl'
vectorizer_file = 'fitted_vectorizer.pkl'
save_vectorizer_and_matrix(tfidf_matrix, vectorizer, matrix_file, vectorizer_file)

## Step 4: Matching

In [9]:
# Free-text query that the senators' summaries are compared against.
user_input = "Quiero proteccion para los animales"

# Reload the persisted TF-IDF matrix and fitted vectorizer (matrix file first,
# vectorizer file second).
artifact_files = ('tfidf_matrix.pkl', 'fitted_vectorizer.pkl')
tfidf_matrix, vectorizer = load_vectorizer_and_matrix(*artifact_files)

# `match_senators` here is the TF-IDF implementation imported from
# senator_matcher.tfidf_matcher.matching; that import rebinds the name that the
# BETO section imported from beto_matcher.matching.
results_df = match_senators(user_input, df, vectorizer, tfidf_matrix)

# Show at most the first five rows of the result table.
# NOTE(review): the recorded output suggests rows are ordered by
# similarity_score descending — confirm in match_senators.
results_df.head(5)

Unnamed: 0,Apellidos,Nombre,Fraccion,similarity_score
1,Rojas Loreto,Estrella,PAN,0.107948
0,Botello Montes,José Alfredo,PAN,0.0
2,Moya Clemente,Roberto Juan,PAN,0.0
