# Mexican Senate Data

## Importing necessary libraries

In [159]:
# importing required modules
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
import shutil
# Prefer whichever tesseract binary is on PATH so the notebook is not tied to one
# machine / one Homebrew version; fall back to the original Cellar location
# when no binary is found on PATH.
pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/opt/homebrew/Cellar/tesseract/5.3.1/bin/tesseract'
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import requests
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import numpy as np
from lxml import etree
import re
from selenium import webdriver
import time

## Federal Senate Database

### Importing senator table

In [224]:
def get_senators(timeout=30):
    """Download the Senate open-data senator feed and return it as a DataFrame.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The JSON payload loaded with ``DataFrame.from_dict``, with the
        ``idSenador`` column renamed to ``senator_id``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    senators_url = 'https://www.senado.gob.mx/65/datosAbiertos/senadoresDatosAb.json'
    response = requests.get(senators_url, timeout=timeout)
    # Fail loudly on an error status instead of trying to JSON-decode an error page.
    response.raise_for_status()
    senators = pd.DataFrame.from_dict(response.json())
    return senators.rename(columns={"idSenador": "senator_id"})

In [241]:
# Load the senator feed into the notebook-level `senators` frame (performs a network request).
senators = get_senators()

In [242]:
# Create a "senadores" field holding "<Nombre> <Apellidos>" (each part stripped of
# surrounding whitespace); it is the key used to join with the initiatives+proposals table.
senators["senadores"] = senators["Nombre"].str.strip()+" "+senators["Apellidos"].str.strip()

### Importing attendance data and adding to senator table

In [233]:
def get_senator_attendance(timeout=30):
    """Scrape every senator's attendance page and return the mean attendance score.

    For each ``senator_id`` returned by ``get_senators()`` the attendance page is
    downloaded, the session links (``<a>``) and attendance labels (``<strong>``)
    inside the attendance table are paired by position, and each label is scored
    1 when it equals ``"Asistencia"`` and 0 otherwise.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each page before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per (senator_id, full_name, Fraccion, Estado, tipoEleccion)
        group with the mean ``attendance_score``.
    """
    senators = get_senators()

    dates_xpath = '//*[@id="imPage"]/div[7]/div[2]/div/div[2]/section/div/div/table/tbody//a'
    records_xpath = '//*[@id="imPage"]/div[7]/div[2]/div/div[2]/section/div/div/table/tbody//strong'

    # Collect plain records and build the frame once; growing a DataFrame one
    # cell at a time with .at is slow and easy to mis-index.
    rows = []
    for sen in senators["senator_id"].tolist():
        url = f'https://www.senado.gob.mx/65/asistencias/{sen}#info'
        html = requests.get(url, timeout=timeout)
        content = BeautifulSoup(html.text, 'html.parser')
        content_x = etree.HTML(str(content))
        dates = content_x.xpath(dates_xpath)
        att_records = content_x.xpath(records_xpath)
        # zip pairs links and labels by position and stops at the shorter list,
        # so a page with unequal counts no longer aborts the whole scrape.
        for date, record in zip(dates, att_records):
            rows.append({
                "senator_id": sen,
                "session_date": date.text,
                "attendance_record": record.text,
            })

    senator_attendance = pd.DataFrame(
        rows, columns=["senator_id", "session_date", "attendance_record"]
    )

    # 1 for an exact "Asistencia" label, 0 for anything else (including missing text).
    senator_attendance["attendance_score"] = (
        senator_attendance["attendance_record"].eq("Asistencia").astype(int)
    )
    senator_attendance = pd.merge(
        senator_attendance,
        senators[['senator_id', 'Fraccion', 'Estado', 'Apellidos', 'Nombre', 'tipoEleccion']],
        on='senator_id',
        how='left',
    )

    senator_attendance["full_name"] = senator_attendance['Nombre'] + " " + senator_attendance['Apellidos']

    senator_attendance = senator_attendance.groupby(
        ['senator_id', 'full_name', 'Fraccion', 'Estado', 'tipoEleccion'], as_index=False
    )[['attendance_score']].mean()

    return senator_attendance

In [234]:
# Scrape and aggregate attendance for all senators (one HTTP request per senator, so this cell is slow).
senator_attendance = get_senator_attendance()

In [243]:
# Attach each senator's mean attendance score to the senator table.
# An attendance_score column left over from a previous run of this cell is
# dropped first, so re-running the cell does not create
# attendance_score_x / attendance_score_y duplicates.
senators = senators.drop(columns="attendance_score", errors="ignore").merge(
    senator_attendance[["senator_id", "attendance_score"]], how="left", on="senator_id"
)

Unnamed: 0,senator_id,Apellidos,Nombre,Fraccion,Legislatura,Estado,Sexo,tipoEleccion,Suplente,estadoOrigen,...,youtube,instagram,url_sitio,telefono,extension,estatus,direccion,id,senadores,attendance_score
0,1160,Botello Montes,José Alfredo,PAN,64,Querétaro,Hombre,Senador Electo por el Principio de Mayoría Rel...,,Querétaro,...,,,,5345 3000,4183 y 4184,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",1,José Alfredo Botello Montes,0.958904
1,1162,Rojas Loreto,Estrella,PAN,64,Querétaro,Mujer,Senadora Electa por el Principio de Mayoría Re...,,Querétaro,...,,,,5345 3000,3323,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",2,Estrella Rojas Loreto,0.973913
2,1247,Moya Clemente,Roberto Juan,PAN,64,Lista Nacional,Hombre,Senador Electo por Representación Proporcional...,,Lista Nacional,...,,,,5345 3000,"3301, 5917 y 2122",en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",3,Roberto Juan Moya Clemente,0.826568
3,1101,Bermúdez Méndez,José Erandi,PAN,64,Guanajuato,Hombre,Senador Electo por el Principio de Mayoría Rel...,Juan Antonio Negrete Martínez,Guanajuato,...,,https://www.instagram.com/erandibermudez/,,5345 3000,3268,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",4,José Erandi Bermúdez Méndez,0.868421
4,1052,Cruz Blackledge,Gina Andrea,PAN,64,Baja California,Mujer,Senadora Electa por el Principio de Primera Mi...,Iraís María Vázquez Aguiar,Baja California,...,,https://www.instagram.com/ginacruzb/,,5345 3000,3388,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",5,Gina Andrea Cruz Blackledge,0.884868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,1274,Vasconcelos,Héctor,morena,64,Lista Nacional,Hombre,Senador Electo por Representación Proporcional...,,Lista Nacional,...,https://www.youtube.com/channel/UCZKK3iAGK1_5C...,,http://hectorvasconcelos.mx,5345 3000,3657,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",122,Héctor Vasconcelos,0.871711
122,1273,Vázquez Alatorre,Antares Guadalupe,morena,64,Lista Nacional,Mujer,Senadora Electa por Representación Proporciona...,,Lista Nacional,...,https://www.youtube.com/channel/UCTCzq6UOjyy3G...,https://www.instagram.com/antaresvazquezalatorre/,,5345 3000,3834,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",123,Antares Guadalupe Vázquez Alatorre,0.957237
123,1169,Villegas Canché,Freyda Marybel,morena,64,Quintana Roo,Mujer,Senadora Electa por el Principio de Mayoría Re...,Gabriela López Gómez,Quintana Roo,...,https://www.youtube.com/channel/UCbSepTNbWgNRb...,https://www.instagram.com/marybelvillegascanche/,http://marybelvillegas.mx/,5345 3000,3194,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",124,Freyda Marybel Villegas Canché,0.888446
124,1406,Jiménez Arteaga,Rosa Elena,morena,64,Nayarit,Mujer,Senadora Electa por el Principio de Mayoría Re...,Beatriz Andrea Navarro Pérez,Nayarit,...,,,,5345 3000,3167,en_funciones,"Av Paseo de la Reforma No. 135, Hemiciclo Piso...",125,Rosa Elena Jiménez Arteaga,0.931034


### Importing initiatives and proposals, concatenating both and adding senator ids

In [161]:
def get_initiatives(timeout=30):
    """Function that extracts initiatives from the Senate JSON feeds.

    Downloads the ``iniciativa_64`` and ``iniciativa_65`` feeds, stacks them,
    parses the two date columns (unparseable values become ``NaT``) and indexes
    the result by ``id``.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each feed before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Initiatives from both feeds, indexed by ``id``.

    Raises
    ------
    requests.HTTPError
        If either feed answers with a 4xx/5xx status.
    """
    feed_urls = (
        'https://www.senado.gob.mx/65/datosAbiertos/iniciativa_64.json',
        'https://www.senado.gob.mx/65/datosAbiertos/iniciativa_65.json',
    )

    frames = []
    for url in feed_urls:
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors here rather than as a confusing JSON decode failure.
        response.raise_for_status()
        frames.append(pd.DataFrame.from_dict(response.json()))

    initiatives = pd.concat(frames)

    for date_col in ('fecha_presentacion', 'fecha_aprobacion'):
        initiatives[date_col] = pd.to_datetime(initiatives[date_col], errors='coerce')

    return initiatives.set_index('id')

In [188]:
def get_proposals(timeout=30):
    """Function that extracts proposals from the Senate JSON feeds.

    Downloads the ``proposicion_64`` and ``proposicion_65`` feeds, stacks them,
    parses the two date columns (unparseable values become ``NaT``) and indexes
    the result by ``id``.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each feed before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Proposals from both feeds, indexed by ``id``.

    Raises
    ------
    requests.HTTPError
        If either feed answers with a 4xx/5xx status.
    """
    feed_urls = (
        'https://www.senado.gob.mx/65/datosAbiertos/proposicion_64.json',
        'https://www.senado.gob.mx/65/datosAbiertos/proposicion_65.json',
    )

    frames = []
    for url in feed_urls:
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors here rather than as a confusing JSON decode failure.
        response.raise_for_status()
        frames.append(pd.DataFrame.from_dict(response.json()))

    proposals = pd.concat(frames)

    for date_col in ('fecha_presentacion', 'fecha_aprobacion'):
        proposals[date_col] = pd.to_datetime(proposals[date_col], errors='coerce')

    return proposals.set_index('id')

In [220]:
# Create a concatenated df that includes initiatives and proposals.
# Both frames come back indexed by their own "id" column and pd.concat keeps those labels as-is.
# NOTE(review): ids may repeat between the two sources — confirm before relying on index uniqueness.
initiatives = get_initiatives()
proposals = get_proposals()
inipros = pd.concat([initiatives, proposals])

In [221]:
# Report the row and column counts of the combined frame (rows cover both initiatives and proposals).
print(f"Inipros df has {inipros.shape[0]} initiatives with {inipros.shape[1]} features.")

Inipros df has 9200 initiatives with 13 features.


In [222]:
# Creates a 1:1 relationship between initiative/proposal and senator (for cases
# where more than one senator proposes): the raw "senadores" string is split on
# "<br>", the trailing "(...)" part of each entry is removed, and the frame is
# exploded to one row per proponent name.
def _split_proponents(raw):
    """Return the list of proponent names contained in one raw "senadores" value.

    Entries are separated by "<br>"; everything from the first "(" onward is
    discarded from each entry. Empty entries (such as the one produced by a
    trailing "<br>") are skipped. Non-string values yield an empty list.
    """
    if not isinstance(raw, str):
        return []
    names = []
    for entry in raw.strip().split("<br>"):
        paren_pos = entry.find('(')
        # Keep the whole entry when there is no "(" instead of slicing with -1,
        # which would silently chop characters off the end of the name.
        name = (entry[:paren_pos] if paren_pos != -1 else entry).strip()
        if name:
            names.append(name)
    return names

# The helper keeps its own local list, so the notebook-level `senators`
# DataFrame (needed by the merge that follows) is no longer overwritten.
# NOTE(review): assumes the only entry that must be discarded is the empty one
# after a trailing "<br>" — confirm against the raw feed.
inipros["senadores"] = inipros["senadores"].apply(_split_proponents)

inipros = inipros.explode("senadores")

In [227]:
# Left-join the senator table onto the exploded initiatives/proposals using the "senadores" name;
# rows whose name has no exact match in `senators` keep nulls in the senator columns.
inipros = inipros.merge(senators, how='left', on='senadores')

In [228]:
# (rows, columns) of the frame after exploding proponents and merging senator data.
inipros.shape

(12591, 34)

In [229]:
# Rows whose proponent name matched a senator (senator_id is populated).
senator_inipros = inipros[inipros["senator_id"].notnull()]

In [230]:
# (rows, columns) of the matched subset.
senator_inipros.shape

(7737, 34)

In [231]:
# Rows whose proponent name did not match any senator (senator_id is null).
no_senator_inipros = inipros[inipros["senator_id"].isnull()]

In [155]:
# Most unmatched proponents are non-senators; some are senators whose names do not
# match the senator table exactly and still need to be parsed.
no_senator_inipros.shape

array(['Geovanna del Carmen Bañuelos de La Torre',
       'Ricardo Monreal Ávila', 'Congreso del Estado de Baja California',
       'Congreso del Estado de Nuevo León',
       'Noé Fernando Castañón Ramírez', 'Aníbal Ostoa Ortega',
       'Diputados', 'Bertha Xóchitl Gálvez Ruiz',
       'Julio Ramón Menchaca Salazar', 'Manuel Velasco Coello',
       'Ejecutivo Federal', 'Lucía Virginia Meza Guzmán',
       'José Alejandro Peña Villa', 'Lilly Téllez García',
       'Senadores integrantes del PAN', 'Raúl Bolaños Cacho Cué',
       'Evelia Sandoval Urban', 'Ruth Alejandra López Hernández',
       'Congreso del Estado de Oaxaca',
       'Audelia Esthela Villarreal Zavala', 'Martí Batres Guadarrama',
       'Congreso del Estado de Chihuahua', 'Américo Villarreal Anaya',
       'José Alfonso Pascual Solórzano Fraga', 'Patricia Mercado Castro',
       'María Marena López García', 'Higinio Martínez Miranda',
       'Juan Manuel Zepeda Hernández', 'José Alberto Galarza Villaseñor',
       'Cru

### Senator voting record data still pending: the script is partial because it takes a long time to scrape.

In [15]:
def get_senator_voting(timeout=30):
    """Returns dataframe with senator voting history for current senators.

    Two passes are made over the Senate website:

    1. For every ``senator_id`` from ``get_senators()`` the senator's voting
       page is scraped for the vote position, the vote description and the
       vote id (taken from the description's first link).
    2. For every distinct vote id the vote page is scraped for its date.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each page before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per (senator, vote) with columns ``senator_id``,
        ``voting_date``, ``vote_id``, ``vote_description`` and ``vote_position``.
    """
    #Create list of senators from senator dataframe.
    senator_ids = get_senators()["senator_id"].tolist()

    #Script that scrapes senate website for voting data. First, all except voting date.
    # Rows are accumulated in a list: writing to .at[i] with i restarting at 0 for
    # each senator would make every senator overwrite the previous one's rows.
    records = []
    sens = len(senator_ids)
    for sen in senator_ids:
        print(f"Beginning to process all votes for senator # {sen}. {sens} left to process.")
        url = f'https://www.senado.gob.mx/65/votaciones/{sen}#info'
        html = requests.get(url, timeout=timeout)
        content = BeautifulSoup(html.text, 'html.parser')
        vote_positions = content.find_all(class_ = "col-sm-1 text-center")
        vote_descriptions = content.find_all(class_ = "col-sm-11")
        # Positions and descriptions are paired by order of appearance.
        for position, description in zip(vote_positions, vote_descriptions):
            vote_id = description.findChildren()[0]['href'].split("/")[3]
            records.append({
                "senator_id": sen,
                "vote_id": vote_id,
                "vote_description": description.text.strip(),
                "vote_position": position.text.strip(),
            })
        sens -= 1

    #Create dataframe to store voting data.
    senator_voting = pd.DataFrame(
        records,
        columns=["senator_id", "voting_date", "vote_id", "vote_description", "vote_position"],
    )

    #Script that scrapes Senate website for voting data. Secondly, voting dates.
    # The date URL depends only on vote_id, so each distinct vote is fetched once
    # and the result is mapped back onto every row that shares that id.
    unique_vote_ids = list(dict.fromkeys(senator_voting["vote_id"].tolist()))
    vote_dates = {}
    votes = len(unique_vote_ids)
    started = time.time()
    for done, vote_id in enumerate(unique_vote_ids, start=1):
        print(f"Working on vote {vote_id}. {votes} votes left.")
        vote_date_url = f"https://www.senado.gob.mx/65/votacion/{vote_id}"
        html = requests.get(vote_date_url, timeout=timeout)
        content = BeautifulSoup(html.text, 'html.parser')
        vote_dates[vote_id] = content.find_all(class_ = "col-sm-12 text-center")[1].text
        votes -= 1
        # Estimate from the running average; rounding a single sub-second
        # request to whole seconds would make the estimate collapse to 0.
        average_seconds = (time.time() - started) / done
        time_left = votes * average_seconds
        print(f"{round(time_left/60)} min left.")

    senator_voting["voting_date"] = senator_voting["vote_id"].map(vote_dates)

    return senator_voting

# The following data extraction sections are deprecated or not yet started.

## For PDFs that have text in them.

In [9]:
def text_from_normal_pdf(pdf_path, page_index=2):
    """Print and return the text of one page of a PDF that has embedded text.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.
    page_index : int, optional
        Zero-based index of the page to extract (default 2, i.e. the third page).

    Returns
    -------
    str
        The text extracted from the requested page.

    Raises
    ------
    IndexError
        If ``page_index`` is outside the document's page range.
    """
    # creating a pdf reader object
    reader = PdfReader(pdf_path)

    # printing number of pages in pdf file
    num_pages = len(reader.pages)
    print(num_pages)

    if not 0 <= page_index < num_pages:
        raise IndexError(
            f"page_index {page_index} is out of range for a PDF with {num_pages} page(s)"
        )

    # getting a specific page from the pdf file
    page = reader.pages[page_index]

    # extracting text from page
    text = page.extract_text()
    print(text)
    return text

3



## For PDFs that are images or scans of documents

In [117]:
def text_from_flat_pdf(pdf_path, dpi=350):
    """OCR a scanned/image-only PDF and return the recognised text.

    Every page is rendered to an image, saved as ``Page_<n>.jpg`` in the current
    working directory, thresholded and passed to Tesseract. The per-page texts
    are concatenated, each prefixed with ``"page <n>: "``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to OCR.
    dpi : int, optional
        Resolution passed to ``convert_from_path`` (default 350).

    Returns
    -------
    str
        The concatenated OCR text of all pages (also printed).
    """
    import os  # local import: `os` is not imported in the notebook's import cell

    pages = convert_from_path(pdf_path, dpi)

    images = []
    for page_number, page in enumerate(pages, start=1):
        image_name = "Page_" + str(page_number) + ".jpg"
        page.save(image_name, "JPEG")
        # Record where the file was actually written instead of assuming one
        # specific user's directory, so the read-back below finds it from any cwd.
        images.append(os.path.abspath(image_name))
    print(images)

    page_texts = []
    for page_number, image_path in enumerate(images, start=1):
        image = cv2.imread(image_path)
        # binarise the image (fixed threshold of 120) for better OCR
        ret, thresh1 = cv2.threshold(image, 120, 255, cv2.THRESH_BINARY)
        # pytesseract image to string to get results
        image_text = str(pytesseract.image_to_string(thresh1, config='--psm 6'))
        page_texts.append("page " + str(page_number) + ": " + image_text)

    full_text = ''.join(page_texts)
    print(full_text)
    return full_text

In [118]:
text_from_flat_pdf('/Users/jmlunamugica/sivico/code/Asistencia_20230413.pdf')

['/Users/jmlunamugica/sivico/code/Page_1.jpg', '/Users/jmlunamugica/sivico/code/Page_2.jpg', '/Users/jmlunamugica/sivico/code/Page_3.jpg']
page 1: OB Go CN WER ET OR a ES
*. Ce % ee
‘ oes %,' LXV Legislatura’.  ANOI _..... Seglindo Periodo Ordinario
hy SERB OS POE ee
Cone RS Ce ee
ELA RY SE SORES RUS ER TEESE gad Bs STE ESE UES PRE SG STS Ree oar ter ck Gune rene na er eg
! REGISTRO DE ASISTENCIA DE LA SESION ORDINARIA
. CELEBRADA EL DIA JUEVES 13 DE ABRIL DE 2023.
De conformidad con los articulos 63 y 64 del reglamento del Senado de la
Republica, se presenta la siguiente relacion:
SENADORAS Y SENADORES ASISTENTES, CONFORME A LO DISPUESTO EN EL ARTICULO
58 DEL REGLAMENTO POR SISTEMA ELECTRONICO:
ABREU ARTINANO ROCIO ADRIANA
ACEVES DEL OLMO CARLOS HUMBERTO
ALVAREZ ICAZA LONGORIA EMILIO
ALVAREZ LIMA JOSE ANTONIO CRUZ
ANAYA MOTA CLAUDIA EDITH
ANORVE BANOS MANUEL
ARIAS SOLIS CRISTOBAL
ARMENTA MIER ALEJANDRO
AVILA VAZQUEZ KATYA ELIZABETH
AVILA VILLEGAS ERUVIEL
BALDERAS ESPINOZA CLAUDIA ESTH

"page 1: OB Go CN WER ET OR a ES\n*. Ce % ee\n‘ oes %,' LXV Legislatura’.  ANOI _..... Seglindo Periodo Ordinario\nhy SERB OS POE ee\nCone RS Ce ee\nELA RY SE SORES RUS ER TEESE gad Bs STE ESE UES PRE SG STS Ree oar ter ck Gune rene na er eg\n! REGISTRO DE ASISTENCIA DE LA SESION ORDINARIA\n. CELEBRADA EL DIA JUEVES 13 DE ABRIL DE 2023.\nDe conformidad con los articulos 63 y 64 del reglamento del Senado de la\nRepublica, se presenta la siguiente relacion:\nSENADORAS Y SENADORES ASISTENTES, CONFORME A LO DISPUESTO EN EL ARTICULO\n58 DEL REGLAMENTO POR SISTEMA ELECTRONICO:\nABREU ARTINANO ROCIO ADRIANA\nACEVES DEL OLMO CARLOS HUMBERTO\nALVAREZ ICAZA LONGORIA EMILIO\nALVAREZ LIMA JOSE ANTONIO CRUZ\nANAYA MOTA CLAUDIA EDITH\nANORVE BANOS MANUEL\nARIAS SOLIS CRISTOBAL\nARMENTA MIER ALEJANDRO\nAVILA VAZQUEZ KATYA ELIZABETH\nAVILA VILLEGAS ERUVIEL\nBALDERAS ESPINOZA CLAUDIA ESTHER\nBANUELOS GEOVANNA\nBENAVIDES COBOS GABRIELA\nBERMUDEZ MENDEZ JOSE ERANDI\nBOLANOS CACHO CUE RAUL\nBOTELLO MONTES

### Select region of images from which to extract text.

In [68]:
import cv2
from PIL import Image
import matplotlib.pyplot as plt

def mark_region(image_path):
    """Find and outline the first qualifying text region of a page image.

    The image is converted to grayscale, blurred, adaptively thresholded and
    dilated so adjacent text merges into blobs. The external contours of those
    blobs are scanned in the order OpenCV returns them; the first contour that
    satisfies one of the two position rules below is outlined on the image and
    returned.

    Parameters
    ----------
    image_path : str
        Path of the image file to analyse.

    Returns
    -------
    tuple
        ``(image, line_items_coordinates)``. ``line_items_coordinates`` holds one
        ``[(x, y), (2200, y + h)]`` entry for the matched contour, or is empty
        when no contour matched (in which case the unmodified image is also
        shown with ``plt.imshow``).
    """
    im = cv2.imread(image_path)

    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9,9), 0)
    thresh = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,11,30)

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns a 2-tuple or a 3-tuple; pick the contour list in both cases.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    line_items_coordinates = []

    for c in cnts:
        area = cv2.contourArea(c)
        x,y,w,h = cv2.boundingRect(c)

        # Rule 1: y >= 600 and x <= 1000, with a contour area above 10000.
        matches_rule_1 = y >= 600 and x <= 1000 and area > 10000
        # Rule 2: y >= 2400 and x <= 2000, regardless of area.
        matches_rule_2 = y >= 2400 and x <= 2000

        if matches_rule_1 or matches_rule_2:
            # The rectangle runs from the contour's left edge to a fixed right edge at x=2200.
            image = cv2.rectangle(im, (x,y), (2200, y+h), color=(255,0,255), thickness=3)
            line_items_coordinates.append([(x,y), (2200, y+h)])
            # NOTE(review): returns on the first match, so at most one region is
            # ever reported — confirm that collecting every region is not intended.
            return image, line_items_coordinates

    plt.imshow(im)
    # Return the same (image, coordinates) shape when nothing matched, so callers
    # that unpack the result do not fail on an implicit None.
    return im, line_items_coordinates

## Mexican Laws

# Classifying initiatives & proposals into topics

## LDA Approach

### Preprocessing text

In [244]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [257]:
def clean(column):
    """Remove punctuation, make strings lower case, remove numbers. Tokenize, remove stopwords and lemmatize.

    Parameters
    ----------
    column : pandas.Series
        Series of raw text. Non-string cells (e.g. NaN) are treated as empty text.

    Returns
    -------
    pandas.Series
        Series of cleaned strings: the surviving tokens joined by single spaces.
    """
    # Built once per call rather than once per character / per word.
    punctuation_table = str.maketrans('', '', string.punctuation)
    #Remove stopwords and words too frequently present in initiative language.
    stop_words_extra = ("exhorta", "modificar", "actualizar", "política", "general", "caso", "derecho", "materia", "virtud", "referencias", "cambiar", "deberán", "día", "año", "denominación", "distrito", "cámara", "senadores", "normativa", "senado", "objetivo", "cumplimiento", "ordenamiento", "república", "reforma", "cada", "dar", "federal", "secretaría", "mención", "paso", "dejar", "principio", "ser", "paridad", "así", "derechos", "reformar", "propone", "nacional", "establecer", "méxico", "persona", "ley", "ciudad", "deberá", "legal", "personas")
    stop_words = set(stopwords.words('spanish')) | set(stop_words_extra)
    # NOTE(review): WordNetLemmatizer is presumably English-oriented, so it may
    # leave most Spanish tokens unchanged — confirm this is the intended lemmatizer.
    lemmatizer = WordNetLemmatizer()

    def _clean_text(text):
        """Apply the whole cleaning pipeline to a single document."""
        if not isinstance(text, str):
            text = ""
        #Removing punctuation, then making lower case and removing whitespace.
        text = text.translate(punctuation_table).lower().strip()
        #Removing numbers
        text = re.sub(r'[0-9]', '', text)
        #Tokenize and drop stopwords.
        tokens = [w for w in word_tokenize(text) if w not in stop_words]
        # Lemmatizing the verbs, then the nouns
        tokens = [lemmatizer.lemmatize(word, pos = "v") for word in tokens]
        tokens = [lemmatizer.lemmatize(word, pos = "n") for word in tokens]
        # Rejoin words to make sentences
        return " ".join(tokens)

    return column.apply(_clean_text)

In [258]:
# Store the cleaned version of each "sintesis" text in a new column; it is the input to the vectorizer.
inipros["sintesis_clean"] = clean(inipros["sintesis"])

### Training vectorization model

In [252]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [259]:
# Turn the cleaned texts into a TF-IDF document-term matrix.
# NOTE(review): LDA is usually fitted on raw term counts — confirm TF-IDF weights are intended here.
vectorizer = TfidfVectorizer()

vectorized_text = vectorizer.fit_transform(inipros["sintesis_clean"])

# Instantiate the LDA
# random_state is fixed so the fitted topics (and their numbering) are the same
# on every Restart & Run All; without it each run yields different topics.
n_components = 15
lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_text)

### Visualize potential topics

In [255]:
def print_topics(model, vectorizer, n_top_words=5):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of word weights per topic).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``.
    n_top_words : int, optional
        Number of words printed per topic (default 5).

    For each topic a ``Topic <idx>:`` line is printed, followed by a list of
    ``(word, weight)`` tuples ordered from highest to lowest weight.
    """
    # Fetch the vocabulary once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last n_top_words
        # indices, i.e. the largest weights first.
        top_indices = np.argsort(topic)[:-n_top_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [260]:
# Show the top-weighted words of each fitted topic to judge whether the topics are interpretable.
print_topics(lda_model, vectorizer)

Topic 0:
[('cuidados', 8.152377824370088), ('plazo', 6.54867271043552), ('moratorios', 6.251462236713305), ('tratándose', 6.140648831027995), ('circuito', 5.822456621174861)]
Topic 1:
[('prisión', 17.33962346795972), ('delitos', 15.172324267906053), ('delito', 15.069667077986509), ('votos', 14.898576003258666), ('democrática', 14.693022763766855)]
Topic 2:
[('inclusivo', 18.261246187847167), ('lenguaje', 17.418572834911885), ('constitucionales', 16.878337552432512), ('disposiciones', 15.727013331203173), ('incorporar', 14.849857208397227)]
Topic 3:
[('órganos', 17.763659837930643), ('federales', 15.273162946841973), ('uso', 11.773341107108418), ('tribunales', 10.711978953185039), ('administrativos', 10.623157522217339)]
Topic 4:
[('indígenas', 34.68765761725618), ('pueblo', 30.01337346202277), ('agua', 27.900267103930567), ('ordenamientos', 27.526534543323102), ('inclusión', 25.80922310283219)]
Topic 5:
[('género', 90.26950785407557), ('garantizar', 76.29735575292742), ('salud', 15.686

### Test with real initiatives

In [267]:
# Pick one random row of inipros and wrap its "sintesis" text in a one-row frame.
# .iloc selects by position, so the lookup works whatever the index labels are
# (a plain [random_num] is a label lookup and depends on the index being 0..n-1).
random_num = np.random.randint(0, len(inipros))
example = [inipros["sintesis"].iloc[random_num]]
example_df = pd.DataFrame(example, columns = ["text"])
print(example_df["text"][0])

ÚNICO. El Senado de la República exhorta respetuosamente la Secretaria de Comunicaciones y Transportes, para que, en el marco de sus atribuciones, revise las tarifas que se cobran en las carreteras de cuota, con la finalidad de evitar cobros excesivos que afecten la economía de los usuarios, y se mejore y mantenga el estado físico de la red carretera del país.


In [268]:
# Run the sampled text through the same pipeline used for training: clean it, vectorize it
# with the already-fitted vectorizer, then transform it with the fitted LDA model.
# The last line displays the resulting array (one row for the example, one column per topic).
clean_example = clean(example_df["text"])
example_vectorized = vectorizer.transform(clean_example)
lda_vectors = lda_model.transform(example_vectorized)
lda_vectors

array([[0.01157094, 0.01157095, 0.01157098, 0.01157095, 0.01157094,
        0.01157095, 0.30414982, 0.01157098, 0.01157097, 0.01157095,
        0.01157095, 0.01157097, 0.01157104, 0.01157097, 0.54542765]])