# Mexican Senate Data

## Importing necessary libraries

In [1]:
# importing required modules
import requests
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import numpy as np
from lxml import etree
import re
from selenium import webdriver
import time

## Senator Database, Exports to CSV in data folder.

### Importing senator table

In [2]:
def get_senators():
    """Download the Senate open-data senator roster as a DataFrame.

    The ``idSenador`` column is renamed to ``senator_id``; every other column
    is kept exactly as delivered by the JSON endpoint.

    Returns:
        pandas.DataFrame built from the endpoint's JSON payload.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if the server stops responding.
    """
    senators_url = 'https://www.senado.gob.mx/65/datosAbiertos/senadoresDatosAb.json'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(senators_url, timeout=60)
    # Surface the HTTP status instead of an opaque JSON decoding error.
    response.raise_for_status()
    senators = pd.DataFrame.from_dict(response.json())
    return senators.rename(columns={"idSenador": "senator_id"})

In [3]:
# Download the senator roster once; later cells add columns to this DataFrame.
senators = get_senators()

In [4]:
#Full name (first + last names, each stripped) is the key used to join with the initiatives+proposals table.
first_names = senators["Nombre"].str.strip()
last_names = senators["Apellidos"].str.strip()
senators["senadores"] = first_names + " " + last_names

### Importing attendance data and adding to senator table

In [5]:
def get_senator_attendance():
    """Scrape each senator's attendance page and return a mean attendance score.

    One page is requested per ``senator_id`` returned by ``get_senators()``.
    Every session found on a page is scored 1 when its record text is exactly
    "Asistencia" and 0 otherwise; the scores are then averaged per senator.

    Returns:
        pandas.DataFrame with one row per senator and the columns
        ``senator_id``, ``full_name``, ``Fraccion``, ``Estado``,
        ``tipoEleccion`` and ``attendance_score`` (the mean score).

    Raises:
        IndexError: if a page lists more session dates than attendance records.
    """
    senators = get_senators()

    # Both node sets live under the same table body; only the leaf tag differs.
    table_xpath = '//*[@id="imPage"]/div[7]/div[2]/div/div[2]/section/div/div/table/tbody'

    # Collect plain tuples and build the DataFrame once, instead of growing it
    # cell by cell, which is slow and hard to follow.
    rows = []
    for sen in senators["senator_id"].tolist():
        url = f'https://www.senado.gob.mx/65/asistencias/{sen}#info'
        html = requests.get(url, timeout=60)
        content = BeautifulSoup(html.text, 'html.parser')
        content_x = etree.HTML(str(content))
        dates = content_x.xpath(f'{table_xpath}//a')
        att_records = content_x.xpath(f'{table_xpath}//strong')
        if len(att_records) < len(dates):
            # Pairing is positional, so a shortfall means the page layout changed.
            raise IndexError(
                f"Attendance page for senator {sen} lists {len(dates)} sessions "
                f"but only {len(att_records)} attendance records."
            )
        rows.extend(
            (sen, date.text, record.text)
            for date, record in zip(dates, att_records)
        )

    senator_attendance = pd.DataFrame(
        rows, columns=["senator_id", "session_date", "attendance_record"]
    )
    senator_attendance["attendance_score"] = (
        senator_attendance["attendance_record"] == "Asistencia"
    ).astype(int)

    senator_attendance = pd.merge(
        senator_attendance,
        senators[['senator_id', 'Fraccion', 'Estado', 'Apellidos', 'Nombre', 'tipoEleccion']],
        on='senator_id',
        how='left',
    )
    senator_attendance["full_name"] = senator_attendance['Nombre'] + " " + senator_attendance['Apellidos']

    # Averaging the 0/1 scores gives the share of sessions attended.
    senator_attendance = senator_attendance.groupby(
        ['senator_id', 'full_name', 'Fraccion', 'Estado', 'tipoEleccion'],
        as_index=False,
    )[['attendance_score']].mean()

    return senator_attendance

In [6]:
# Requests one attendance page per senator, so this cell issues many HTTP requests.
senator_attendance = get_senator_attendance()

In [7]:
# Left join keeps every senator, including those with no attendance score.
attendance_scores = senator_attendance[["senator_id", "attendance_score"]]
senators = pd.merge(senators, attendance_scores, how="left", on="senator_id")

### Importing initiatives and proposals, concatenating both and adding senator ids

In [8]:
def get_initiatives():
    """Download the initiatives open-data files and return them as one DataFrame.

    The ``iniciativa_64`` and ``iniciativa_65`` JSON files are fetched in that
    order and stacked. ``fecha_presentacion`` and ``fecha_aprobacion`` are
    parsed as datetimes (unparseable values become ``NaT``) and the ``id``
    column becomes the index.

    Returns:
        pandas.DataFrame indexed by ``id``.

    Raises:
        requests.HTTPError: if either endpoint answers with an error status.
        requests.Timeout: if a server stops responding.
    """
    urls = (
        'https://www.senado.gob.mx/65/datosAbiertos/iniciativa_64.json',
        'https://www.senado.gob.mx/65/datosAbiertos/iniciativa_65.json',
    )

    frames = []
    for url in urls:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get(url, timeout=60)
        # Surface the HTTP status instead of an opaque JSON decoding error.
        response.raise_for_status()
        frames.append(pd.DataFrame.from_dict(response.json()))

    initiatives = pd.concat(frames)

    for date_column in ('fecha_presentacion', 'fecha_aprobacion'):
        initiatives[date_column] = pd.to_datetime(initiatives[date_column], errors='coerce')

    return initiatives.set_index('id')

In [9]:
def get_proposals():
    """Download the proposals open-data files and return them as one DataFrame.

    The ``proposicion_64`` and ``proposicion_65`` JSON files are fetched in
    that order and stacked. ``fecha_presentacion`` and ``fecha_aprobacion``
    are parsed as datetimes (unparseable values become ``NaT``) and the ``id``
    column becomes the index.

    Returns:
        pandas.DataFrame indexed by ``id``.

    Raises:
        requests.HTTPError: if either endpoint answers with an error status.
        requests.Timeout: if a server stops responding.
    """
    urls = (
        'https://www.senado.gob.mx/65/datosAbiertos/proposicion_64.json',
        'https://www.senado.gob.mx/65/datosAbiertos/proposicion_65.json',
    )

    frames = []
    for url in urls:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get(url, timeout=60)
        # Surface the HTTP status instead of an opaque JSON decoding error.
        response.raise_for_status()
        frames.append(pd.DataFrame.from_dict(response.json()))

    proposals = pd.concat(frames)

    for date_column in ('fecha_presentacion', 'fecha_aprobacion'):
        proposals[date_column] = pd.to_datetime(proposals[date_column], errors='coerce')

    return proposals.set_index('id')

In [10]:
#Stack initiatives and proposals into one DataFrame; both frames are indexed by their 'id' column.
# NOTE(review): an initiative and a proposal could share the same id, giving repeated index labels - confirm.
initiatives = get_initiatives()
proposals = get_proposals()
inipros = pd.concat([initiatives, proposals])

  initiatives['fecha_aprobacion'] = pd.to_datetime(initiatives['fecha_aprobacion'],errors='coerce')


In [11]:
# Row/column counts before senator names are matched; the rows include proposals as well as initiatives.
print(f"Inipros df has {inipros.shape[0]} initiatives with {inipros.shape[1]} features.")

Inipros df has 9396 initiatives with 13 features.


In [12]:
#Creates a 1:1 relationship between initiative/proposal and senator (for items proposed by more than one senator).
def _extract_senator_names(raw_senators):
    """Return the senator names found in one '<br>'-separated ``senadores`` string.

    Each entry has everything from its first "(" onwards removed and is
    stripped. Entries without a parenthesis are kept whole, and empty entries
    (such as the one produced by a trailing '<br>') are discarded.
    """
    names = (
        entry.partition("(")[0].strip()
        for entry in raw_senators.strip().split("<br>")
    )
    # Filtering on emptiness, rather than always dropping the last element,
    # keeps the final senator when the string has no trailing '<br>'.
    return [name for name in names if name]


# Element-wise apply avoids row-by-row label assignment, which misbehaves
# when index labels repeat.
inipros["senadores"] = inipros["senadores"].apply(_extract_senator_names)

# One row per (initiative/proposal, senator) pair.
inipros = inipros.explode("senadores")

In [13]:
#Inner join on the full name keeps only rows whose senator appears in the senator table, and attaches senator_id.
senator_keys = senators[["senadores", "senator_id"]]
inipros = pd.merge(inipros, senator_keys, on='senadores', how='inner')

In [14]:
# Row/column counts after the inner join; there is now one row per matched senator and initiative/proposal.
print(f"Inipros df has {inipros.shape[0]} initiatives with {inipros.shape[1]} features.")

Inipros df has 7832 initiatives with 14 features.


### Add list of initiative strings back to senator table

In [15]:
# Placeholder column of empty strings; the next cell replaces it with one list per senator.
senators["initiative_list"] = ""

In [16]:
#Collects every initiative/proposal synthesis per senator and stores the list on the senator table.
# Grouping once replaces a full scan of inipros for every senator, and no
# longer rebinds the global `initiatives` DataFrame to a list.
syntheses_by_senator = (
    inipros.groupby("senator_id", sort=False)["sintesis"].agg(list).to_dict()
)

# Ids are looked up through str(), as before; senators without any matching
# row get an empty list. list(...) gives every row its own list object.
senators["initiative_list"] = senators["senator_id"].map(
    lambda senator_id: list(syntheses_by_senator.get(str(senator_id), []))
)

In [17]:
#Dummy summary: all of a senator's syntheses joined with no separator, to be replaced by BERT or BETO summaries.
senators["initatives_summary_dummy"] = senators["initiative_list"].map("".join)

### Export file to CSV in data folder

In [26]:
import os

# The CSV goes into the 'data' folder located beside the current working directory.
current_path = os.getcwd()
parent_directory = os.path.dirname(current_path)
project_path = os.path.join(parent_directory, 'data')

output_file = os.path.join(project_path, 'senators_data.csv')
senators.to_csv(output_file)

# Classifying initiatives & proposals into topics with LDA (DEPRECATED)

## LDA Approach

### Preprocessing text

In [None]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [None]:
def clean(column):
    """Normalise a Series of texts for topic modelling.

    Each text has the characters of ``string.punctuation`` removed, is
    lower-cased and stripped, loses its digits and is tokenized. Spanish
    stopwords and a list of terms that are too frequent in initiative language
    are dropped, the remaining tokens are lemmatized (as verbs, then as nouns)
    and re-joined with single spaces.

    Args:
        column: pandas Series of strings.

    Returns:
        pandas Series of cleaned strings.
    """
    # One translation table removes all punctuation in a single pass per text.
    # NOTE(review): string.punctuation is ASCII-only, so Spanish marks such as
    # the inverted question/exclamation marks are not removed - confirm intent.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Stopwords plus words too frequently present in initiative language,
    # merged into one set for constant-time membership tests.
    stop_words = set(stopwords.words('spanish')) | {
        "exhorta", "modificar", "actualizar", "política", "general", "caso",
        "derecho", "materia", "virtud", "referencias", "cambiar", "deberán",
        "día", "año", "denominación", "distrito", "cámara", "senadores",
        "normativa", "senado", "objetivo", "cumplimiento", "ordenamiento",
        "república", "reforma", "cada", "dar", "federal", "secretaría",
        "mención", "paso", "dejar", "principio", "ser", "paridad", "así",
        "derechos", "reformar", "propone", "nacional", "establecer", "méxico",
        "persona", "ley", "ciudad", "deberá", "legal", "personas",
    }

    # Built once instead of once per word.
    # NOTE(review): WordNet is an English lexicon, so this step probably leaves
    # most Spanish tokens unchanged - confirm.
    lemmatizer = WordNetLemmatizer()

    def _clean_text(text):
        """Apply the whole cleaning pipeline to one text."""
        text = text.translate(punctuation_table).lower().strip()
        text = re.sub(r'[0-9]', '', text)
        tokens = [word for word in word_tokenize(text) if word not in stop_words]
        # Verb lemma first, then noun lemma of that result.
        lemmas = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos="v"), pos="n")
            for word in tokens
        ]
        return " ".join(lemmas)

    return column.apply(_clean_text)

In [None]:
# Adds a cleaned copy of each synthesis as a new column; the raw 'sintesis' column is left as is.
inipros["sintesis_clean"] = clean(inipros["sintesis"])

### Training vectorization model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Number of topics the LDA model is asked to find.
n_components = 15

# TF-IDF matrix of the cleaned syntheses, with the vectorizer left at its defaults.
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(inipros["sintesis_clean"])

# Instantiate the LDA model, then fit it on the vectorized documents.
lda_model = LatentDirichletAllocation(n_components=n_components)
lda_model.fit(vectorized_text)

### Visualize potential topics

In [None]:
def print_topics(model, vectorizer, n_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a "Topic <n>:" header is printed,
    followed by a list of ``(term, weight)`` tuples ordered from the largest
    weight downwards.

    Args:
        model: fitted object exposing ``components_`` (one row per topic).
        vectorizer: fitted object exposing ``get_feature_names_out()``.
        n_top_words: how many terms to show per topic (default 5).
    """
    # Fetch the vocabulary once rather than once per printed term.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = np.argsort(topic)[::-1][:n_top_words]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [None]:
# Print the top-weighted terms of each fitted topic for a manual sanity check.
print_topics(lda_model, vectorizer)

### Test with real initiatives

In [None]:
# Pick one synthesis at random. The number is a row position, so it is read
# with .iloc; a label lookup would only work while the index is exactly 0..n-1.
random_num = np.random.randint(0, len(inipros))
example = [inipros["sintesis"].iloc[random_num]]
example_df = pd.DataFrame(example, columns = ["text"])
print(example_df["text"][0])

In [None]:
# Push the sampled text through the same cleaning step and the already-fitted
# vectorizer (transform only, no refit), then through the fitted LDA model.
clean_example = clean(example_df["text"])
example_vectorized = vectorizer.transform(clean_example)
lda_vectors = lda_model.transform(example_vectorized)
# Displayed as the cell output; presumably one weight per LDA topic - confirm.
lda_vectors