# Introducción a la Ciencia de Datos: Tarea 2

Este notebook contiene el código de base para realizar la Tarea 2 del curso. Puede copiarlo en su propio repositorio y trabajar sobre el mismo.
Las **instrucciones para ejecutar el notebook** están en la [página inicial del repositorio](https://gitlab.fing.edu.uy/maestria-cdaa/intro-cd/).

**Se espera que no sea necesario revisar el código para corregir la tarea**, ya que todos los resultados y análisis relevantes deberían estar en el **informe en formato PDF**.

## Cargar dependencias
Para esta tarea, se han agregado algunos requerimientos, asegúrese de instalarlos (puede usar el mismo entorno virtual de la Tarea 1):

In [None]:
%pip install jupyter pandas "sqlalchemy<2.0" pymysql seaborn pillow scikit-learn

In [1]:
from time import time
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

## Conexión a la Base y Lectura de Datos

In [3]:
# Creamos el directorio Tarea_1/data/shakespeare
data_dir = Path("data") / "shakespeare"
data_dir.mkdir(parents=True, exist_ok=True)


def load_table(table_name, engine):
    """
    Leer la tabla con SQL y guardarla como CSV,
    o cargarla desde el CSV si ya existe
    """
    path_table = data_dir / f"{table_name}.csv"
    if not path_table.exists():
        print(f"Consultando tabla con SQL: {table_name}")
        t0 = time()
        with engine.connect() as conn:
            df_table = pd.read_sql(sql=f"SELECT * FROM {table_name}", con=conn.connection)
        t1 = time()
        print(f"Tiempo: {t1 - t0:.1f} segundos")

        print(f"Guardando: {path_table}\n")
        df_table.to_csv(path_table)
    else:
        print(f"Cargando tabla desde CSV: {path_table}")
        df_table = pd.read_csv(path_table, index_col=[0])
    return df_table


print("Conectando a la base...")
conn_str = "mysql+pymysql://guest:relational@db.relational-data.org:3306/Shakespeare"
engine = create_engine(conn_str)

# DataFrame con todas las obras:
df_works = load_table("works", engine)

# Todos los párrafos de todas las obras
df_paragraphs = load_table("paragraphs", engine)

# Todos los párrafos de todas las obras
df_chapters = load_table("chapters", engine)

# Todos los párrafos de todas las obras
df_characters = load_table("characters", engine)

Conectando a la base...
Consultando tabla con SQL: works


  df_table = pd.read_sql(sql=f"SELECT * FROM {table_name}", con=conn.connection)


Tiempo: 3.6 segundos
Guardando: data/shakespeare/works.csv

Consultando tabla con SQL: paragraphs
Tiempo: 2.6 segundos
Guardando: data/shakespeare/paragraphs.csv

Consultando tabla con SQL: chapters
Tiempo: 0.5 segundos
Guardando: data/shakespeare/chapters.csv

Consultando tabla con SQL: characters
Tiempo: 0.6 segundos
Guardando: data/shakespeare/characters.csv



In [4]:
df_paragraphs

Unnamed: 0,id,ParagraphNum,PlainText,character_id,chapter_id
0,630863,3,"[Enter DUKE ORSINO, CURIO, and other Lords; Mu...",1261,18704
1,630864,4,"If music be the food of love, play on;\nGive m...",840,18704
2,630865,19,"Will you go hunt, my lord?",297,18704
3,630866,20,"What, Curio?",840,18704
4,630867,21,The hart.,297,18704
...,...,...,...,...,...
35460,666323,3460,"That she is living,\nWere it but told you, sho...",866,19648
35461,666324,3467,"You gods, look down\nAnd from your sacred vial...",584,19648
35462,666325,3475,There's time enough for that;\nLest they desir...,866,19648
35463,666326,3483,"O, peace, Paulina!\nThou shouldst a husband ta...",667,19648


## Limpieza de Texto

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Removes stopwords using nltk
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
stop_words.update(['thou','thee'])

p_stemmer = PorterStemmer()

def remove_stopwords(text):
    return " ".join([word.strip() for word in text.split() if word.lower() not in stop_words])

def transform_contractions(text):
    text = text.replace("'t", " not")
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'m", " am")
    text = text.replace("'ve", " have")
    text = text.replace("'ll", " will")
    text = text.replace("'d", " would")
    return text

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', ' ', text)

def stemmer(text):
    return ' '.join([p_stemmer.stem(word) for word in text.split(' ')])

def clean_text(df, column_name, stop=False, stem=False):
    # Convertir todo a minúsculas
    result = df[column_name].str.lower()
    # Quitar contracciones
    result = result.apply(transform_contractions)
    # Quitar signos de puntuación y cambiarlos por espacios (" ")
    result = result.apply(remove_punctuation)
    # Quitar stopwords
    if stop:
        result = result.apply(remove_stopwords)
    if stem:
        # Aplicación de stemmer
        result = result.apply(stemmer)

    return result

# Creamos una nueva columna CleanText a partir de PlainText
df_paragraphs["CleanText"] = clean_text(df_paragraphs, "PlainText", stop=True, stem=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leandro.dominguez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leandro.dominguez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Agregamos personajes, obras y géneros en el mismo dataset
df_dataset = df_paragraphs.merge(df_chapters.set_index("id")["work_id"], left_on="chapter_id", right_index=True)
df_dataset = df_dataset.merge(df_works.set_index("id")[["Title", "GenreType"]], left_on="work_id", right_index=True)
df_dataset = df_dataset.merge(df_characters.set_index('id')["CharName"], left_on="character_id", right_index=True).sort_index()
df_dataset = df_dataset[["CleanText", "CharName", "Title", "GenreType"]]

# Usaremos sólo estos personajes
characters = ["Antony", "Cleopatra", "Queen Margaret"]
df_dataset = df_dataset[df_dataset["CharName"].isin(characters)]

df_dataset

Unnamed: 0,CleanText,CharName,Title,GenreType
2058,love inde tell much,Cleopatra,Antony and Cleopatra,Tragedy
2059,beggari love reckon would,Antony,Antony and Cleopatra,Tragedy
2060,set bourn far belov,Cleopatra,Antony and Cleopatra,Tragedy
2061,must need find new heaven new earth,Antony,Antony and Cleopatra,Tragedy
2064,grate sum,Antony,Antony and Cleopatra,Tragedy
...,...,...,...,...
27583,hadst clarenc richard kill would forth kennel ...,Queen Margaret,Richard III,History
27585,bear hungri reveng cloy behold thi edward dead...,Queen Margaret,Richard III,History
27587,call would vain flourish fortun call would poo...,Queen Margaret,Richard III,History
27589,forbear sleep night fast day compar dead happi...,Queen Margaret,Richard III,History


In [10]:
# Párrafos por cada personaje seleccionado
df_dataset["CharName"].value_counts()

CharName
Antony            253
Cleopatra         204
Queen Margaret    169
Name: count, dtype: int64

## Dataset y Features de texto

In [11]:
X = df_dataset["CleanText"].to_numpy()
y = df_dataset["CharName"].to_numpy()

In [23]:
from sklearn.model_selection import train_test_split

# stratify=y asegura que la división de los datos mantenga la proporción de las clases en ambos conjuntos (entrenamiento y prueba).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=5)

print(f"Tamaños de Train/Test: {len(X_train)}/{len(X_test)}", 'Porcentaje de test:', round(len(X_test) /(len(X_train)+len(X_test)), 2))

Tamaños de Train/Test: 438/188 Porcentaje de test: 0.3


### Verificación que el balance de párrafos es similar en train y test

In [34]:
import plotly.graph_objects as go

In [35]:
# Contamos la cantidad de párrafos para cada personaje en train y luego en test
train_percentages = round(pd.Series(y_train).value_counts()/len(y_train),2)
test_percentages = round(pd.Series(y_test).value_counts()/len(y_test),2)

# Graficar ambos porcentajes en una barchart dentro del mismo gráfico con distintos colores usando plotly
fig = go.Figure()

fig.add_trace(go.Bar(
    x=train_percentages.index,
    y=train_percentages.values,
    name='Train',
    marker_color='blue'
))

fig.add_trace(go.Bar(
    x=test_percentages.index,
    y=test_percentages.values,
    name='Test',
    marker_color='red'
))

# Añadir título y actualizar layout
fig.update_layout(
    title='Proporción de personajes en Train y Test',
    xaxis_title='Label',
    yaxis_title='Proporción',
    barmode='group'
)

fig.show()

### Efectivamente, salvo decimales son casi iguales

### Conteo de palabras y TF-IDF

In [61]:
count_vect = CountVectorizer(stop_words=None, ngram_range=(1,3))
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts

<438x15632 sparse matrix of type '<class 'numpy.int64'>'
	with 20349 stored elements in Compressed Sparse Row format>

In [62]:
tf_idf = TfidfTransformer(use_idf=False)
X_train_tf = tf_idf.fit_transform(X_train_counts)
X_train_tf

<438x15632 sparse matrix of type '<class 'numpy.float64'>'
	with 20349 stored elements in Compressed Sparse Row format>

### Reducción de dimensionalidad

In [63]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_red = pca.fit_transform(X_train_tf)

In [64]:
fig = go.Figure()

# Añadir trazos para cada clase
for character in np.unique(y_train):
    mask_train = y_train == character
    fig.add_trace(go.Scatter(
        x=X_train_red[mask_train, 0],
        y=X_train_red[mask_train, 1],
        mode='markers',
        name=f'Character {character}'
    ))

# Configurar el layout
fig.update_layout(
    title='PCA por personaje',
    xaxis_title='Componente Principal 1',
    yaxis_title='Componente Principal 2'
)

# Mostrar el gráfico
fig.show()


## Modelos de Clasificación

In [65]:
bayes_clf = MultinomialNB().fit(X_train_tf, y_train)

# Ver las primeras 10 predicciones de train
y_pred_train = bayes_clf.predict(X_train_tf)
y_pred_train[:10]

array(['Antony', 'Cleopatra', 'Antony', 'Antony', 'Antony', 'Antony',
       'Cleopatra', 'Cleopatra', 'Antony', 'Antony'], dtype='<U14')

In [66]:
def get_accuracy(y_true, y_pred):
    return (y_true == y_pred).sum() / len(y_true)

get_accuracy(y_train, y_pred_train)

0.9657534246575342

In [None]:
# TODO: Predecir para test y ver la matriz de confusión, y reportar accuracy

# X_test_counts = ...
# X_test_tfidf = ...
# y_test_pred = ...


### Búsqueda de hiper-parámetros con Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

# TODO: Agregar más variantes de parámetros que les parezcan relevantes
param_sets = [{"stop_words": None, "ngram": (1,2), "idf": True},
             {"stop_words": None, "ngram": (1,1), "idf": False}]

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Ahora usaremos train/validation/test
# Por lo tanto le renombramos train+validation = dev(elopment) dataset
X_dev = X_train
y_dev = y_train

# # Para evitar errores
# del X_train
# del y_train

for params in param_sets:
    
    # Transormaciones a aplicar (featurizers)
    count_vect = CountVectorizer(stop_words=params["stop_words"], ngram_range=params["ngram"])
    tf_idf = TfidfTransformer(use_idf=params["idf"])
    
    for train_idxs, val_idxs in skf.split(X_dev, y_dev):
        
        # Train y validation para el split actual
        X_train_ = X_dev[train_idxs]
        y_train_ = y_dev[train_idxs]
        X_val = X_dev[val_idxs]
        y_val = y_dev[val_idxs]
        
        # Ajustamos y transformamos Train
        X_train_counts = count_vect.fit_transform(X_train_)
        X_train_tf = tf_idf.fit_transform(X_train_counts)
        
        # TODO: Completar el código para entrenar y evaluar 
        
        # Entrenamos con Train
        # bayes_clf = ...

        # Transformamos Validation
        # X_val_counts = ...
        # X_val_tfidf = ...
        
        # Predecimos y evaluamos en Validation
        y_pred_val = bayes_clf.predict(X_val_tfidf)
        acc = get_accuracy(y_val, y_pred_val)
        print(f"{acc=:.4f} {params=}")


### (Opcional) Comparativa con Fasttext

In [None]:
!pip install fasttext

In [None]:
import fasttext

y_train_s = np.char.replace(y_train.astype(str), " ", "_").astype(object)
y_test_s = np.char.replace(y_test.astype(str), " ", "_").astype(object)

# Convertimos al formato de fasttext: archivo de texto donde cada línea es:
# __label__<label> TEXTO
Xytrains = "__label__" + y_train_s.astype(object) + " " + X_train
Xytests = "__label__" + y_test_s.astype(object) + " " + X_test
np.savetxt(data_dir / "train.txt", Xytrains, fmt="%s")
np.savetxt(data_dir / "test.txt", Xytests, fmt="%s")

Xytests[0]

In [None]:
model = fasttext.train_supervised(input=str(data_dir / "train.txt"), epoch=100, wordNgrams=2)
model.test(str(data_dir / "test.txt"))

In [None]:
y_out = model.predict(list(X_test))
y_pred_test = [y[0].replace("__label__", "") for y in y_out[0]]
    
print(get_accuracy(y_test_s, y_pred_test))