# Introducción a la Ciencia de Datos: Tarea 2

Este notebook contiene el código de base para realizar la Tarea 2 del curso. Puede copiarlo en su propio repositorio y trabajar sobre el mismo.
Las **instrucciones para ejecutar el notebook** están en la [página inicial del repositorio](https://gitlab.fing.edu.uy/maestria-cdaa/intro-cd/).

**Se espera que no sea necesario revisar el código para corregir la tarea**, ya que todos los resultados y análisis relevantes deberían estar en el **informe en formato PDF**.

## Cargar dependencias
Para esta tarea se han agregado algunos requerimientos; asegúrese de instalarlos (puede usar el mismo entorno virtual de la Tarea 1):

In [None]:
# !pip install jupyter pandas "sqlalchemy<2.0" pymysql seaborn pillow scikit-learn

In [None]:
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sqlalchemy import create_engine

## Conexión a la Base y Lectura de Datos

In [None]:
# Local directory where the tables are cached as CSV files (created if it does not exist).
data_dir = Path("data") / "shakespeare"
data_dir.mkdir(parents=True, exist_ok=True)


def load_table(table_name, engine):
    """
    Return the table `table_name` as a DataFrame, using a local CSV cache.

    If ``data_dir / "<table_name>.csv"`` already exists it is read back with
    its first column as the index and `engine` is not touched. Otherwise the
    whole table is fetched through `engine` with ``SELECT *``, the query time
    is printed, and the result is written to that CSV before being returned.
    """
    csv_path = data_dir / f"{table_name}.csv"

    # Cache hit: nothing to query.
    if csv_path.exists():
        print(f"Cargando tabla desde CSV: {csv_path}")
        return pd.read_csv(csv_path, index_col=[0])

    # Cache miss: query the database and time the round trip.
    print(f"Consultando tabla con SQL: {table_name}")
    started = time()
    table = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    elapsed = time() - started
    print(f"Tiempo: {elapsed:.1f} segundos")

    print(f"Guardando: {csv_path}\n")
    table.to_csv(csv_path)
    return table


print("Conectando a la base...")
# NOTE(review): host, user and password are hard-coded in the URL; presumably a public
# guest account — confirm before reusing this pattern with any real credentials.
conn_str = "mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/Shakespeare"
engine = create_engine(conn_str)

# All paragraphs of all works
df_paragraphs = load_table("paragraphs", engine)

df_characters = load_table("characters", engine)

df_works = load_table("works", engine)

df_chapters = load_table("chapters", engine)

In [None]:
# Display the paragraphs table as loaded.
df_paragraphs

## Limpieza de Texto

In [None]:
# TODO: Actualizar con su versión de clean_text() de la Tarea_1


def clean_text(df, column_name, punctuation=None):
    """
    Lower-case a text column and replace punctuation marks with spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the column to clean.
    punctuation : iterable of str, optional
        Literal substrings to replace with a single space, applied in the
        given order. Defaults to the list used so far in this notebook.

    Returns
    -------
    pandas.Series
        The cleaned text, aligned with ``df``'s index.
    """
    if punctuation is None:
        # TODO: complete the missing punctuation marks
        punctuation = ["[", "\n", ",", ";", "]", ".", ":", "!", "¡", "?", "¿", "-", " '", "' "]

    # Lower-case everything
    result = df[column_name].str.lower()

    # Replace each punctuation mark with a space (" ").
    for punc in punctuation:
        # regex=False: the marks are literals. Without it, whether ".", "?" or "["
        # are treated as regular expressions depends on the installed pandas version.
        result = result.str.replace(punc, " ", regex=False)
    return result

# Create a new CleanText column from PlainText (adds the column to df_paragraphs in place)
df_paragraphs["CleanText"] = clean_text(df_paragraphs, "PlainText")


def clean_text_stopwords(df, column_name, stop_words=None):
    """
    Lower-case a text column and remove whitespace-delimited stop words.

    A stop word is removed wherever it appears as a whole token: at the start
    of the text, at the end, in the middle, and when repeated consecutively.
    The previous ``" word "`` substring replacement missed the start, end and
    consecutive-repeat cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the column to process.
    stop_words : iterable of str, optional
        Words to remove, compared against the lower-cased text. When omitted,
        the module-level ``stops`` collection is used, as before.

    Returns
    -------
    pandas.Series
        The processed text, aligned with ``df``'s index.

    Raises
    ------
    NameError
        If `stop_words` is omitted and no global ``stops`` is defined.
    """
    import re

    if stop_words is None:
        try:
            stop_words = stops
        except NameError:
            raise NameError(
                "clean_text_stopwords: pass `stop_words` or define the global `stops` "
                "(e.g. set(stopwords.words('english'))) before calling"
            ) from None

    result = df[column_name].str.lower()

    # Deterministic order, longest first, so the alternation is stable across runs.
    words = sorted({str(w) for w in stop_words if w}, key=lambda w: (-len(w), w))
    if not words:
        return result

    # Token must start at the string start or after whitespace, and be followed by
    # whitespace (consumed, so no double space is left) or by the end of the text.
    pattern = r"(?<!\S)(?:" + "|".join(re.escape(w) for w in words) + r")(?:\s+|$)"
    return result.str.replace(pattern, "", regex=True)

# nltk.download('stopwords')
# stops = set(stopwords.words('english'))
# df_paragraphs["CleanText"] = clean_text_stopwords(df_paragraphs, "CleanText")

# Show the original and the cleaned text side by side
df_paragraphs[["PlainText", "CleanText"]]


In [None]:
# Attach work id (via chapter), work title/genre and character name to every paragraph,
# then keep only the four columns used downstream.
df_dataset = df_paragraphs.merge(df_chapters.set_index("id")["work_id"], left_on="chapter_id", right_index=True)
df_dataset = df_dataset.merge(df_works.set_index("id")[["Title", "GenreType"]], left_on="work_id", right_index=True)
df_dataset = df_dataset.merge(df_characters.set_index('id')["CharName"], left_on="character_id", right_index=True).sort_index()
df_dataset = df_dataset[["CleanText", "CharName", "Title", "GenreType"]]

# Only these characters are used
characters = ["Antony", "Cleopatra", "Queen Margaret"]
df_dataset = df_dataset[df_dataset["CharName"].isin(characters)]

df_dataset

In [None]:
# Number of paragraphs for each selected character
df_dataset["CharName"].value_counts()

## Dataset y Features de texto

In [None]:
# Features are the cleaned paragraph texts; labels are the character names.
X = df_dataset["CleanText"].to_numpy()
y = df_dataset["CharName"].to_numpy()

In [None]:

# -> Define X_train, X_test, y_train, y_test

# X_train, X_test, y_train, y_test = ...

# 70/30 split with a fixed seed so the partition is the same on every run.
# NOTE(review): no `stratify=y` is passed, so class proportions may differ between
# train and test — confirm this is intended given the unequal class sizes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=345)

print(f"Tamaños de Train/Test: {len(X_train)}/{len(X_test)}")


In [None]:
# Stacked bar chart: paragraphs per character, train at the bottom and test on top.
nombres_cat = ["Cleopatra", "Antony", "Queen Margaret"]

# One count per character, in the same order as nombres_cat.
Cleo_train, Anto_train, Que_train = (int(np.count_nonzero(y_train == name)) for name in nombres_cat)
Cleo_test, Anto_test, Que_test = (int(np.count_nonzero(y_test == name)) for name in nombres_cat)

train_counts = [Cleo_train, Anto_train, Que_train]
test_counts = [Cleo_test, Anto_test, Que_test]

plt.bar(nombres_cat, train_counts, color='#F29727', label="Train", width=0.5)
plt.bar(nombres_cat, test_counts, color='#1B6B93', bottom=train_counts, label="Test", width=0.5)
plt.xlabel("Personajes")
plt.ylabel("Cantidad de parrafos")
plt.title('Proporcion de parrafos por personaje en los conjuntos de datos')
plt.legend()
plt.show()

# nombres_cat = ["Train","Test"]
# plt.bar(nombres_cat,[Cleo_train, Cleo_test], color = '#F29727',label="Cleopatra",width=0.35)
# plt.bar(nombres_cat,[Anto_train, Anto_test], color = '#1B6B93', bottom=[Cleo_train, Cleo_test],label="Antony",width=0.35)
# plt.bar(nombres_cat,[Que_train, Que_test], color = '#88DC65', bottom=[Anto_train+Cleo_train, Anto_test+Cleo_test],label="Queen Margaret",width=0.35)
# plt.xlabel("Personajes")
# plt.ylabel("Cantidad de parrafos")
# plt.title('Proporcion de parrafos por personaje en los conjuntos de datos')
# plt.legend()
# plt.show()


# categories = [" ","Antony","Queen Margaret", "Cleopatra"]

# radar_train = [Cleo_train, Anto_train, Que_train,Cleo_train]
# radar_test = [Cleo_test, Anto_test, Que_test,Cleo_test]

# #label_loc = np.linspace(start=0, stop=2 * np.pi, num=len(radar_train))
# label_loc = np.array([np.pi/2, np.pi/2 + 2*np.pi/3, np.pi/2 + 4*np.pi/3, np.pi/2 ])

# plt.figure(figsize=(8, 8))
# plt.subplot(polar=True)
# plt.plot(label_loc, radar_train, label='Train')
# plt.plot(label_loc, radar_test, label='Test')

# plt.title(' ', size=20)
# lines, labels = plt.thetagrids(np.degrees(label_loc), labels=categories)
# plt.legend()
# plt.show()

### Conteo de palabras y TF-IDF

In [None]:
# Bag-of-words counts over unigrams and bigrams, with no stop-word filtering.
# The vocabulary is learned from the training texts only.
count_vect = CountVectorizer(stop_words=None, ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts
  ##TODO:
    ##Explain in the report the techniques proposed in the assignment.

In [None]:
# Re-weight the raw counts with TF-IDF; the IDF weights are fitted on the training counts.
tf_idf = TfidfTransformer(use_idf=True)
X_train_tf = tf_idf.fit_transform(X_train_counts)
X_train_tf

  ##TODO:
    ##Explain in the report the techniques proposed in the assignment.

### Reducción de dimensionalidad

In [None]:
# TODO: Run PCA on the training data

# Keep the first 10 principal components
reductor = PCA(n_components=10)

# Fit on train and project it; the TF-IDF matrix is converted to a dense array first
X_train_red = reductor.fit_transform(X_train_tf.toarray())

In [None]:
# Explained variance per principal component (bars) and its running total (step line).
var_ratio = reductor.explained_variance_ratio_
acumulacion_varianza = np.cumsum(var_ratio)
componentes = range(1, len(var_ratio) + 1)  # components are numbered from 1 on the x axis

plt.bar(componentes, var_ratio, alpha=0.5, align='center', label='Varianza explicada individual')
plt.step(componentes, acumulacion_varianza, where='mid', label='Acumulacion de varianza explicada')
# Axis labels fixed: they previously read "componetes" and "Ralacion".
plt.xlabel("Numero de componentes")
plt.ylabel("Relacion de la varianza explicada de los datos")
plt.legend(loc='best')
plt.xticks(componentes)

plt.show()


In [None]:
# Scatter plot of the first two PCA components, one colour per character
fig, ax = plt.subplots(figsize=(6, 6))
for character in np.unique(y_train):
    # Boolean mask selecting the training rows spoken by this character
    mask_train = y_train == character
    ax.scatter(X_train_red[mask_train, 0], X_train_red[mask_train, 1], label=character)

ax.set_title("PCA por personaje")
ax.legend()

## Modelos de Clasificación

In [None]:
# Multinomial Naive Bayes with default parameters, trained on the TF-IDF features
bayes_clf = MultinomialNB().fit(X_train_tf, y_train)


# Show the first 10 predictions on train
y_pred_train = bayes_clf.predict(X_train_tf)
y_pred_train[:10]

In [None]:
def get_accuracy(y_true, y_pred):
    """
    Fraction of positions where `y_pred` equals `y_true`.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples);
    they are converted to arrays so the comparison is always element-wise.

    Returns NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. A silent scalar
        comparison would otherwise report a meaningless accuracy.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # No samples: accuracy is undefined.
        return float("nan")
    return (y_true == y_pred).sum() / len(y_true)

# Print the training accuracy: only the last expression of a cell is echoed, so the
# bare call that used to be here computed the value and then discarded it.
print(f"Accuracy (train): {get_accuracy(y_train, y_pred_train):.4f}")

ConfusionMatrixDisplay.from_predictions(y_train, y_pred_train)
plt.show()


In [None]:
# Test data is only transformed with the vectorizer and TF-IDF weights fitted on train.
X_test_counts = count_vect.transform(X_test)
X_test_tf = tf_idf.transform(X_test_counts)
y_pred_test = bayes_clf.predict(X_test_tf)

# Printed explicitly: a bare call in the middle of a cell is never displayed.
# (The duplicate definition of get_accuracy that used to be here was removed.)
print(f"Accuracy (test): {get_accuracy(y_test, y_pred_test):.4f}")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test)
plt.show()

# Per-class precision / recall / F1; target_names follow the sorted label order.
print(metrics.classification_report(y_test, y_pred_test, target_names=["Antony","Cleopatra","Queen Margaret"]))

## TODO:

   ##  Explique cómo se relacionan estos valores con la matriz anterior.
   ## ¿Qué problemas puede tener el hecho de mirar sólamente el valor de accuracy?
   ## Considere qué sucedería con esta métrica si el desbalance de datos fuera aún mayor entre personajes

### Búsqueda de hiper-parámetros con Cross-Validation

In [None]:
# Hyper-parameter search for Naive Bayes with stratified k-fold cross-validation.
# Each entry configures the vectorizer (stop words, n-gram range), the TF-IDF
# transformer (idf on/off) and the Naive Bayes smoothing (alpha).
# TODO: add more parameter variants that seem relevant
param_sets = [{"stop_words": None, "ngram": (1,2), "idf": True, "alpha":1.0},
             {"stop_words": None, "ngram": (1,1), "idf": False, "alpha":1.0},
             {"stop_words": 'english', "ngram": (1,1), "idf": False, "alpha":1.0},
             {"stop_words": 'english', "ngram": (1,1), "idf": True, "alpha":1.0},
             {"stop_words": 'english', "ngram": (1,2), "idf": False, "alpha":1.0},
             {"stop_words": 'english', "ngram": (1,1), "idf": False, "alpha":2.0},
             {"stop_words": 'english', "ngram": (1,1), "idf": False, "alpha":0.5},
             {"stop_words": 'english', "ngram": (1,1), "idf": False, "alpha":0.4},
             {"stop_words": 'english', "ngram": (1,1), "idf": False, "alpha":0.35}]
ns = 5
skf = StratifiedKFold(n_splits=ns, shuffle=True, random_state=42)

# From here on we work with train/validation/test, so the previous training set
# is renamed train+validation = dev(elopment) dataset.
X_dev = X_train
y_dev = y_train

# Validation accuracies: one row per fold, one column per parameter set.
M = np.zeros((ns, len(param_sets)))
for i, params in enumerate(param_sets):

    # Featurizers for this parameter set.
    # NOTE: this overwrites the notebook-level count_vect / tf_idf / bayes_clf /
    # X_train_counts / X_train_tf; later cells must rebuild the ones they need.
    count_vect = CountVectorizer(stop_words=params["stop_words"], ngram_range=params["ngram"])
    tf_idf = TfidfTransformer(use_idf=params["idf"])

    for j, (train_idxs, val_idxs) in enumerate(skf.split(X_dev, y_dev)):

        # Train and validation subsets for the current fold
        X_train_ = X_dev[train_idxs]
        y_train_ = y_dev[train_idxs]
        X_val = X_dev[val_idxs]
        y_val = y_dev[val_idxs]

        # Fit the featurizers on this fold's train part only, then transform it
        X_train_counts = count_vect.fit_transform(X_train_)
        X_train_tf = tf_idf.fit_transform(X_train_counts)

        # Train on the fold's train part
        bayes_clf = MultinomialNB(alpha=params["alpha"]).fit(X_train_tf, y_train_)

        # Validation is only transformed, never fitted
        X_val_counts = count_vect.transform(X_val)
        X_val_tfidf = tf_idf.transform(X_val_counts)

        # Predict and score on validation
        y_pred_val = bayes_clf.predict(X_val_tfidf)
        acc = get_accuracy(y_val, y_pred_val)
        print(f"{acc=:.4f} {params=}")
        M[j, i] = acc

# One violin per parameter set, showing the spread of accuracy across folds.
fig, ax = plt.subplots()
ax.violinplot(M)
ax.set_xticks(range(1, len(param_sets) + 1))
ax.set_xlabel("Conjunto de parametros (posicion en param_sets, desde 1)")
ax.set_ylabel("Accuracy en validacion")
ax.set_title("Accuracy de validacion cruzada por conjunto de parametros")
plt.show()

In [None]:
# Final Naive Bayes model, refitted on the whole training set.
# NOTE(review): these settings match one of the entries of param_sets (alpha=0.4);
# presumably the configuration selected from the cross-validation — confirm.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1,1))
tf_idf = TfidfTransformer(use_idf=False)


X_train_counts = count_vect.fit_transform(X_train)
X_train_tf = tf_idf.fit_transform(X_train_counts)

bayes_clf = MultinomialNB(alpha=0.4).fit(X_train_tf, y_train)

# NOTE: despite the "val" in their names, these hold the TEST features. The names
# are kept because the SGD classifier cell further down reads X_val_tfidf.
X_val_counts = count_vect.transform(X_test)
X_val_tfidf = tf_idf.transform(X_val_counts)

y_pred_test = bayes_clf.predict(X_val_tfidf)

# Printed explicitly: a bare call in the middle of a cell is never displayed.
# (The duplicate definition of get_accuracy that used to be here was removed.)
print(f"Accuracy (test): {get_accuracy(y_test, y_pred_test):.4f}")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test)
plt.show()

print(metrics.classification_report(y_test, y_pred_test, target_names=["Antony","Cleopatra","Queen Margaret"]))

##TODO
    #Discuta las limitaciones de utilizar un modelo basado en bag-of-words o tf-idf en cuanto al análisis de texto.


In [None]:
# Linear classifier with hinge loss trained by SGD, on the same train features
# (X_train_tf) and test features (X_val_tfidf) as the final Naive Bayes model.
# random_state fixes the shuffling of the training data between epochs so the
# reported metrics are the same on every run.
# NOTE(review): tol=1 is a very loose stopping tolerance; training will likely stop
# after very few epochs regardless of max_iter — confirm this is intended.
SGDC_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=1000, tol=1, random_state=42).fit(X_train_tf, y_train)

y_pred_test = SGDC_clf.predict(X_val_tfidf)

# Printed explicitly: a bare call in the middle of a cell is never displayed.
# (The duplicate definition of get_accuracy that used to be here was removed.)
print(f"Accuracy (test): {get_accuracy(y_test, y_pred_test):.4f}")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test)
plt.show()

print(metrics.classification_report(y_test, y_pred_test, target_names=["Antony","Cleopatra","Queen Margaret"]))

In [None]:
# from sklearn.model_selection import GridSearchCV
# parameters = {'vect__ngram_range': [(1, 1)],'tfidf__use_idf': (True),'clf__alpha': (1e-2)}

# gs_clf = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=5, n_jobs=-1)
# gs_clf = gs_clf.fit(X_train_tf, y_train)

# y_pred_test = gs_clf.predict(X_val_tfidf)

# def get_accuracy(y_true, y_pred):
#     return (y_true == y_pred).sum() / len(y_true)

# get_accuracy(y_test, y_pred_test)
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test )
# plt.show()

# from sklearn import metrics

# print(metrics.classification_report(y_test, y_pred_test, target_names=["Antony","Cleopatra","Queen Margaret"]))

In [None]:
# Second experiment: rebuild the dataset with Falstaff in place of Antony.
# Attach work id (via chapter), work title/genre and character name to every paragraph.
df_dataset = df_paragraphs.merge(df_chapters.set_index("id")["work_id"], left_on="chapter_id", right_index=True)
df_dataset = df_dataset.merge(df_works.set_index("id")[["Title", "GenreType"]], left_on="work_id", right_index=True)
df_dataset = df_dataset.merge(df_characters.set_index('id')["CharName"], left_on="character_id", right_index=True).sort_index()
df_dataset = df_dataset[["CleanText", "CharName", "Title", "GenreType"]]

# Only these characters are used
characters = ["Falstaff", "Cleopatra", "Queen Margaret"]
df_dataset = df_dataset[df_dataset["CharName"].isin(characters)]

# Paragraphs per selected character. Printed explicitly: a bare expression in the
# middle of a cell is evaluated and discarded, so the counts were never shown.
print(df_dataset["CharName"].value_counts())


X = df_dataset["CleanText"].to_numpy()
y = df_dataset["CharName"].to_numpy()

# Same split settings (30% test, seed 345) as in the first experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=345)

print(f"Tamaños de Train/Test: {len(X_train)}/{len(X_test)}")

# Stacked bar chart: paragraphs per character, train at the bottom and test on top.
nombres_cat = ["Cleopatra", "Falstaff", "Queen Margaret"]

# One count per character, in the same order as nombres_cat.
Cleo_train, Fal_train, Que_train = (int(np.count_nonzero(y_train == name)) for name in nombres_cat)
Cleo_test, Fal_test, Que_test = (int(np.count_nonzero(y_test == name)) for name in nombres_cat)

train_counts = [Cleo_train, Fal_train, Que_train]
test_counts = [Cleo_test, Fal_test, Que_test]


plt.figure()
plt.bar(nombres_cat, train_counts, color='#F29727', label="Train", width=0.5)
plt.bar(nombres_cat, test_counts, color='#1B6B93', bottom=train_counts, label="Test", width=0.5)
plt.xlabel("Personajes")
plt.ylabel("Cantidad de parrafos")
plt.title('Proporcion de parrafos por personaje en los conjuntos de datos')
plt.legend()
plt.show()



# Featurizers are created here explicitly instead of reusing whatever count_vect /
# tf_idf an earlier cell happened to leave behind. The settings (English stop words,
# unigrams, no IDF) are those of the final Naive Bayes model.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 1))
tf_idf = TfidfTransformer(use_idf=False)

X_train_counts = count_vect.fit_transform(X_train)

X_train_tf = tf_idf.fit_transform(X_train_counts)

# PCA on the densified training features, keeping the first 10 components
reductor = PCA(n_components=10)
X_train_red = reductor.fit_transform(X_train_tf.toarray())
var_ratio = reductor.explained_variance_ratio_
acumulacion_varianza = np.cumsum(var_ratio)
componentes = range(1, len(var_ratio) + 1)  # components are numbered from 1 on the x axis

# Explained variance per component (bars) and its running total (step line)
plt.figure()
plt.bar(componentes, var_ratio, alpha=0.5, align='center', label='Varianza explicada individual')
plt.step(componentes, acumulacion_varianza, where='mid', label='Acumulacion de varianza explicada')
plt.xlabel("Numero de componentes")
plt.ylabel("Relacion de la varianza explicada de los datos")
plt.legend(loc='best')
plt.xticks(componentes)
plt.show()

# First two PCA components, one colour per character.
# plt.subplots() creates its own figure; the plt.figure() call that used to precede
# it only produced an extra, empty figure.
fig, ax = plt.subplots(figsize=(6, 6))
for character in np.unique(y_train):
    mask_train = y_train == character
    ax.scatter(X_train_red[mask_train, 0], X_train_red[mask_train, 1], label=character)
ax.legend()
ax.set_title("PCA por personaje")
plt.show()


# Naive Bayes on the Falstaff dataset; test data is only transformed with the
# featurizers fitted on train.
bayes_clf = MultinomialNB(alpha=0.4).fit(X_train_tf, y_train)
X_test_counts = count_vect.transform(X_test)
X_test_tf = tf_idf.transform(X_test_counts)
y_pred_test = bayes_clf.predict(X_test_tf)

# Printed explicitly: a bare call in the middle of a cell is never displayed.
# (The duplicate definition of get_accuracy that used to be here was removed.)
print(f"Accuracy (test): {get_accuracy(y_test, y_pred_test):.4f}")

# from_predictions creates its own figure when no `ax` is given; the plt.figure()
# call that used to precede it only produced an extra, empty figure.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test)
plt.show()

print(metrics.classification_report(y_test, y_pred_test, target_names=["Cleopatra","Falstaff","Queen Margaret"]))


### (Opcional) Comparativa con Fasttext

In [None]:
# !pip install fasttext

In [None]:
# Rebuild the original dataset (Antony, Cleopatra, Queen Margaret), because the
# previous experiment replaced df_dataset and the train/test arrays.
df_dataset = df_paragraphs.merge(df_chapters.set_index("id")["work_id"], left_on="chapter_id", right_index=True)
df_dataset = df_dataset.merge(df_works.set_index("id")[["Title", "GenreType"]], left_on="work_id", right_index=True)
df_dataset = df_dataset.merge(df_characters.set_index('id')["CharName"], left_on="character_id", right_index=True).sort_index()
df_dataset = df_dataset[["CleanText", "CharName", "Title", "GenreType"]]

# Only these characters are used
characters = ["Antony", "Cleopatra", "Queen Margaret"]
df_dataset = df_dataset[df_dataset["CharName"].isin(characters)]

# Paragraphs per selected character. Printed explicitly: a bare expression in the
# middle of a cell is evaluated and discarded, so the counts were never shown.
print(df_dataset["CharName"].value_counts())


X = df_dataset["CleanText"].to_numpy()
y = df_dataset["CharName"].to_numpy()

# Same split settings (30% test, seed 345) as in the first experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=345)


In [None]:
import fasttext

# Replace spaces in the labels with underscores so each label stays a single token
# in the "__label__<label> TEXT" line format written below.
y_train_s = np.char.replace(y_train.astype(str), " ", "_").astype(object)
y_test_s = np.char.replace(y_test.astype(str), " ", "_").astype(object)

# Convert to the fasttext format: a text file where every line is:
# __label__<label> TEXT
Xytrains = "__label__" + y_train_s.astype(object) + " " + X_train
Xytests = "__label__" + y_test_s.astype(object) + " " + X_test
np.savetxt(data_dir / "train.txt", Xytrains, fmt="%s")
np.savetxt(data_dir / "test.txt", Xytests, fmt="%s")

# Show one formatted line as a sanity check
Xytests[0]

In [None]:
# Train a supervised fastText classifier on the exported training file
# (100 epochs, word n-grams up to length 2) and evaluate it on the exported test file.
model = fasttext.train_supervised(input=str(data_dir / "train.txt"), epoch=100, wordNgrams=2)
model.test(str(data_dir / "test.txt"))

In [None]:
# Predict every test text at once. The first element of the result holds, for each
# text, a sequence whose first item is the predicted "__label__<label>" string.
y_out = model.predict(list(X_test))
predicted_labels = y_out[0]

# Strip the fastText prefix so predictions are comparable with y_test_s.
y_pred_test = [top_labels[0].replace("__label__", "") for top_labels in predicted_labels]

print(get_accuracy(y_test_s, y_pred_test))

In [None]:
# Confusion matrix and per-class report for the fastText predictions.
# from_predictions creates its own figure when no `ax` is given; the plt.figure()
# call that used to precede it only produced an extra, empty figure.
ConfusionMatrixDisplay.from_predictions(y_test_s, y_pred_test)
plt.show()

print(metrics.classification_report(y_test_s, y_pred_test, target_names=["Antony","Cleopatra","Queen_Margaret"]))