# Clase 1: Del texto al vector.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

## Extracción de datos

In [None]:
# Fetch the training split with no category filter, stripping headers, footers
# and quoted replies from each post.
data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [None]:
# Show the category names available in the loaded dataset.
data.target_names

In [None]:
# Load the data: training split restricted to two newsgroups, with headers,
# footers and quoted replies stripped from each post.

categories = ['alt.atheism', 'sci.space']
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# Raw documents, per-document class ids, and the class names for those ids.
texts, labels, target_names = data.data, data.target, data.target_names

In [None]:
# Plot general para los datos
def plot_representation(data_2d, labels, title):
    """Scatter-plot a 2-D document representation, one series per class.

    Args:
        data_2d: Array-like with one row per document; columns 0 and 1 are
            used as the x and y coordinates.
        labels: Class index of each row of ``data_2d``. Index ``i`` is named
            by the module-level ``target_names[i]``.
        title: Name of the representation; appended to the figure title.
    """
    points = np.asarray(data_2d)
    # Coerce so that the comparison below is element-wise even for plain lists.
    labels = np.asarray(labels)

    plt.figure(figsize=(10, 7))
    # Iterate over target_names directly: it is the sequence the label indices
    # refer to, so the legend cannot drift out of sync with another global.
    for label_idx, name in enumerate(target_names):
        mask = labels == label_idx
        plt.scatter(points[mask, 0], points[mask, 1],
                    label=name, alpha=0.7, edgecolors='w')

    plt.title(f"Representación: {title}", fontsize=14)
    plt.xlabel("Dimensión Latente 1")
    plt.ylabel("Dimensión Latente 2")
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

## Bag of Words.

In [None]:
# Bag of words: learn the vocabulary from the corpus (English stop words
# removed) and build the document-term count matrix.
bow_vectorizer = CountVectorizer(stop_words='english')
bow_matrix = bow_vectorizer.fit_transform(texts)

In [None]:
# Dimensions of the document-term count matrix.
bow_matrix.shape

In [None]:
# Reduce the count matrix to two SVD components for plotting.
# random_state fixes the randomized solver so the figure is reproducible.
bow_2d = TruncatedSVD(n_components=2, random_state=42).fit_transform(bow_matrix)
bow_2d.shape

In [None]:
# Visualise the two-component projection of the bag-of-words matrix.
plot_representation(bow_2d, labels, "Bag of words")

## TF-IDF

In [None]:
# TF-IDF: same vocabulary construction as above (English stop words removed),
# but each document-term entry is a TF-IDF weight.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

In [None]:
# Dimensions of the TF-IDF document-term matrix.
tfidf_matrix.shape

In [None]:
# Reduce the TF-IDF matrix to two SVD components for plotting.
# random_state fixes the randomized solver so the figure is reproducible.
tfidf_2d = TruncatedSVD(n_components=2, random_state=42).fit_transform(tfidf_matrix)
tfidf_2d

In [None]:
# plot_representation already prefixes the title, so pass only the method name.
plot_representation(tfidf_2d, labels, "TF-IDF")

## LSA

In [None]:
# LSA is essentially what was done above: a truncated SVD of the TF-IDF
# matrix. Here 100 latent components are kept first and then reduced to two
# for plotting. random_state fixes the randomized solver so the result is
# reproducible.
lsa_model = TruncatedSVD(n_components=100, random_state=42)
lsa_low_dim = lsa_model.fit_transform(tfidf_matrix)
lsa_2d = TruncatedSVD(n_components=2, random_state=42).fit_transform(lsa_low_dim)

In [None]:
# Title the figure with the representation actually being plotted (LSA);
# plot_representation adds the title prefix itself.
plot_representation(lsa_2d, labels, "LSA")

## W2V

In [None]:
# Tokenize the texts.
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it
# up under the name 'punkt_tab', older ones under 'punkt', so fetch both; an
# unknown resource name only reports an error and does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')
tokenized_texts = [word_tokenize(doc.lower()) for doc in texts]

In [None]:
# Inspect the tokens of the first document (original casing, not lower-cased).
word_tokenize(texts[0])

In [None]:
def get_doc_vector(w2v_model, tokens):
    """Return the mean word vector of a tokenized document.

    Args:
        w2v_model: Trained model whose ``wv`` attribute supports ``in``,
            indexing by word, and exposes ``vector_size``.
        tokens: Iterable of word tokens for one document.

    Returns:
        1-D array of length ``w2v_model.wv.vector_size``: the average of the
        vectors of the tokens present in the vocabulary, or all zeros when no
        token is in the vocabulary.
    """
    keyed_vectors = w2v_model.wv
    vecs = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if vecs:
        return np.mean(vecs, axis=0)
    # Size the fallback from the model instead of a hard-coded 512 so every
    # document vector has the same length whatever vector_size was trained.
    return np.zeros(keyed_vectors.vector_size)

In [None]:
# Train Word2Vec on the tokenized corpus: 512-dimensional vectors, context
# window of 5, words seen fewer than 2 times are dropped, 4 worker threads.
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=512, window=5, min_count=2, workers=4)

In [None]:
# Stack one document vector per tokenized text into a 2-D array.
w2v_matrix = np.array(
    [get_doc_vector(w2v_model, doc_tokens) for doc_tokens in tokenized_texts]
)
w2v_matrix.shape

In [None]:
# Reduce the document vectors to two SVD components for plotting.
# random_state fixes the randomized solver so the figure is reproducible.
w2v_2d = TruncatedSVD(n_components=2, random_state=42).fit_transform(w2v_matrix)
w2v_2d

In [None]:
# plot_representation already prefixes the title, so pass only the method name.
plot_representation(w2v_2d, labels, "W2V")