### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

## Carga de datos

In [None]:
# Load the data; the dataset is requested as separate 'train' and 'test' subsets.
# `remove` asks the loader to strip headers, footers and quoted text.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Vectorización

In [None]:
# Instantiate a vectorizer with its default settings.
# See the sklearn documentation for the available constructor parameters.
tfidfvect = TfidfVectorizer()

In [None]:
# The raw text of each document is available through the `data` attribute
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [None]:
# Using the usual sklearn interface we fit the vectorizer
# (learn the vocabulary and compute the IDF vector)
# and transform the training texts in a single call
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` is what we call the document-term matrix

In [None]:
# Count-based vectorizations are sparse, so sklearn conveniently returns
# the document vectors as a sparse matrix.
# Rows are documents and columns are vocabulary terms.
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'cantidad de documentos: {X_train.shape[0]}')
print(f'tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
cantidad de documentos: 11314
tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [None]:
# Once the vectorizer is fitted we can access attributes such as the learned
# vocabulary: a dictionary that maps terms to indices.
# The index is the term's position in the document vector.
tfidfvect.vocabulary_['car']

25775

In [None]:
# The inverse mapping (index -> term) is handy for reading vector positions back as words
idx2word = {index: term for term, index in tfidfvect.vocabulary_.items()}

In [None]:
# `y_train` holds the targets, which are integers
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [None]:
# There are 20 classes, one for each of the 20 newsgroups
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [None]:
# Let's look at document similarity, starting from an arbitrary training document
idx = 7492
print(newsgroups_train.data[idx])

Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward" 


In [None]:
# Measure the cosine similarity between that document and every training document.
# cosine_similarity returns a 2-D result; [0] keeps the single row for the query.
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [None]:
# Similarity values sorted from highest to lowest
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [None]:
# ...and the indices of the documents they correspond to
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [None]:
# The 5 most similar documents. The query document is filtered out by index
# instead of dropping rank 0: when similarities tie (e.g. duplicated posts)
# argsort does not guarantee that the query itself is ranked first.
mostsim = np.argsort(cossim)[::-1]
mostsim = mostsim[mostsim != idx][:5]

In [None]:
# The original document belongs to this class:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [None]:
# ...and these are the classes of its 5 most similar documents:
for label_id in y_train[mostsim]:
    print(newsgroups_train.target_names[label_id])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [None]:
# Instantiating and training a Naive Bayes classifier with sklearn takes two lines
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# With the vectorizer already fitted on train, vectorize the test texts
# (transform only, no refit) and predict their classes
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [None]:
# The F1-score is a suitable metric for reporting classification performance
# and is robust to class imbalance. 'macro' averaging is the mean of the
# per-class F1-scores. 'micro' averaging is equivalent to accuracy, which is
# not a good metric when the dataset is imbalanced
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

### Consigna del desafío 1

**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno y analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

**2**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

**3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares.


Consigna 1 - Vectorizar documentos:

In [None]:
# Load the data
# NOTE(review): this repeats the loading and fitting already done in the
# earlier cells of the notebook; it makes this section self-contained.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Instantiate and fit the TF-IDF vectorizer on the training texts
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target



In [None]:
# Pick 5 distinct training documents at random (row indices of X_train).
# The seed is fixed so the same documents are drawn on every run.
np.random.seed(42)
random_indices = np.random.choice(X_train.shape[0], 5, replace=False)
random_indices

array([7492, 3546, 5582, 4793, 3813])

In [None]:
# Show the first sampled document; the index comes from `random_indices`
# instead of a hard-coded literal so it stays correct if the seed changes
print(newsgroups_train.data[random_indices[0]])

Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward" 


In [None]:
# Cosine similarity between the first sampled document and every training document
cossim = cosine_similarity(X_train[random_indices[0]], X_train)[0]
# Top 5 neighbours. The query is removed by index rather than by skipping
# rank 0, because ties do not guarantee the query is ranked first.
mostsim = np.argsort(cossim)[::-1]
mostsim = mostsim[mostsim != random_indices[0]][:5]
mostsim

array([10935,  7258,  4971,  4303,   645])

In [None]:
# Print an excerpt of each of the most similar documents.
# The loop variable is deliberately not named `idx`, so the notebook-level
# `idx` (the query document chosen earlier) is not overwritten.
for sim_idx in mostsim:
    print(f"\n-------------- Documento Similar {sim_idx}:")
    print(newsgroups_train.data[sim_idx][:500])  # first 500 characters only



-------------- Documento Similar 10935:
Hey everybody:

   I want to buy a mac and I want to get a good price...who doesn't?  So,
could anyone out there who has found a really good deal on a Centris 650
send me the price.  I don't want to know where, unless it is mail order or
areound cleveland, Ohio.  Also, should I buy now or wait for the Power PC.

Thanks.
BoB
reply via post or e-mail at rrn@po.cwru.edu
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to bangi

-------------- Documento Similar 7258:
Hay all:

    Has anyone out there heard of any performance stats on the fabled p24t.
 I was wondering what it's performance compared to the 486/66 and/or
pentium would be.  Any info would be helpful.

Later
BoB
-- 
Robert Novitskey | rrn@po.cwru.edu | (216)754-2134 | CWRU Cleve. Ohio
----------------------------------------------------------------------
COMPUTER ENGINEER AND C PROGRAMMER |  NOW SEEKING SUMMER JOBS

In [None]:
# Measure similarity and analyse: for every sampled document print its class
# and the classes of its 5 most similar training documents.
for idx in random_indices:
    print(f"\n------------------- Documento {idx}:")
    print(f"Clase: {newsgroups_train.target_names[y_train[idx]]}")

    cossim = cosine_similarity(X_train[idx], X_train)[0]
    # Remove the query by index rather than by skipping rank 0: with tied
    # similarities argsort does not guarantee the query is ranked first.
    mostsim = np.argsort(cossim)[::-1]
    mostsim = mostsim[mostsim != idx][:5]

    for i in mostsim:
        print(f"\nDocumento similar {i}:")
        print(f"Clase: {newsgroups_train.target_names[y_train[i]]}")



------------------- Documento 7492:
Clase: comp.sys.mac.hardware

Documento similar 10935:
Clase: comp.sys.mac.hardware

Documento similar 7258:
Clase: comp.sys.ibm.pc.hardware

Documento similar 4971:
Clase: comp.sys.mac.hardware

Documento similar 4303:
Clase: misc.forsale

Documento similar 645:
Clase: comp.sys.mac.hardware

------------------- Documento 3546:
Clase: comp.os.ms-windows.misc

Documento similar 5665:
Clase: comp.sys.ibm.pc.hardware

Documento similar 2011:
Clase: comp.sys.ibm.pc.hardware

Documento similar 8643:
Clase: comp.sys.ibm.pc.hardware

Documento similar 1546:
Clase: comp.sys.ibm.pc.hardware

Documento similar 8765:
Clase: comp.sys.ibm.pc.hardware

------------------- Documento 5582:
Clase: misc.forsale

Documento similar 5510:
Clase: misc.forsale

Documento similar 4922:
Clase: misc.forsale

Documento similar 4347:
Clase: comp.graphics

Documento similar 8057:
Clase: misc.forsale

Documento similar 4028:
Clase: comp.graphics

------------------- Documento 47

Podemos ver en varios casos que las subcategorías tienen alguna relación entre sí.

In [None]:
# Measure similarity and analyse: for every sampled document print an excerpt
# and its class, followed by the same information for its 5 nearest neighbours.
for idx in random_indices:
    print(f"\n--------------------------------- Documento {idx}:")
    print(newsgroups_train.data[idx][:500])  # excerpt of the query document
    print(f"Clase: {newsgroups_train.target_names[y_train[idx]]}")

    cossim = cosine_similarity(X_train[idx], X_train)[0]
    # Remove the query by index rather than by skipping rank 0: with tied
    # similarities argsort does not guarantee the query is ranked first.
    mostsim = np.argsort(cossim)[::-1]
    mostsim = mostsim[mostsim != idx][:5]

    for i in mostsim:
        print(f"\n..............Documento similar {i}:")
        print(newsgroups_train.data[i][:500])  # excerpt of the similar document
        print(f"Clase: {newsgroups_train.target_names[y_train[i]]}")



--------------------------------- Documento 7492:
Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward" 
Clase: comp.sys.mac.hardware

..............Documento similar 10935:
Hey everybody:

   I want to buy a mac and I want to get a good price...who doesn't?  So,
could anyone out there who has found a really good deal on a Centris 650
send me the price.  I don't want to know where, unless it is mail order or
areound cleveland, Ohio.  Also, should I buy now or wait for the Power PC.

Thanks.
BoB
reply via post or e-mail at rrn@po.cwru.edu
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to bangi
Clase: comp.sys.mac.hardware

..............Documento similar 7258:
Hay all:

    Has anyone out 

Respecto a los textos analizados podemos reconocer ciertos patrones:

caso 1: consultas respecto a recomendaciones de hardware relacionadas a un mismo remitente.

caso 2: consultas sobre hardware DMA

caso 3: ventas de hardware

caso 4: cuestiones relacionadas con política

caso 5: cuestiones relacionadas con religión

Consigna 2 - Entrenar modelos:

In [None]:
# Vectorizer with non-default document-frequency cut-offs.
# NOTE: this rebinds the notebook-level `tfidfvect`, `X_train` and `X_test`,
# so every later cell sees the matrices produced here.
tfidfvect = TfidfVectorizer(max_df=0.99, min_df=2, analyzer='word')
y_train, y_test = newsgroups_train.target, newsgroups_test.target
X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_test = tfidfvect.transform(newsgroups_test.data)

# MultinomialNB: train, predict on test and report macro F1
clf_multinomial = MultinomialNB().fit(X_train, y_train)
y_pred_multinomial = clf_multinomial.predict(X_test)
f1_multinomial = f1_score(y_test, y_pred_multinomial, average='macro')
print(f'F1-score MultinomialNB (macro): {f1_multinomial}')

# ComplementNB: same procedure on the same matrices
clf_complement = ComplementNB().fit(X_train, y_train)
y_pred_complement = clf_complement.predict(X_test)
f1_complement = f1_score(y_test, y_pred_complement, average='macro')
print(f'F1-score ComplementNB (macro): {f1_complement}')

F1-score MultinomialNB (macro): 0.5970494639319617
F1-score ComplementNB (macro): 0.6934824308370665


In [None]:
# Candidate vectorizer configurations, from the library defaults to
# progressively stricter document-frequency cut-offs with English stop words.
# (The list used to be defined twice in this cell; one definition is enough.)
vectorizer_params = [
    {'max_df': 1.0, 'min_df': 1, 'stop_words': None},
    {'max_df': 0.95, 'min_df': 2, 'stop_words': 'english'},
    {'max_df': 0.9, 'min_df': 5, 'stop_words': 'english'},
    {'max_df': 0.85, 'min_df': 10, 'stop_words': 'english'}
]

# Raw texts and targets that `evaluate_model` reads from the notebook namespace
X_train_data = newsgroups_train.data
y_train = newsgroups_train.target
X_test_data = newsgroups_test.data
y_test = newsgroups_test.target


def evaluate_model(vectorizer_params, model_params=None):
    """Compare MultinomialNB and ComplementNB over several TF-IDF configurations.

    For every configuration a fresh ``TfidfVectorizer`` is fitted on the
    training texts, both classifiers are trained on the resulting matrix and
    their macro F1-score on the test set is printed.

    The data is read from the notebook-level names ``X_train_data``,
    ``y_train``, ``X_test_data`` and ``y_test``, which must exist before the
    function is called.

    Args:
        vectorizer_params: iterable of dicts; each dict is unpacked as the
            keyword arguments of ``TfidfVectorizer``.
        model_params: optional mapping of keyword arguments forwarded to the
            constructor of both classifiers (for example ``{'alpha': 0.1}``),
            so it must only contain arguments both classes accept. ``None``
            or an empty mapping keeps the library defaults.

    Returns:
        Tuple ``(best_f1_multinomial, best_params_multinomial,
        best_f1_complement, best_params_complement)``. A ``best_params``
        entry stays ``None`` when no configuration scored above 0.
    """
    model_kwargs = {} if model_params is None else dict(model_params)
    # Evaluated in this order so the printed report keeps its layout.
    models = (('MultinomialNB', MultinomialNB), ('ComplementNB', ComplementNB))
    best_f1 = {name: 0 for name, _ in models}
    best_params = {name: None for name, _ in models}

    for params in vectorizer_params:
        print(f"Evaluando configuración del vectorizador: {params}")
        vectorizer = TfidfVectorizer(**params)
        # Local names on purpose: the notebook-level X_train / X_test are untouched.
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)

        for name, model_cls in models:
            clf = model_cls(**model_kwargs)
            clf.fit(X_train, y_train)
            score = f1_score(y_test, clf.predict(X_test), average='macro')
            print(f'F1-score {name} (macro): {score}')
            # Strict comparison: on ties the first configuration seen is kept.
            if score > best_f1[name]:
                best_f1[name] = score
                best_params[name] = params

    return (best_f1['MultinomialNB'], best_params['MultinomialNB'],
            best_f1['ComplementNB'], best_params['ComplementNB'])

# Evaluate both classifiers over every vectorizer configuration.
# `model_params` is not defined in any visible cell, so this call raised
# NameError on a fresh kernel. An empty dict leaves the classifiers at their defaults.
model_params = {}

best_f1_multinomial, best_params_multinomial, best_f1_complement, best_params_complement = evaluate_model(vectorizer_params, model_params)

print(f"\nMejor F1-score MultinomialNB (macro): {best_f1_multinomial} con parámetros: {best_params_multinomial}")
print(f"Mejor F1-score ComplementNB (macro): {best_f1_complement} con parámetros: {best_params_complement}")


Evaluando configuración del vectorizador: {'max_df': 1.0, 'min_df': 1, 'stop_words': None}
F1-score MultinomialNB (macro): 0.5854345727938506
F1-score ComplementNB (macro): 0.692953349950875
Evaluando configuración del vectorizador: {'max_df': 0.95, 'min_df': 2, 'stop_words': 'english'}
F1-score MultinomialNB (macro): 0.6511573382063232
F1-score ComplementNB (macro): 0.6942920490839366
Evaluando configuración del vectorizador: {'max_df': 0.9, 'min_df': 5, 'stop_words': 'english'}
F1-score MultinomialNB (macro): 0.6503653727588931
F1-score ComplementNB (macro): 0.6822855200322546
Evaluando configuración del vectorizador: {'max_df': 0.85, 'min_df': 10, 'stop_words': 'english'}
F1-score MultinomialNB (macro): 0.64762143634484
F1-score ComplementNB (macro): 0.6642153476904588

Mejor F1-score MultinomialNB (macro): 0.6511573382063232 con parámetros: {'max_df': 0.95, 'min_df': 2, 'stop_words': 'english'}
Mejor F1-score ComplementNB (macro): 0.6942920490839366 con parámetros: {'max_df': 0.95,

Consigna 3 - Transponer matriz documento-término

In [None]:
# Transpose the document-term matrix to obtain a term-document matrix.
# NOTE(review): `X_train` is whatever the most recent notebook-level
# `tfidfvect.fit_transform` call produced — confirm it matches the
# `tfidfvect.vocabulary_` used afterwards to map rows back to words.
X_train_T = X_train.T


In [None]:

# Pick 5 distinct rows of the transposed matrix (i.e. 5 terms) at random.
# The seed is fixed so the same terms are drawn on every run.
np.random.seed(21)
random_words_indices = np.random.choice(X_train_T.shape[0], 5, replace=False)
# Inverse vocabulary (index -> term), used to turn row indices back into words
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}


In [None]:
# Measure similarity and analyse: for every sampled term print the 5 terms
# whose rows in the term-document matrix are closest by cosine similarity.
for idx in random_words_indices:
    word = idx2word[idx]
    print(f"\nPalabra '{word}':")

    cossim = cosine_similarity(X_train_T[idx], X_train_T)[0]
    # Remove the query term by index rather than by skipping rank 0: terms
    # with identical rows tie at the top, and argsort does not guarantee the
    # query is ranked first.
    mostsim = np.argsort(cossim)[::-1]
    mostsim = mostsim[mostsim != idx][:5]

    for i in mostsim:
        similar_word = idx2word[i]
        print(f"Palabra similar: {similar_word}")



Palabra 'amber':
Palabra similar: vt100
Palabra similar: 1200
Palabra similar: corvettes
Palabra similar: screens
Palabra similar: _national

Palabra 'armchair':
Palabra similar: flowers
Palabra similar: costello
Palabra similar: theatre
Palabra similar: mccartney
Palabra similar: princeton

Palabra 'moore':
Palabra similar: cindy
Palabra similar: tittle
Palabra similar: kittys
Palabra similar: trumbull
Palabra similar: illitch

Palabra 'irregulars':
Palabra similar: blockaded
Palabra similar: troups
Palabra similar: deir
Palabra similar: snipers
Palabra similar: yassin

Palabra 'appearence':
Palabra similar: messes
Palabra similar: splits
Palabra similar: hellcats
Palabra similar: solicit
Palabra similar: breifly
