In [10]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score


In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [24]:
# vectorizacion y split
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target


## 1. Vectorizar documentos.

In [4]:
# pick 5 
np.random.seed(42)  
random_idxs = np.random.choice(X_train.shape[0], size=5, replace=False)

In [5]:
for idx in random_idxs:
    print("="*80)
    print(f"DOCUMENTO {idx:10} \t(CLASE: {newsgroups_train.target_names[y_train[idx]]})")
    print("-"*80)

    # similaridad con el resto de los documentos
    cossim = cosine_similarity(X_train[idx], X_train)[0]
    mostsim = np.argsort(cossim)[::-1][1:6]  # skip itself [0]

    for i in mostsim:
        print(f"  -> SIMILAR {i:10} | CLASE: {newsgroups_train.target_names[y_train[i]]:30} | sim={cossim[i]:.3f}")

DOCUMENTO       7492 	(CLASE: comp.sys.mac.hardware)
--------------------------------------------------------------------------------
  -> SIMILAR      10935 | CLASE: comp.sys.mac.hardware          | sim=0.667
  -> SIMILAR       7258 | CLASE: comp.sys.ibm.pc.hardware       | sim=0.348
  -> SIMILAR       4971 | CLASE: comp.sys.mac.hardware          | sim=0.180
  -> SIMILAR       4303 | CLASE: misc.forsale                   | sim=0.155
  -> SIMILAR        645 | CLASE: comp.sys.mac.hardware          | sim=0.141
DOCUMENTO       3546 	(CLASE: comp.os.ms-windows.misc)
--------------------------------------------------------------------------------
  -> SIMILAR       5665 | CLASE: comp.sys.ibm.pc.hardware       | sim=0.204
  -> SIMILAR       2011 | CLASE: comp.sys.ibm.pc.hardware       | sim=0.192
  -> SIMILAR       8643 | CLASE: comp.sys.ibm.pc.hardware       | sim=0.172
  -> SIMILAR       1546 | CLASE: comp.sys.ibm.pc.hardware       | sim=0.171
  -> SIMILAR       8765 | CLASE: comp.sys.ibm.

In [6]:
# algunos ejemplos
example_idx = random_idxs[0]
print("="*80)
print(f"DOCUMENTO ORIGINAL {example_idx} (CLASE: {newsgroups_train.target_names[y_train[example_idx]]})")
print(newsgroups_train.data[example_idx][:500], "...\n")  

cossim = cosine_similarity(X_train[example_idx], X_train)[0]
mostsim = np.argsort(cossim)[::-1][1:6]

for i in mostsim[:2]:  
    print(f"DOCUMENTO SIMILAR {i} (CLASE: {newsgroups_train.target_names[y_train[i]]}, sim={cossim[i]:.3f})")
    print(newsgroups_train.data[i][:500], "...\n")
print("="*80)

DOCUMENTO ORIGINAL 7492 (CLASE: comp.sys.mac.hardware)
Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward"  ...

DOCUMENTO SIMILAR 10935 (CLASE: comp.sys.mac.hardware, sim=0.667)
Hey everybody:

   I want to buy a mac and I want to get a good price...who doesn't?  So,
could anyone out there who has found a really good deal on a Centris 650
send me the price.  I don't want to know where, unless it is mail order or
areound cleveland, Ohio.  Also, should I buy now or wait for the Power PC.

Thanks.
BoB
reply via post or e-mail at rrn@po.cwru.edu
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to bangi ...

DOCUMENTO SIMILAR 7258 (CLASE: comp.sys.ibm.pc.hardware, sim=0.348)
Hay all:

    Has 

## 2. Construir un modelo de clasificación por prototipos

In [25]:
X_test  = tfidfvect.transform(newsgroups_test.data)
y_test  = newsgroups_test.target
sim_matrix = cosine_similarity(X_test, X_train)

In [26]:
nearest_train_idx = np.argmax(sim_matrix, axis=1)
y_pred_pre = y_train[nearest_train_idx]

In [9]:
f1_macro = f1_score(y_test, y_pred_pre, average="macro")
print(f"Macro F1: {f1_macro:.4f}")

Macro F1: 0.5050


## 3. Entrenar modelos de clasificación Naïve Bayes

In [81]:
# multinomial
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_test  = tfidfvect.transform(newsgroups_test.data)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
# complement
cnb = ComplementNB()
cnb.fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)

In [82]:
print(f"MNB  Macro F1: {f1_score(y_test, y_pred_mnb, average='macro'):.3f}")
print(f"CNB  Macro F1: {f1_score(y_test, y_pred_cnb, average='macro'):.3f}")

MNB  Macro F1: 0.585
CNB  Macro F1: 0.693


Sin ningun tipo de _tuneo_ los calsificadores de naive bayes obtienne f1 scores mas altos que la clasificacion por prototipos

In [83]:
tfidfvect = TfidfVectorizer(stop_words="english")
X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_test  = tfidfvect.transform(newsgroups_test.data)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
cnb = ComplementNB()
cnb.fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
print(f"MNB  Macro F1: {f1_score(y_test, y_pred_mnb, average='macro'):.3f}")
print(f"CNB  Macro F1: {f1_score(y_test, y_pred_cnb, average='macro'):.3f}")

MNB  Macro F1: 0.647
CNB  Macro F1: 0.694


vemos que agregando stop_words el score para MNB sube considerablemente:
    
    stop_words{‘english’}, list, default=None

    If a string, it is passed to _check_stop_list and the appropriate stop list is returned. ‘english’ is currently the only supported string value. There are several known issues with ‘english’ and you should consider an alternative (see Using stop words).

    If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if analyzer == 'word'.

    If None, no stop words will be used. In this case, setting max_df to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.

le hacemos caso y probamos con None y max_df:


In [84]:
max_dfs = [0.1, 0.2, 0.5, 0.7, 1.0]
for max_df in max_dfs:
    tfidfvect = TfidfVectorizer(max_df=max_df)
    X_train = tfidfvect.fit_transform(newsgroups_train.data)
    X_test  = tfidfvect.transform(newsgroups_test.data)
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    y_pred_mnb = mnb.predict(X_test)
    cnb = ComplementNB()
    cnb.fit(X_train, y_train)
    y_pred_cnb = cnb.predict(X_test)
    print(f"MAX DF: {max_df}")
    print(f"\tMNB  Macro F1: {f1_score(y_test, y_pred_mnb, average='macro'):.3f}")
    print(f"\tCNB  Macro F1: {f1_score(y_test, y_pred_cnb, average='macro'):.3f}")

MAX DF: 0.1
	MNB  Macro F1: 0.643
	CNB  Macro F1: 0.695
MAX DF: 0.2
	MNB  Macro F1: 0.627
	CNB  Macro F1: 0.694
MAX DF: 0.5
	MNB  Macro F1: 0.601
	CNB  Macro F1: 0.693
MAX DF: 0.7
	MNB  Macro F1: 0.591
	CNB  Macro F1: 0.692
MAX DF: 1.0
	MNB  Macro F1: 0.585
	CNB  Macro F1: 0.693


Un valor bajo de `max_df` mejora el score, con 0.1 el f1 para mnb es similar a usar `stop_words='english'`

Ademas podemos usar `ngram_range` para que el algoritmo considere n-gramas

In [92]:
tfidfvect = TfidfVectorizer(stop_words="english", max_df=0.1, ngram_range=(1,2))
X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_test  = tfidfvect.transform(newsgroups_test.data)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
cnb = ComplementNB()
cnb.fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
print(f"MNB  Macro F1: {f1_score(y_test, y_pred_mnb, average='macro'):.2f}")
print(f"CNB  Macro F1: {f1_score(y_test, y_pred_cnb, average='macro'):.2f}")

MNB  Macro F1: 0.65
CNB  Macro F1: 0.70


## 4. Transponer la matriz documento-término.

In [123]:
tfidfvect = TfidfVectorizer(stop_words="english", max_df=0.1) # saco los 2-gramas

X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_dag = X_train.T

In [124]:
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}
idx2word

{96859: 'wondering',
 37254: 'enlighten',
 25717: 'car',
 80409: 'saw',
 31927: 'day',
 34739: 'door',
 84300: 'sports',
 57241: 'looked',
 55601: 'late',
 9843: '60s',
 35900: 'early',
 11174: '70s',
 25437: 'called',
 24108: 'bricklin',
 34740: 'doors',
 76259: 'really',
 83196: 'small',
 16806: 'addition',
 24583: 'bumper',
 81438: 'separate',
 77666: 'rest',
 23430: 'body',
 87901: 'tellme',
 62587: 'model',
 37206: 'engine',
 84038: 'specs',
 99588: 'years',
 73164: 'production',
 46687: 'history',
 49797: 'info',
 41872: 'funky',
 57244: 'looking',
 59065: 'mail',
 39294: 'fair',
 66671: 'number',
 23973: 'brave',
 83767: 'souls',
 92374: 'upgraded',
 82325: 'si',
 27889: 'clock',
 68510: 'oscillator',
 81836: 'shared',
 38635: 'experiences',
 72029: 'poll',
 81366: 'send',
 24125: 'brief',
 60916: 'message',
 33127: 'detailing',
 73112: 'procedure',
 84076: 'speed',
 20236: 'attained',
 30233: 'cpu',
 75894: 'rated',
 16791: 'add',
 25769: 'cards',
 16776: 'adapters',
 45994: 'h

In [125]:
words = ['cpu', 'computer', 'religion', 'history', 'space']
for word in words:
  
    word_idx = tfidfvect.vocabulary_[word]
    sims = cosine_similarity(X_dag[word_idx], X_dag)[0]
    mostsim = np.argsort(sims)[::-1][1:6]

    print("="*80)
    print(f"WORD: '{word}'")
    for i in mostsim:
        print(f"  -> {idx2word[i]} (sim={sims[i]:.3f})")

WORD: 'cpu'
  -> dislodge (sim=0.362)
  -> heatsink (sim=0.316)
  -> cooler (sim=0.297)
  -> tasking (sim=0.260)
  -> bending (sim=0.256)
WORD: 'computer'
  -> decwriter (sim=0.159)
  -> harkens (sim=0.152)
  -> deluged (sim=0.152)
  -> shopper (sim=0.143)
  -> delicate (sim=0.136)
WORD: 'religion'
  -> religious (sim=0.265)
  -> religions (sim=0.227)
  -> purpsoe (sim=0.209)
  -> crusades (sim=0.204)
  -> categorized (sim=0.199)
WORD: 'history'
  -> inalcik (sim=0.287)
  -> hurewitz (sim=0.285)
  -> chester (sim=0.285)
  -> nubar (sim=0.285)
  -> nikolayef (sim=0.285)
WORD: 'space'
  -> nasa (sim=0.327)
  -> shuttle (sim=0.289)
  -> seds (sim=0.283)
  -> enfant (sim=0.268)
  -> exploration (sim=0.238)


Veamos ahora que pasa si consideramos 2-gramas:

In [126]:
tfidfvect = TfidfVectorizer(stop_words="english", max_df=0.1, ngram_range=(1,2)) # saco los 2-gramas

X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_dag = X_train.T
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}
words = ['cpu', 'computer', 'religion', 'history', 'space']
for word in words:
  
    word_idx = tfidfvect.vocabulary_[word]
    sims = cosine_similarity(X_dag[word_idx], X_dag)[0]
    mostsim = np.argsort(sims)[::-1][1:6]

    print("="*80)
    print(f"WORD: '{word}'")
    for i in mostsim:
        print(f"  -> {idx2word[i]} (sim={sims[i]:.3f})")

WORD: 'cpu'
  -> problem ensure (sim=0.426)
  -> vertical case (sim=0.426)
  -> weight cooler (sim=0.426)
  -> weight cpu (sim=0.426)
  -> cooler chip (sim=0.426)
WORD: 'computer'
  -> computer science (sim=0.235)
  -> computer graphics (sim=0.234)
  -> turn computer (sim=0.206)
  -> everytime turn (sim=0.178)
  -> time fix (sim=0.178)
WORD: 'religion'
  -> say religious (sim=0.316)
  -> religion does (sim=0.289)
  -> religious (sim=0.281)
  -> use religion (sim=0.265)
  -> freedom religion (sim=0.259)
WORD: 'history'
  -> turkish studies (sim=0.350)
  -> inalcik (sim=0.349)
  -> ottoman history (sim=0.349)
  -> professor history (sim=0.346)
  -> michigan list (sim=0.346)
WORD: 'space'
  -> nasa (sim=0.370)
  -> space shuttle (sim=0.356)
  -> seds (sim=0.353)
  -> space station (sim=0.342)
  -> shuttle (sim=0.340)


podemos tunear un poco los parametros del vectorizador...

In [128]:
tfidfvect = TfidfVectorizer(stop_words="english", max_df=0.3, min_df=5, ngram_range=(1,2)) # saco los 2-gramas

X_train = tfidfvect.fit_transform(newsgroups_train.data)
X_dag = X_train.T
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}
words = ['cpu', 'computer', 'religion', 'history', 'space']
for word in words:
  
    word_idx = tfidfvect.vocabulary_[word]
    sims = cosine_similarity(X_dag[word_idx], X_dag)[0]
    mostsim = np.argsort(sims)[::-1][1:6]

    print("="*80)
    print(f"WORD: '{word}'")
    for i in mostsim:
        print(f"  -> {idx2word[i]} (sim={sims[i]:.3f})")

WORD: 'cpu'
  -> cooler (sim=0.289)
  -> bending (sim=0.246)
  -> alain (sim=0.240)
  -> multi tasking (sim=0.238)
  -> tasking (sim=0.238)
WORD: 'computer'
  -> computer science (sim=0.232)
  -> computer graphics (sim=0.223)
  -> turn computer (sim=0.195)
  -> use computer (sim=0.174)
  -> new computer (sim=0.170)
WORD: 'religion'
  -> religious (sim=0.253)
  -> religion does (sim=0.250)
  -> freedom religion (sim=0.230)
  -> religions (sim=0.228)
  -> categorized (sim=0.202)
WORD: 'history'
  -> sic (sim=0.280)
  -> associate professor (sim=0.264)
  -> university chicago (sim=0.242)
  -> stanford shaw (sim=0.229)
  -> john dewey (sim=0.224)
WORD: 'space'
  -> space station (sim=0.335)
  -> nasa (sim=0.321)
  -> space shuttle (sim=0.308)
  -> sci space (sim=0.300)
  -> space exploration (sim=0.293)


de esta forma aparecen formaciones con "sentido" como

- multi tasking
- turn/use/new computer
- associate professor
- spae station
- space exploration