In [1]:
import numpy as np
import scipy
from scipy.io import savemat

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Restrict the corpus to the four "rec.*" groups: two vehicle groups and
# two sports groups.
cats = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

In [3]:
# Load the training split for the selected groups, using ./newsgroups as the
# dataset's data_home folder. Headers, footers and quoted replies are
# stripped so that only the body text of each post is vectorized.
newsgroups_train = fetch_20newsgroups(
    data_home='./newsgroups',
    subset='train',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Number of posts returned for the training subset of the selected groups.
len(newsgroups_train.data)


2389

In [5]:
# Build TF-IDF features, filtering with scikit-learn's built-in English
# stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the training posts and vectorize them in one pass.
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Display the matrix dimensions: (number of posts, number of terms).
vectors.shape

(2389, 21238)

In [6]:
# Check which container class fit_transform returned before exporting it.
type(vectors)

scipy.sparse._csr.csr_matrix

In [7]:
# Inspect the stop-word set the vectorizer is actually using.
# NOTE(review): this displays the entire set; showing a short slice such as
# sorted(vectorizer.get_stop_words())[:20] would keep the notebook readable.
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [8]:
# Feature names learned by the vectorizer; kept so they can be exported
# alongside the matrix.
terms = vectorizer.get_feature_names_out()

In [9]:
# Number of feature names; expected to equal the column count of `vectors`.
len(terms)

21238

In [10]:
# Inspect the learned mapping from each term to its feature index.
# NOTE(review): this displays every entry of the mapping; consider showing
# only a handful of items to keep the output short.
vectorizer.vocabulary_

{'friend': 8599,
 'brought': 4122,
 'subaru': 18470,
 'svx': 18703,
 'recently': 15758,
 'drove': 7132,
 'couples': 5785,
 'times': 19251,
 'think': 19117,
 'great': 9163,
 'car': 4503,
 'esp': 7652,
 'snow': 17737,
 'took': 19336,
 'local': 11842,
 'dealer': 6223,
 'oil': 13799,
 'change': 4751,
 'came': 4419,
 '80': 1679,
 'dollars': 6969,
 'told': 19310,
 'filter': 8205,
 'necessary': 13304,
 'disassemble': 6768,
 'metal': 12628,
 'cover': 5802,
 'engine': 7523,
 'hour': 9932,
 'labour': 11353,
 'ripped': 16359,
 'phone': 14587,
 'toronto': 19356,
 'charging': 4777,
 'roughly': 16542,
 'price': 15111,
 'owner': 14118,
 'problem': 15161,
 'story': 18339,
 'true': 19594,
 'engineer': 7524,
 'looks': 11900,
 'pretty': 15095,
 'stubid': 18432,
 'way': 20569,
 'scoring': 16926,
 'stats': 18180,
 'swedish': 18722,
 'nhl': 13420,
 'players': 14753,
 'april': 2797,
 'mats': 12370,
 'sundin': 18591,
 'watch': 20543,
 'points': 14815,
 'season': 17003,
 '131': 435,
 'kent': 11077,
 'nilsson':

In [11]:
# Export the TF-IDF matrix and its feature names to a MATLAB .mat file.
# 'X' holds the document-term matrix and 'terms' holds the feature names.
mat_payload = {
    'X': vectors,
    'terms': terms,
}
savemat("newsgroups.mat", mat_payload)