# Example of using scikit-learn [`TfidfVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [1]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
import pandas
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Download newsgroups dataset
Remove headers, footers, and quotes so that models train only on the content of each post (instead of overfitting on headers, etc.).

In [2]:
# Restrict the corpus to four newsgroups.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Both splits share the same category filter and the same metadata stripping;
# only the requested subset differs.
fetch_options = dict(categories=categories,
                     remove=('headers', 'footers', 'quotes'))
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

In [3]:
# Quick look at the training split: size, label names, one post, first labels.
train_posts = newsgroups_train.data
label_names = newsgroups_train.target_names

print('Number of news: %d\n' % len(train_posts))
print('Number of targets: %d\n' % len(label_names))
print('Target names: %s\n' % ', '.join(label_names))
print('Sample news: %s\n' % train_posts[0])
# pprint receives an already-formatted string, so it displays the string's repr
# (which is why this line appears wrapped in quotes in the output).
pprint('Sample targets: %s' % newsgroups_train.target[:10])

Number of news: 2034

Number of targets: 4

Target names: alt.atheism, comp.graphics, sci.space, talk.religion.misc

Sample news: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

'Sample targets: [1 3 2 0 2 0 2 1 2 1]'


### Vectorize with TfidfVectorizer

In [4]:
# Fit a TF-IDF vectorizer on the training posts and encode them as a
# document-term matrix (one row per post, one column per vocabulary term).
tfidf_vectorizer = TfidfVectorizer()
vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)
print('Shape: ', vectors.shape)
print('Dimensionality of vocabulary: %d' % vectors.shape[1])
# len() on the vocabulary mapping counts its terms directly; .keys() is not needed.
print('Size of vocabulary: %d' % len(tfidf_vectorizer.vocabulary_))
# Sparsity: stored (non-zero) entries relative to the full rows x columns grid.
nnz = vectors.nnz
n = vectors.shape[0] * vectors.shape[1]
print('Non-zero values: %d/%d (%.2f %%)' % (nnz, n, nnz / n * 100))
print('Average non-zero values per document: %.1f' % (nnz / vectors.shape[0]))
# First post's row, restricted to the first 10000 vocabulary columns.
print('Sample values:', vectors[0, :10000])

Shape:  (2034, 26879)
Dimensionality of vocabulary: 26879
Size of vocabulary: 26879
Non-zero values: 196700/54671886 (0.36 %)
Average non-zero values per document: 96.7
Sample values:   (0, 4030)	0.0662337497043734
  (0, 5604)	0.1373223718234628
  (0, 2408)	0.0697108745652997
  (0, 4326)	0.030618868877639698
  (0, 2427)	0.03931807830514708
  (0, 9935)	0.10024033927265849
  (0, 3397)	0.10759772335500424
  (0, 8620)	0.09565891146117879
  (0, 5220)	0.03282557123888892
  (0, 3254)	0.024133239174785947
  (0, 7761)	0.10778332049617564
  (0, 3607)	0.0653195128299407
  (0, 2853)	0.05796490248169242
  (0, 1152)	0.3399039986832978
  (0, 5443)	0.10024033927265849
  (0, 3042)	0.039840864696320574


### Vectorize with CountVectorizer

In [5]:
# Encode the same training posts with CountVectorizer for comparison with the
# TF-IDF matrix.
count_vectorizer = CountVectorizer()
vectors_count = count_vectorizer.fit_transform(newsgroups_train.data)

count_shape = vectors_count.shape
print('Shape: ', count_shape)
print('Dimensionality of vocabulary: %d' % count_shape[1])
print('Size of vocabulary: %d' % len(count_vectorizer.vocabulary_))
# First post's row, restricted to the first 10000 vocabulary columns.
print('Sample values', vectors_count[0, :10000])

Shape:  (2034, 26879)
Dimensionality of vocabulary: 26879
Size of vocabulary: 26879
Sample values   (0, 3042)	1
  (0, 5443)	1
  (0, 1152)	3
  (0, 2853)	1
  (0, 3607)	2
  (0, 7761)	1
  (0, 3254)	1
  (0, 5220)	1
  (0, 8620)	2
  (0, 3397)	2
  (0, 9935)	1
  (0, 2427)	1
  (0, 4326)	1
  (0, 2408)	1
  (0, 5604)	1
  (0, 4030)	1


### Classify with Naive Bayes

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

#### Pipeline

In [7]:
# Stage 1: turn raw text into TF-IDF features (wrapped in its own pipeline).
preprocessing = Pipeline(steps=[('vectorize', TfidfVectorizer())])
# Stage 2: multinomial Naive Bayes with smoothing parameter alpha=0.1.
classifier = MultinomialNB(alpha=0.1)

# Chain both stages so fit/predict accept raw documents directly.
pipeline_steps = [
    ('preprocessing', preprocessing),
    ('clf', classifier),
]
pipeline = Pipeline(steps=pipeline_steps)
# Kept as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(newsgroups_train.data, newsgroups_train.target)

Pipeline(memory=None,
         steps=[('preprocessing',
                 Pipeline(memory=None,
                          steps=[('vectorize',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.float64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1,
                                                  ngram_range=(1, 1), norm='l2',
                                                  preprocessor=None,
                                                  smooth_idf=True,
                                                  stop_words=None,
                                               

In [8]:
# Score the fitted pipeline on the held-out test split.
pred = pipeline.predict(newsgroups_test.data)
f1_micro = metrics.f1_score(newsgroups_test.target, pred, average='micro')
print('F1 micro score: %.2f' % f1_micro)

F1 micro score: 0.79
