In [1]:
%matplotlib inline

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import collections
import re
import nltk

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Tfidf (recap)

## Load 'news' dataset

Описание:
- постове в групи за дискусия
- 20 теми напр. автомобили, медицина, оръжия, MS Windows...
- 18,000 записа

In [37]:
news_raw = fetch_20newsgroups()

news_data = news_raw.data
news_target = news_raw.target
news_target_names = news_raw.target_names

len(news_data), len(news_target)

(11314, 11314)

In [27]:
for i in range(1):
    print('IN:')
    print('-' * 80)
    print(news_data[i])

    print('OUT:')
    print('-' * 80)
    print(news_target_names[news_target[i]])
    
    print('\n' * 3 )

IN:
--------------------------------------------------------------------------------
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





OUT:
--------------------------------------------------------------------------------
rec.autos






In [28]:
news_target_names[:5]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware']

In [None]:
news_target

array([7, 4, 4, ..., 3, 1, 8])

In [30]:
len(news_data)

11314

## Vectorize using TFIDF

In [31]:
stopwords = nltk.corpus.stopwords.words(fileids=['english',])

In [32]:
vectorizer = TfidfVectorizer(
    input = "content", 
    analyzer = "word", 
    ngram_range = (1, 4), 
    min_df = 0, 
    stop_words = stopwords, 
    sublinear_tf = True,
)

In [33]:
matrix = tfidf.fit_transform(news_data)
matrix

<11314x4264838 sparse matrix of type '<class 'numpy.float64'>'
	with 7309620 stored elements in Compressed Sparse Row format>

In [34]:
feature_names = vectorizer.get_feature_names_out()
feature_names.shape

NotFittedError: Vocabulary not fitted or provided

In [38]:
doc = 0 # Change the index to view another document

feature_index = matrix[doc, :].nonzero()[1]
tfidf_scores = zip(feature_index, [matrix[doc, x] for x in feature_index])

scores = [(feature_names[i], s) for (i, s) in tfidf_scores]
scores = pd.DataFrame(scores, columns=['token', 'tfidf'])

scores_sorted = scores.sort_values('tfidf', ascending=False)
scores_sorted

Unnamed: 0,token,tfidf
265,lerxst,0.117097
264,wam,0.085947
202,wam umd,0.085947
134,wam umd edu,0.085947
259,car,0.077798
...,...,...
256,host,0.013847
257,posting,0.013475
254,organization,0.007790
249,lines,0.007513


In [45]:
# tf-idf scores for the first document
s = matrix[0, :].toarray()
s[s != 0].shape

(266,)

In [47]:
s.nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0], dtype=int64),
 array([  69990,   73411,   73412,   73413,  271586,  271610,  271611,
         271

## Custom function

In [17]:
def calculate_tfidf(tf, idf):
    return tf * np.log(idf)

calculate_tfidf(2/15, 10000/2488), calculate_tfidf(1/15, 10000/506) 

(0.18548078908228172, 0.19892024684591447)

In [18]:
calculate_tfidf(2/15, 10000/7245)

0.04296980229618667

In [19]:
calculate_tfidf(1/15, 10000/102)

0.3056911705794608

In [20]:
calculate_tfidf(1/15, 10000/102), calculate_tfidf(4/15, 10000/102)

(0.3056911705794608, 1.2227646823178433)