# Example computations for TFIDF using pandas data frames

Looking at the words in the next code cell, which words best distinguish the three documents?

In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# Three tiny "documents"; d1 deliberately repeats the word "new"
d1 = "the new new york times"
d2 = "the new york post"
d3 = "the los angeles times"

docstrs = [d1, d2, d3]
docs = list(map(str.split, docstrs))  # whitespace-tokenize each document
N = len(docstrs)                      # number of documents in the corpus
docs

[['the', 'new', 'new', 'york', 'times'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]

In [2]:
# Vocabulary: every distinct word across all documents, in alphabetical order
uniq = sorted({word for text in docstrs for word in text.split()})
uniq

['angeles', 'los', 'new', 'post', 'the', 'times', 'york']

# Word vectors and term counts

First, let's do this the hard way to see the actual computations.

In [3]:
# Start with a table that has no columns yet, indexed by the vocabulary words
tf = pd.DataFrame({'word': list(uniq)}).set_index('word')
tf

angeles
los
new
post
the
times
york


In [25]:
# slow but obvious implementation: build one Counter per document and copy
# each (word, count) pair into that document's column of `tf`
for i, d in enumerate(docs):
    counts = Counter(d)
    print(counts.items())
    # Use distinct names for the Counter and the per-word count; the original
    # reused `c` for both, so the inner loop overwrote the Counter it was iterating.
    for w, n in counts.items():
        tf.loc[w, f"d{i+1}"] = n

dict_items([('the', 1), ('new', 2), ('york', 1), ('times', 1)])
dict_items([('the', 1), ('new', 1), ('york', 1), ('post', 1)])
dict_items([('the', 1), ('los', 1), ('angeles', 1), ('times', 1)])


In [6]:
# Inspect the counts so far; NaN marks a word that never occurs in that document
tf

Unnamed: 0_level_0,d1,d2,d3
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
angeles,,,1.0
los,,,1.0
new,2.0,1.0,
post,,1.0,
the,1.0,1.0,1.0
times,1.0,,1.0
york,1.0,1.0,


In [7]:
# A missing count means the word is absent from the document: use 0,
# then convert the columns back to integers
counts_filled = tf.fillna(0)
tf = counts_filled.astype('int')
tf

Unnamed: 0_level_0,d1,d2,d3
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
angeles,0,0,1
los,0,0,1
new,2,1,0
post,0,1,0
the,1,1,1
times,1,0,1
york,1,1,0


In [8]:
# Transpose so that each row is a document and each column is a word
tf.T

word,angeles,los,new,post,the,times,york
d1,0,0,2,0,1,1,1
d2,0,0,1,1,1,0,1
d3,1,1,0,0,1,1,0


## Convert term counts to term frequencies

In [23]:
# Term frequency = raw count divided by the number of tokens in that document
for name, doc in zip(('d1', 'd2', 'd3'), docs):
    tf[f'{name} tf'] = tf[name] / len(doc)
tf

Unnamed: 0_level_0,d1,d2,d3,d1 tf,d2 tf,d3 tf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
angeles,0,0,1,0.0,0.0,0.25
los,0,0,1,0.0,0.0,0.25
new,2,1,0,0.4,0.25,0.0
post,0,1,0,0.0,0.25,0.0
the,1,1,1,0.2,0.25,0.25
times,1,0,1,0.2,0.0,0.25
york,1,1,0,0.2,0.25,0.0


# Document frequencies

In [10]:
# One row per vocabulary word, indexed by the word itself
df = pd.DataFrame({'word': uniq}).set_index('word')

# For each word, count how many documents contain it at least once
doc_counts = []
for word in uniq:
    containing = [word in doc for doc in docs]
    doc_counts.append(np.sum(containing))
df['doc count'] = doc_counts
df

Unnamed: 0_level_0,doc count
word,Unnamed: 1_level_1
angeles,1
los,1
new,2
post,1
the,3
times,2
york,2


In [11]:
# Document frequency as a fraction of the corpus, with +1 in numerator and
# denominator ("additive smoothing").
# For the unsmoothed variant use: df['doc count'] / N
smoothed_fraction = (df['doc count'] + 1) / (N + 1)
df['df'] = smoothed_fraction

# Inverse document frequency, then its base-10 logarithm
inverse_df = 1 / df['df']
df['idf'] = inverse_df
df['log idf'] = np.log10(inverse_df)
df

Unnamed: 0_level_0,doc count,df,idf,log idf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
angeles,1,0.5,2.0,0.30103
los,1,0.5,2.0,0.30103
new,2,0.75,1.333333,0.124939
post,1,0.5,2.0,0.30103
the,3,1.0,1.0,0.0
times,2,0.75,1.333333,0.124939
york,2,0.75,1.333333,0.124939


# TF-IDF

In [12]:
# Put the per-document term frequencies next to the per-word idf columns
tf_part = tf[['d1 tf', 'd2 tf', 'd3 tf']]
idf_part = df[['df', 'idf', 'log idf']]
tfidf = pd.concat([tf_part, idf_part], axis=1)
tfidf

Unnamed: 0_level_0,d1 tf,d2 tf,d3 tf,df,idf,log idf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
angeles,0.0,0.0,0.25,0.5,2.0,0.30103
los,0.0,0.0,0.25,0.5,2.0,0.30103
new,0.4,0.25,0.0,0.75,1.333333,0.124939
post,0.0,0.25,0.0,0.5,2.0,0.30103
the,0.2,0.25,0.25,1.0,1.0,0.0
times,0.2,0.0,0.25,0.75,1.333333,0.124939
york,0.2,0.25,0.0,0.75,1.333333,0.124939


In [13]:
# TF-IDF = each document's term frequency weighted by the shared log idf
log_idf = tfidf['log idf']
for name in ('d1', 'd2', 'd3'):
    tfidf[f'{name} tfidf'] = tfidf[f'{name} tf'] * log_idf
tfidf

Unnamed: 0_level_0,d1 tf,d2 tf,d3 tf,df,idf,log idf,d1 tfidf,d2 tfidf,d3 tfidf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
angeles,0.0,0.0,0.25,0.5,2.0,0.30103,0.0,0.0,0.075257
los,0.0,0.0,0.25,0.5,2.0,0.30103,0.0,0.0,0.075257
new,0.4,0.25,0.0,0.75,1.333333,0.124939,0.049975,0.031235,0.0
post,0.0,0.25,0.0,0.5,2.0,0.30103,0.0,0.075257,0.0
the,0.2,0.25,0.25,1.0,1.0,0.0,0.0,0.0,0.0
times,0.2,0.0,0.25,0.75,1.333333,0.124939,0.024988,0.0,0.031235
york,0.2,0.25,0.0,0.75,1.333333,0.124939,0.024988,0.031235,0.0


# CountVectorizer

OK, now let's get the term counts the easy way.

In [14]:
# Reminder of the raw document strings that get passed to the vectorizer
docstrs

['the new new york times', 'the new york post', 'the los angeles times']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the raw strings and get back the document-term
# count matrix (stored sparse)
X = vectorizer.fit_transform(docstrs)
X

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [16]:
print(X) # sparse storage: each stored entry prints as its (row, column) position followed by its value

  (0, 4)	1
  (0, 2)	2
  (0, 6)	1
  (0, 5)	1
  (1, 4)	1
  (1, 2)	1
  (1, 6)	1
  (1, 3)	1
  (2, 4)	1
  (2, 5)	1
  (2, 1)	1
  (2, 0)	1


In [17]:
# Expand to a dense array: one row per document, one column per vocabulary word
X.toarray()

array([[0, 0, 2, 0, 1, 1, 1],
       [0, 0, 1, 1, 1, 0, 1],
       [1, 1, 0, 0, 1, 1, 0]])

In [18]:
# Vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it
# and returns an array, so wrap it in list() for a plain-list display.
list(vectorizer.get_feature_names_out())

['angeles', 'los', 'new', 'post', 'the', 'times', 'york']

In [19]:
# Label the dense count matrix with the vocabulary words.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (available since 1.0) instead.
pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,angeles,los,new,post,the,times,york
0,0,0,2,0,1,1,1
1,0,0,1,1,1,0,1
2,1,1,0,0,1,1,0


Compare to manually computed values. Heh, we match!

In [24]:
# Manually computed counts with documents as rows, for side-by-side comparison
tf[['d1','d2','d3']].T

word,angeles,los,new,post,the,times,york
d1,0,0,2,0,1,1,1
d2,0,0,1,1,1,0,1
d3,1,1,0,0,1,1,0


## TfidfTransformer

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

# Turn the raw count matrix X into TF-IDF weights using sklearn's defaults
trans = TfidfTransformer()
D = trans.fit_transform(X).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary in column order.
pd.DataFrame(data=D, columns=vectorizer.get_feature_names_out())

Unnamed: 0,angeles,los,new,post,the,times,york
0,0.0,0.0,0.778317,0.0,0.302216,0.389158,0.389158
1,0.0,0.0,0.480458,0.631745,0.373119,0.0,0.480458
2,0.584483,0.584483,0.0,0.0,0.345205,0.444514,0.0


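Compare to the manually computed values below. The numbers differ because sklearn uses a different formula from ours: with its default settings, `TfidfTransformer` multiplies the raw term count (not the length-normalized frequency) by $\text{idf}(t) = \ln\frac{1+n}{1+\text{df}(t)} + 1$ (natural log, plus one so that words appearing in every document, such as *the*, are not zeroed out), and then scales each document's row to unit Euclidean length. The sklearn documentation also says "*Note that the idf formula above differs from the standard textbook notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]*", so we're a bit different. The ranking of words within each document is nevertheless the same as ours; only *the* moves from exactly zero to a small nonzero weight.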

In [22]:
# Manually computed TF-IDF, one row per document, to compare with sklearn's values
tfidf[['d1 tfidf', 'd2 tfidf', 'd3 tfidf']].T

word,angeles,los,new,post,the,times,york
d1 tfidf,0.0,0.0,0.049975,0.0,0.0,0.024988,0.024988
d2 tfidf,0.0,0.0,0.031235,0.075257,0.0,0.0,0.031235
d3 tfidf,0.075257,0.075257,0.0,0.0,0.0,0.031235,0.0


## What can we conclude?

For these three documents, the words with the highest TFIDF scores are *new*, *post*, and *angeles* (or *los*). That corresponds to our intuition that those words distinguish between the documents best. Words like *the* appear in every document so they are not good distinguishing features. Because *york* appears in two of three documents, its score was attenuated but not reduced to zero.