# Class of Week 7
Content:  
1. TF-IDF  
2. Singular Value Decomposition  
3. Latent Semantic Indexing
4. Latent Dirichlet Allocation  
5. Open Questions and Project Support

## TF-IDF
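Term Frequency–Inverse Document Frequency: $\text{tfidf}(t, d) = \text{tf}(t, d) \cdot \text{idf}(t)$.  
Bag-of-words approach that gives more weight to 'important' words, i.e. words that occur often in a document but in few documents of the corpus.  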
Useful resources:  
https://www.youtube.com/watch?v=4vT4fzjkGCQ  
https://www.youtube.com/watch?v=hXNbFNCgPfY  


### Basic TF-IDF

In [1]:
# Toy corpus: two short documents that share most of their words.
docs = [
    'the dog sits on the table',
    'the cat sits on the sofa',
]

docs

['the dog sits on the table', 'the cat sits on the sofa']

In [3]:
import collections

# Split each document on whitespace and count how often every token occurs.
docs_counter = [collections.Counter(tokens) for tokens in map(str.split, docs)]

docs_counter

[Counter({'dog': 1, 'on': 1, 'sits': 1, 'table': 1, 'the': 2}),
 Counter({'cat': 1, 'on': 1, 'sits': 1, 'sofa': 1, 'the': 2})]

In [4]:
import itertools

# Vocabulary: the union of the keys of all per-document counters.
terms = set().union(*docs_counter)

terms

{'cat', 'dog', 'on', 'sits', 'sofa', 'table', 'the'}

In [6]:
# Term-frequency lookup.
def tf(t, d):
    """Return the count stored for term t in document d, or 0 if t is absent.

    d is a mapping of term -> count (e.g. a collections.Counter).
    """
    # dict.get supplies the 0 for terms the document never mentions.
    return d.get(t, 0)
    
# Term frequency of 'cat' in the first and in the second document, one per line.
print(tf('cat', docs_counter[0]), tf('cat', docs_counter[1]), sep='\n')

0
1


In [7]:
# One {term: frequency} row per document, covering the whole vocabulary.
tf_matrix = [
    {term: tf(term, counts) for term in terms}
    for counts in docs_counter
]

tf_matrix

[{'cat': 0, 'dog': 1, 'on': 1, 'sits': 1, 'sofa': 0, 'table': 1, 'the': 2},
 {'cat': 1, 'dog': 0, 'on': 1, 'sits': 1, 'sofa': 1, 'table': 0, 'the': 2}]

In [10]:
import math

# Inverse document-frequency.
def idf(t, D):
    """Return log2(N / df) for term t over the documents D.

    N is the number of documents in D and df the number of documents that
    contain t. Raises ZeroDivisionError when no document contains t.
    """
    n_docs = len(D)
    n_docs_with_term = sum(1 for d in D if t in d)
    return math.log(n_docs / n_docs_with_term, 2)

# 'the' occurs in every document, 'cat' in only one of them.
print(idf('the', docs_counter), idf('cat', docs_counter), sep='\n')

0.0
1.0


In [11]:
# idf of every vocabulary term, keyed by the term.
idf_vector = dict((term, idf(term, docs_counter)) for term in terms)

idf_vector

{'cat': 1.0,
 'dog': 1.0,
 'on': 0.0,
 'sits': 0.0,
 'sofa': 1.0,
 'table': 1.0,
 'the': 0.0}

In [46]:
# TF-IDF: weight each document's term frequencies by the idf of the term.
tfidf_matrix = [
    {term: frequencies[term] * idf_vector[term] for term in terms}
    for frequencies in tf_matrix
]

tfidf_matrix

[{'cat': 0.0,
  'dog': 1.0,
  'on': 0.0,
  'sits': 0.0,
  'sofa': 0.0,
  'table': 1.0,
  'the': 0.0},
 {'cat': 1.0,
  'dog': 0.0,
  'on': 0.0,
  'sits': 0.0,
  'sofa': 1.0,
  'table': 0.0,
  'the': 0.0}]

In [47]:
import pandas

# Column labels 'tf_1', 'tf_2', ... - one per row of the term-frequency matrix.
tf_cols = ['tf_' + str(position) for position, _ in enumerate(tf_matrix, start=1)]

# Column labels 'tfidf_1', 'tfidf_2', ... - one per row of the TF-IDF matrix.
tfidf_cols = ['tfidf_' + str(position) for position, _ in enumerate(tfidf_matrix, start=1)]

# Helper that lays the tf / idf / tf-idf values out side by side.
def create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols):
    """Build a term-indexed data frame with tf, idf and tf-idf columns.

    tf_matrix and tfidf_matrix are sequences of {term: value} dicts that are
    paired positionally with the labels in tf_cols and tfidf_cols; idf_vector
    is a single {term: value} dict stored under the 'idf' column. The returned
    frame has its columns ordered as tf_cols, then 'idf', then tfidf_cols.
    """
    # Collect every column as label -> {term: value}.
    columns = dict(zip(tf_cols, tf_matrix))
    columns['idf'] = idf_vector
    columns.update(zip(tfidf_cols, tfidf_matrix))

    # Select the columns explicitly so the layout does not depend on dict order.
    ordered_labels = [*tf_cols, 'idf', *tfidf_cols]
    return pandas.DataFrame.from_dict(columns)[ordered_labels]

In [18]:
# Show the title, a blank line, and then the combined tf / idf / tf-idf table.
print('Standard TF-IDF\n',
      create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols),
      sep='\n')

Standard TF-IDF

       tf_1  tf_2  idf  tfidf_1  tfidf_2
cat       0     1  1.0      0.0      1.0
dog       1     0  1.0      1.0      0.0
on        1     1  0.0      0.0      0.0
sits      1     1  0.0      0.0      0.0
sofa      0     1  1.0      0.0      1.0
table     1     0  1.0      1.0      0.0
the       2     2  0.0      0.0      0.0


### Advanced TF-IDF

In [19]:
# Term-frequency with optional sub-linear (logarithmic) scaling.
def tf(t, d, sub_linear=False):
    """Return the term-frequency of term t in document d.

    Returns 0 when t does not occur in d. Otherwise returns the stored count,
    or log(count) + 1 when sub_linear is true.
    """
    # Guard clause: terms the document does not contain have frequency 0.
    if t not in d:
        return 0

    count = d[t]
    return math.log(count) + 1 if sub_linear else count

In [29]:
# Recompute the term frequencies with sub-linear scaling switched on.
tf_matrix = [
    {term: tf(term, counts, sub_linear=True) for term in terms}
    for counts in docs_counter
]
# Weight the new term frequencies by the current idf vector.
tfidf_matrix = [
    {term: frequencies[term] * idf_vector[term] for term in terms}
    for frequencies in tf_matrix
]

print('Sub-Linear TF-IDF\n')
print(create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols))

Sub-Linear TF-IDF

           tf_1      tf_2       idf   tfidf_1   tfidf_2
cat    0.000000  0.000000  1.584963  0.000000  0.000000
dog    0.000000  0.000000  1.584963  0.000000  0.000000
on     0.000000  0.000000  1.000000  0.000000  0.000000
sits   0.000000  0.000000  1.000000  0.000000  0.000000
sofa   0.000000  0.000000  1.584963  0.000000  0.000000
table  0.000000  0.000000  1.584963  0.000000  0.000000
the    0.693147  0.693147  1.000000  0.693147  0.693147


In [21]:
# Inverse document-frequency with an optional smoothing term.
def idf(t, D, smoother=False):
    """Return the inverse document-frequency of term t over the documents D.

    Computes log2(N / df), where N is the number of documents and df the
    number of documents containing t. With smoother set, 1 is added to the
    ratio before the logarithm, i.e. log2(N / df + 1).
    Raises ZeroDivisionError when no document contains t.
    """
    docs_with_term = [d for d in D if t in d]
    ratio = len(D) / len(docs_with_term)

    # The smoothed variant shifts the ratio by one before taking the log.
    return math.log(ratio + 1 if smoother else ratio, 2)

In [28]:
# Recompute the idf vector with smoothing; tf_matrix is reused as it currently stands.
idf_vector = {
    term: idf(term, docs_counter, smoother=True)
    for term in terms
}
tfidf_matrix = [
    {term: frequencies[term] * idf_vector[term] for term in terms}
    for frequencies in tf_matrix
]

print('Smoother TF-IDF\n')
print(create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols))

Smoother TF-IDF

           tf_1      tf_2       idf   tfidf_1   tfidf_2
cat    0.000000  0.353553  1.584963  0.000000  0.560369
dog    0.353553  0.000000  1.584963  0.560369  0.000000
on     0.353553  0.353553  1.000000  0.353553  0.353553
sits   0.353553  0.353553  1.000000  0.353553  0.353553
sofa   0.000000  0.353553  1.584963  0.000000  0.560369
table  0.353553  0.000000  1.584963  0.560369  0.000000
the    0.707107  0.707107  1.000000  0.707107  0.707107


In [24]:
import numpy

# Term-frequency with optional sub-linear scaling and vector normalization.
def tf(t, d, sub_linear=False, normalization=None):
    """Calculates term-frequency for term t in document d.

    Args:
        t: Term to look up.
        d: Mapping of term -> raw count for one document (e.g. a Counter).
        sub_linear: If true, every raw count c is replaced by log(c) + 1
            before any normalization is applied.
        normalization: 'l1' divides the weights by the sum of their absolute
            values, 'l2' by their Euclidean norm. Any other value (including
            None) leaves the weights unnormalized.

    Returns:
        The weight of t in d, or 0 if t does not occur in d.
    """
    # Scale first, normalize second. Taking the log of an already normalized
    # weight (< 1) would produce negative term frequencies, and dropping the
    # "+ 1" would give every single-occurrence term a weight of log(1) = 0.
    if sub_linear:
        # Non-positive counts carry no weight and have no logarithm.
        weights = {word: math.log(count) + 1
                   for word, count in d.items() if count > 0}
    else:
        weights = d

    # Terms the document does not contain have frequency 0.
    if t not in weights:
        return 0

    if normalization == 'l1':
        normalizer = numpy.sum(numpy.abs(list(weights.values())))
    elif normalization == 'l2':
        normalizer = numpy.sqrt(numpy.sum(numpy.square(list(weights.values()))))
    else:
        return weights[t]

    # An all-zero weight vector cannot be normalized; avoid dividing by zero.
    if normalizer == 0:
        return 0
    return weights[t] / normalizer

In [25]:
# Term frequencies divided by each document's L1 norm, without sub-linear scaling.
tf_matrix = [
    {term: tf(term, counts, sub_linear=False, normalization='l1') for term in terms}
    for counts in docs_counter
]
tfidf_matrix = [
    {term: frequencies[term] * idf_vector[term] for term in terms}
    for frequencies in tf_matrix
]

print('L1 TF-IDF\n')
print(create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols))

L1 TF-IDF

           tf_1      tf_2       idf   tfidf_1   tfidf_2
cat    0.000000  0.166667  1.584963  0.000000  0.264160
dog    0.166667  0.000000  1.584963  0.264160  0.000000
on     0.166667  0.166667  1.000000  0.166667  0.166667
sits   0.166667  0.166667  1.000000  0.166667  0.166667
sofa   0.000000  0.166667  1.584963  0.000000  0.264160
table  0.166667  0.000000  1.584963  0.264160  0.000000
the    0.333333  0.333333  1.000000  0.333333  0.333333


In [27]:
# Term frequencies divided by each document's L2 norm, without sub-linear scaling.
tf_matrix = [
    {term: tf(term, counts, sub_linear=False, normalization='l2') for term in terms}
    for counts in docs_counter
]
tfidf_matrix = [
    {term: frequencies[term] * idf_vector[term] for term in terms}
    for frequencies in tf_matrix
]

print('L2 TF-IDF\n')
print(create_df(tf_matrix, idf_vector, tfidf_matrix, tf_cols, tfidf_cols))

L2 TF-IDF

           tf_1      tf_2       idf   tfidf_1   tfidf_2
cat    0.000000  0.353553  1.584963  0.000000  0.560369
dog    0.353553  0.000000  1.584963  0.560369  0.000000
on     0.353553  0.353553  1.000000  0.353553  0.353553
sits   0.353553  0.353553  1.000000  0.353553  0.353553
sofa   0.000000  0.353553  1.584963  0.000000  0.560369
table  0.353553  0.000000  1.584963  0.560369  0.000000
the    0.707107  0.707107  1.000000  0.707107  0.707107


### TF-IDF with Gensim

In [33]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Whitespace-tokenise the documents into lists of tokens.
docs_tokenized = list(map(str.split, docs))

# Fit the dictionary on the tokenised documents.
dct = Dictionary(docs_tokenized)
# Convert every tokenised document to its bag-of-words representation.
corpus = list(map(dct.doc2bow, docs_tokenized))

# Fit the TF-IDF model on the corpus, then apply it to the first document.
model = TfidfModel(corpus)
vector = model[corpus[0]]

vector

[(0, 0.7071067811865475), (3, 0.7071067811865475)]

## Singular Value Decomposition
Useful resources:  
https://www.youtube.com/watch?v=P5mlg91as1c

In [34]:
# Toy matrix: one column per movie, seven rows of values per column.
# NOTE(review): rows presumably are users and values their ratings - confirm.
movie_dict = {
    'matrix': [1, 3, 4, 5, 0, 0, 0],
    'alien': [1, 3, 4, 5, 2, 0, 1],
    'serenity': [1, 3, 4, 5, 0, 0, 0],
    'casablanca': [0, 0, 0, 0, 4, 5, 2],
    'amelie': [0, 0, 0, 0, 4, 5, 2],
}

# Each dictionary key becomes a column of the data frame.
movie_matrix = pandas.DataFrame(movie_dict)

print('\n\nOriginal Matrix\n')
print(movie_matrix)



Original Matrix

   alien  amelie  casablanca  matrix  serenity
0      1       0           0       1         1
1      3       0           0       3         3
2      4       0           0       4         4
3      5       0           0       5         5
4      2       4           4       0         0
5      0       5           5       0         0
6      1       2           2       0         0


In [36]:
from sklearn.decomposition import TruncatedSVD

# TruncatedSVD uses a randomized solver by default, so fix the seed to make
# the printed numbers reproducible across kernel restarts.
svd_model = TruncatedSVD(n_components=2, random_state=0)

# Fit on the movie matrix and project its rows onto the two components.
svd_features = svd_model.fit_transform(movie_matrix)

print('SVD Features\n')
print(pandas.DataFrame(svd_features))

# One singular value per retained component.
print('\n\nSVD Singular Values\n')
print(pandas.DataFrame(svd_model.singular_values_))

# One row per component, one column per column of movie_matrix.
print('\n\nSVD Components\n')
print(pandas.DataFrame(svd_model.components_))

SVD Features

          0         1
0  1.717377 -0.224512
1  5.152130 -0.673537
2  6.869507 -0.898049
3  8.586884 -1.122561
4  1.906788  5.620551
5  0.901335  6.953762
6  0.953394  2.810275


SVD Singular Values

           0
0  12.481015
1   9.508614


SVD Components

          0         1         2         3         4
0  0.592860  0.090134  0.090134  0.562258  0.562258
1  0.028771  0.695376  0.695376 -0.126641 -0.126641


## Latent Semantic Indexing
Useful resources:  
https://www.youtube.com/watch?v=BJ0MnawUpaU

In [40]:
from gensim import corpora, models

# Map every token to an integer id and encode the documents as bags of words.
dictionary = corpora.Dictionary(docs_tokenized)
corpus = [dictionary.doc2bow(text) for text in docs_tokenized]

# Re-weight the raw counts with TF-IDF before fitting the LSI model.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

total_topics = 2

lsi = models.LsiModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics=total_topics)

# Row i of the projection matrix belongs to token id i, so look the labels up
# by id. `dictionary.id2token` is filled lazily by gensim and may still be
# empty here, and the order of its values is not tied to the row order;
# indexing the dictionary by id avoids both problems.
lsi_terms = [dictionary[token_id] for token_id in range(len(dictionary))]

df = pandas.DataFrame(lsi.projection.u,
                      index=pandas.Index(lsi_terms, name='term'))


print(df)

                  0             1
term                             
dog    7.071068e-01 -5.551115e-17
on     1.098401e-16  2.505618e-16
sits  -1.971561e-32 -4.667493e-32
table  7.071068e-01  9.340158e-34
the   -1.756232e-34 -6.162976e-33
cat    3.988556e-17  7.071068e-01
sofa   7.113675e-17  7.071068e-01


In [49]:
from sklearn.decomposition import LatentDirichletAllocation


# The LDA fit is stochastic; fix the seed so the printed topics are
# reproducible across kernel restarts.
lda_model = LatentDirichletAllocation(n_components=2,
                                      doc_topic_prior=0.9,
                                      topic_word_prior=0.9,
                                      random_state=0)

# Fit on the movie matrix; the result has one row per matrix row and one
# column per topic.
lda_features = lda_model.fit_transform(movie_matrix)

print('\n\nLDA Features\n')
print(pandas.DataFrame(lda_features))

# exp_dirichlet_component_ is exp(E[log(beta)]) of the topic-word
# distribution, so its rows are not normalized to sum to one.
print('\n\nLDA Components\n')
print(pandas.DataFrame(lda_model.exp_dirichlet_component_))



LDA Features

          0         1
0  0.794225  0.205775
1  0.907556  0.092444
2  0.927545  0.072455
3  0.940429  0.059571
4  0.111079  0.888921
5  0.078317  0.921683
6  0.179559  0.820441


LDA Components

          0         1         2         3         4
0  0.309027  0.019400  0.019045  0.297667  0.301750
1  0.114258  0.376663  0.376310  0.031118  0.024058


