# COMM7380 Recommender Systems for Digital Media

In [None]:
# Install Matplotlib, Pandas, NumPy and NLTK with pip into the current Jupyter kernel.
# Running pip through sys.executable targets the interpreter that backs this kernel.
import sys
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install nltk

### Import libraries

In [None]:
import os
import math
import nltk
import time
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

### Cosine Similarity Function

In [None]:
# Cosine Similarity
def cosine_similarity(x, y):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    x, y : array-like of numbers, same length
        The vectors to compare. Lists are accepted and converted to arrays.

    Returns
    -------
    float
        dot(x, y) / (|x| * |y|). When either vector has zero length the
        angle is undefined and 0.0 is returned instead of dividing by zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    x_norm = np.sqrt(np.dot(x, x))
    y_norm = np.sqrt(np.dot(y, y))

    # Guard BOTH norms: a zero-length x would otherwise produce 0/0 = NaN.
    if x_norm == 0 or y_norm == 0:
        return 0.0

    return np.dot(x, y) / (x_norm * y_norm)

In [None]:
# Sanity-check cosine_similarity: first an identical pair, then two different vectors
for left, right in (([1,2,3], [1,2,3]), ([3,4,5], [1,3,4])):
    print(cosine_similarity(np.array(left), np.array(right)))

### Make Inverted Index & Document Count

Utility function that removes punctuation and digits. It also works around a problem identified in the data: some news items contain `cnn` attached to the neighbouring word without spacing.

In [None]:
def clean_str(texts):
    """Strip punctuation and digits from a string.

    texts : the raw string to clean.

    Every occurrence of 'cnn' is first surrounded with spaces and every
    apostrophe is turned into a space. After that, each character found in
    string.punctuation and each digit character is dropped.

    Returns the cleaned string.
    """
    # Pad 'cnn' so it stands alone where it is glued to a neighbouring word
    spaced = texts.replace('cnn', ' cnn ')
    # Apostrophes become a space instead of being deleted
    spaced = spaced.replace('\'', ' ')

    kept = []
    for ch in spaced:
        if ch in string.punctuation or ch.isdigit():
            continue
        kept.append(ch)

    return ''.join(kept)

Create dictionaries:
- doc2vocab: for each document, a dictionary mapping every term to its number of occurrences
  - `Doc 0 : {term1 : num 1, term2 : num 2, term3 : num 3, ...}`
- vocab2doc: for each word, the list of documents in which it can be found
  - `word : [text_num1, text_num2, ... ]`

In [None]:
doc2vocab  = dict()   # {doc number : {word1 : word1_num, word2 : word2_num, ...}}
vocab2doc  = dict()   # inverted index, {word : ['1.txt', '3.txt', ...]}

# We consider 60 documents in our collection
NUM_DOCS = 60

# The stop-word set is the same for every document, so build it once up front
stop = set(stopwords.words('english'))

for i in range(NUM_DOCS):
    doc2vocab[i] = dict()
    text_str = str(i) + '.txt'                         # posting-list entry for this document

    with open('../data/docs/%d.txt' % i, 'r', encoding="utf-8") as doc:
        read_string = doc.read()                       # whole document as one string

    # The file is already closed here; the remaining work is pure text processing
    read_string = read_string.lower()                  # lower-case the text
    read_string = clean_str(read_string)               # drop punctuation and digits

    tokens = nltk.word_tokenize(read_string)           # split the text into tokens
    tokens = [j for j in tokens if j not in stop]      # get rid of stop words

    term_counts = doc2vocab[i]
    for words in tokens:
        if words in term_counts:
            term_counts[words] += 1
        else:
            term_counts[words] = 1
            # First time this word shows up in document i: since every document
            # is processed exactly once, this is the only moment its posting
            # list needs a new entry, so no membership scan is required.
            vocab2doc.setdefault(words, list()).append(text_str)

## Inverted Index: posting lists for president and obama

In [None]:
# Show the posting list (documents containing the word) for each query word
for label, word in (('president : ', 'president'), ('obama : ', 'obama')):
    print(label, vocab2doc[word])

# Compute Term Frequency and Inverse Document Frequency

In [None]:
# One row per document, one column per term; a term missing from a document counts as 0
term_pd = pd.DataFrame.from_dict(doc2vocab, orient='index').fillna(0)
term_pd.head()

### Compute Document TF-IDF (on single term) and Cosine Similarity

In [None]:
def doc_tf_idf(dataframe, query):
    """Weight a term-count matrix with TF-IDF and score each document against a query.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw term counts, one row per document and one column per term.
        The frame is NOT modified; the weights are returned in a new frame.
    query : list of str (or a single str)
        The query terms. A bare string is treated as one term.

    Returns
    -------
    new_tf : pandas.DataFrame
        Same index and columns as `dataframe`. Each cell holds
        log(count + 1) * log(N / (df + 1)), where N is the number of
        documents (rows) and df the number of documents whose count for that
        term is > 0. Cells whose count is not > 0 get weight 0.
    final : list of (document label, similarity)
        For every document, the cosine similarity between its full TF-IDF row
        and the same row with every non-query term zeroed out.
    """
    # A bare string would be matched by substring ('pres' in 'president');
    # treat it as a single query term instead.
    if isinstance(query, str):
        query = [query]
    query_terms = set(query)

    # Copy the counts so the caller's matrix survives the call and the
    # document frequencies are never read from half-converted data.
    counts = dataframe.to_numpy(dtype=float, copy=True)
    n_docs = counts.shape[0]
    if n_docs == 0:
        return dataframe.copy(), list()

    present = counts > 0

    # Number of documents containing each term, then the smoothed IDF
    doc_term_value = present.sum(axis=0)
    doc_frequency = np.log(n_docs / (doc_term_value + 1))

    # Log-scaled term frequency; cells without a positive count stay at 0
    term_frequency = np.zeros_like(counts)
    term_frequency[present] = np.log(counts[present] + 1)

    weights = term_frequency * doc_frequency          # IDF broadcast over the rows
    new_tf = pd.DataFrame(weights, index=dataframe.index, columns=dataframe.columns)

    # Per-document "query vector": the document's own weights on the query terms only
    in_query = np.array([term in query_terms for term in dataframe.columns], dtype=bool)
    query_weights = np.where(in_query, weights, 0.0)

    final = list()
    start = time.time()
    for i, label in enumerate(dataframe.index):
        final.append((label, cosine_similarity(weights[i], query_weights[i])))

        if i % 10 == 0:
            print ('step : %d, time : %f' % (i, time.time()-start))

    return new_tf, final

### Get query and Compute Document TF-IDF weight

In [None]:
# Setup a query
query = ['president', 'obama']

# Tokenize every query term. Tokenizing only query[0] would silently drop
# 'obama' and rank the documents for 'president' alone.
query_token = [token for term in query for token in nltk.word_tokenize(term)]

# Compute TF-IDF weights and the per-document similarity to the query
term_doc_matrix, query_tf_idf = doc_tf_idf(term_pd, query_token)

# TF-IDF: president, obama term-document matrix

In [None]:
# Show the TF-IDF weight of each query term in every document
for term in ('president', 'obama'):
    print(term_doc_matrix[[term]])

# Cosine-Similarity: president obama Top 5 Documents

In [None]:
# Each entry is (document number, cosine similarity between query and document).
# Rank a copy of the results from most to least similar and show the best five.
score = list(query_tf_idf)
score.sort(key=lambda pair: pair[1], reverse=True)
score[:5]

- Course Instructor: Dr. Paolo Mengoni (Visiting Scholar, School of Communication, Hong Kong Baptist University) 
  - pmengoni@hkbu.edu.hk

- The code in this notebook takes inspiration from various sources. Thanks to [minstar](https://github.com/minstar/TF-IDF)
- All code is for educational purposes only and released under the CC1.0. 