# Part A: Bag of Words

In [19]:
import numpy as np

"""
    text: a string
    dict_size: size of the dictionary
    words: array of input text after split
    words_to_index: dictionary that maps words to their indices
    return a vector which is a bag-of-words representation of 'text'
"""

def my_bag_of_words(text, words_to_index, dict_size):
  result_vector = np.zeros(dict_size)
  words = text.split()
  for word in words:
    if word in words_to_index:
      result_vector[words_to_index[word]] += 1

  return result_vector

# Toy vocabulary: every known word is assigned one slot of the output vector.
words_to_index = {
    'hi': 0,
    'you': 1,
    'me': 2,
    'are': 3,
}
dict_size = 4

text = 'hi how are you'

# 'how' is not a key of words_to_index, so it does not affect the counts.
the_vector = my_bag_of_words(text, words_to_index, dict_size)
print(the_vector)

[1. 1. 0. 1.]


# Part B: TF-IDF

## 1. Test the script tfidf_demo.ipynb in a Jupyter notebook and make sure it works.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
texts = [
    "good movie",
    "not a good movie",
    "did not like",
    "i like it",
    "good one",
]

# Unigrams and bigrams; min_df=2 and max_df=0.5 restrict the vocabulary by
# document frequency (see the TfidfVectorizer documentation for exact semantics).
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
features = tfidf.fit_transform(texts)

# get_feature_names_out() is used here because get_feature_names() belongs to
# older versions of scikit-learn.
pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names_out())


Unnamed: 0,good movie,like,movie,not
0,0.707107,0.0,0.707107,0.0
1,0.57735,0.0,0.57735,0.57735
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0


## 2. Replace the movie review data "texts" in the script with your own documents and test it.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Custom corpus used in place of the movie-review examples.
my_texts = [
    "This is a python code.",
    "It is used to run a program",
    "By this program I analyze text",
    "Text analysis is used in NLP",
    "I like NLP",
]

# Same vectorizer settings as the movie-review demo: unigrams + bigrams,
# vocabulary restricted by min_df=2 and max_df=0.5.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
features = tfidf.fit_transform(my_texts)

# One row per document, one column per retained n-gram.
pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,is used,nlp,program,text,this,used
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.57735,0.0,0.57735,0.0,0.0,0.57735
2,0.0,0.0,0.57735,0.57735,0.57735,0.0
3,0.5,0.5,0.0,0.5,0.0,0.5
4,0.0,1.0,0.0,0.0,0.0,0.0


## 3. Given the below documents. 
## texts = [ "good movie", "not a good movie", "did not like", "i like it", "good one"]
## Given the definition of TF and IDF, what is the sum of TF-IDF values for 1-grams in "good movie" text? Enter a math expression as an answer. 

#### TF-IDF stands for "Term Frequency - Inverse Document Frequency". It measures how important a word is to a document within a collection (corpus).

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts = [
    "good movie",
    "not a good movie",
    "did not like",
    "i like it",
    "good one",
]

# Restrict the vocabulary to 1-grams only.
# NOTE(review): TfidfVectorizer is used with its default weighting options
# (per the scikit-learn docs these include IDF smoothing and row
# normalisation), so the printed sum may differ from a hand computation based
# on the textbook TF and IDF definitions -- confirm which one the question expects.
tfidf = TfidfVectorizer(ngram_range=(1, 1))
tfidf_matrix = tfidf.fit_transform(texts)

# Row 0 is the first document in `texts`, i.e. "good movie".
tfidf_values = tfidf_matrix[0].toarray().ravel()
sum_tfidf = sum(tfidf_values)
print(sum_tfidf)

1.408157650537996
