In [1]:
import numpy as np
from nltk.tokenize import word_tokenize

In [2]:
text = [
        'Topic sentences are similar to mini thesis statements.\
        Like a thesis statement, a topic sentence has a specific \
        main point. Whereas the thesis is the main point of the essay',\
        'the topic sentence is the main point of the paragraph.\
        Like the thesis statement, a topic sentence has a unifying function. \
        But a thesis statement or topic sentence alone doesn’t guarantee unity.', \
        'An essay is unified if all the paragraphs relate to the thesis,\
        whereas a paragraph is unified if all the sentences relate to the topic sentence.'
        ]

In [5]:
sentences = list()
word_set = list()

for sent in text:
    x = [i.lower() for i in word_tokenize(sent) if i.isalpha()]
    sentences.append(x)
    for word in x:
        if word not in word_set:
            word_set.append(word)

# Set of vocabulary
word_set = set(word_set)

# Total elements in our corpus
total_documents = len(sentences)

# Creating an index for each word in the vocabulary.
index_dict = dict()
i = 0

for word in word_set:
    index_dict[word] = i
    i += 1

In [6]:
sentences

[['topic',
  'sentences',
  'are',
  'similar',
  'to',
  'mini',
  'thesis',
  'statements',
  'like',
  'a',
  'thesis',
  'statement',
  'a',
  'topic',
  'sentence',
  'has',
  'a',
  'specific',
  'main',
  'point',
  'whereas',
  'the',
  'thesis',
  'is',
  'the',
  'main',
  'point',
  'of',
  'the',
  'essay'],
 ['the',
  'topic',
  'sentence',
  'is',
  'the',
  'main',
  'point',
  'of',
  'the',
  'paragraph',
  'like',
  'the',
  'thesis',
  'statement',
  'a',
  'topic',
  'sentence',
  'has',
  'a',
  'unifying',
  'function',
  'but',
  'a',
  'thesis',
  'statement',
  'or',
  'topic',
  'sentence',
  'alone',
  'doesn',
  't',
  'guarantee',
  'unity'],
 ['an',
  'essay',
  'is',
  'unified',
  'if',
  'all',
  'the',
  'paragraphs',
  'relate',
  'to',
  'the',
  'thesis',
  'whereas',
  'a',
  'paragraph',
  'is',
  'unified',
  'if',
  'all',
  'the',
  'sentences',
  'relate',
  'to',
  'the',
  'topic',
  'sentence']]

In [8]:
word_set

{'a',
 'all',
 'alone',
 'an',
 'are',
 'but',
 'doesn',
 'essay',
 'function',
 'guarantee',
 'has',
 'if',
 'is',
 'like',
 'main',
 'mini',
 'of',
 'or',
 'paragraph',
 'paragraphs',
 'point',
 'relate',
 'sentence',
 'sentences',
 'similar',
 'specific',
 'statement',
 'statements',
 't',
 'the',
 'thesis',
 'to',
 'topic',
 'unified',
 'unifying',
 'unity',
 'whereas'}

In [9]:
type(sentences), type(word_set)

(list, set)

In [10]:
len(sentences), len(word_set)

(3, 37)

In [12]:
# Create a count dictionary

def count_dict(sentences):
    word_count = dict()

    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    
    return word_count

word_count = count_dict(sentences)

In [13]:
word_count

{'sentence': 3,
 'but': 1,
 'sentences': 2,
 'statements': 1,
 'guarantee': 1,
 'thesis': 3,
 'or': 1,
 'has': 2,
 'unifying': 1,
 'an': 1,
 't': 1,
 'relate': 1,
 'like': 2,
 'if': 1,
 'to': 2,
 'the': 3,
 'is': 3,
 'a': 3,
 'doesn': 1,
 'unified': 1,
 'paragraphs': 1,
 'function': 1,
 'mini': 1,
 'of': 2,
 'statement': 2,
 'alone': 1,
 'topic': 3,
 'all': 1,
 'specific': 1,
 'point': 2,
 'are': 1,
 'essay': 2,
 'similar': 1,
 'unity': 1,
 'main': 2,
 'paragraph': 2,
 'whereas': 2}

In [17]:
# TF - Term Frequency
# count of t in d / no of words in d

def term_freq(document, word):
    n = len(document)
    occurence = len([token for token in document if token == word])

    return occurence / n

In [18]:
# IDF - Inverse Document Frequency
# log(n / (df + 1))

def inv_doc_freq(word):
    try:
        word_occurence = word_count[word] + 1
    except:
        word_occurence = 1

    return np.log(total_documents / word_occurence)

In [19]:
# tf-idf

def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))

    for word in sentence:
        tf = term_freq(sentence, word)
        idf = inv_doc_freq(word)

        value = tf * idf
        tf_idf_vec[index_dict[word]] = value

    return tf_idf_vec

In [21]:
vectors = list()

for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)

print(vectors)

[array([-0.0095894 ,  0.        ,  0.        ,  0.0135155 ,  0.        ,
       -0.02876821,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.02876821, -0.0095894 , -0.02876821,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0135155 ,  0.        ,  0.        ,
        0.        , -0.0191788 ,  0.        ,  0.0135155 ,  0.        ,
        0.0135155 ,  0.        ,  0.0135155 ,  0.        ,  0.        ,
        0.        ,  0.        ]), array([-0.02615292,  0.01228682,  0.        ,  0.        ,  0.01228682,
       -0.01743528,  0.01228682,  0.        ,  0.01228682,  0.        ,
        0.01228682,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.03487055, -0.00871764, -0.02615292,  0.01228682,  0.        ,
        0.        ,  0.01228682,  0.        ,  0.        ,  0.        ,
        0.01228682, -0.02615292,  0.        ,  0.        ,  0.        ,
        0.        ,  0.     