In [35]:
import string

# Read the raw text, one list entry per line of the file.
# The context manager guarantees the handle is closed, even if reading fails.
# NOTE(review): the doubled ".txt.txt" extension is kept as-is -- confirm it matches the file on disk.
with open("Data/Gutenberg_pg5200_clean.txt.txt", 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Mapping any punctuation to space, so that e.g. "wasn't" becomes the two tokens "wasn" and "t".
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Normalise every line in place: strip the BOM and punctuation, lower-case each
# word and collapse all runs of whitespace (including the trailing newline)
# into single spaces. Blank lines become empty strings and are kept.
for line_idx, line in enumerate(lines):
    # '\ufeff' is not whitespace for str.split(), so it must be blanked explicitly.
    line = line.replace('\ufeff', ' ').translate(translator)
    lines[line_idx] = ' '.join(word.lower() for word in line.split())

print(lines[:10])

# The whole corpus as one space-separated string; the cleaned lines no longer
# contain newlines, carriage returns or BOM characters.
data = ' '.join(lines)

print(data[:100])

['one morning when gregor samsa woke from troubled dreams he found', 'himself transformed in his bed into a horrible vermin he lay on', 'his armour like back and if he lifted his head a little he could', 'see his brown belly slightly domed and divided by arches into stiff', 'sections the bedding was hardly able to cover it and seemed ready', 'to slide off any moment his many legs pitifully thin compared', 'with the size of the rest of him waved about helplessly as he', 'looked', '', 'what s happened to me he thought it wasn t a dream his room']
one morning when gregor samsa woke from troubled dreams he found himself transformed in his bed into


In [36]:
# Vocabulary: every distinct word of the corpus, in order of first appearance.
# dict.fromkeys de-duplicates in a single pass while keeping insertion order,
# avoiding the quadratic cost of testing membership in a growing list.
vocab = list(dict.fromkeys(data.split()))

In [37]:
# Implementing TF-IDF from scratch
# Term frequency: raw number of occurrences of each vocabulary word
# across the whole corpus string.
vocab_count = {term: 0 for term in vocab}
for token in data.split():
    vocab_count[token] = vocab_count[token] + 1

In [38]:
# Computing IDF
import math

# Document frequency: each entry of `lines` is treated as one document, and a
# word is counted at most once per document (hence the set).
idf_dict = {term: 0 for term in vocab_count}
for document in lines:
    for term in set(document.split()):
        idf_dict[term] += 1

# Total number of documents (empty lines included).
N = len(lines)

# Turn each document frequency into idf(t) = ln(N / df(t)).
# Only existing keys are reassigned, so iterating the dict here is safe.
for term in idf_dict:
    idf_dict[term] = math.log(N / float(idf_dict[term]))

In [40]:
# Computing TF-IDF
# One corpus-level score per word: its total count multiplied by its IDF.
tfidf_dict = {term: count * idf_dict[term] for term, count in vocab_count.items()}


In [42]:
# Example of tokenized sentence with TF-IDF
# Encode the first cleaned line as the list of its words' TF-IDF scores.
first_line_words = lines[0].split()
print(lines[0])
print(len(first_line_words))
tokenized_sent = [tfidf_dict[term] for term in first_line_words]
print(tokenized_sent)
print(len(tokenized_sent))

one morning when gregor samsa woke from troubled dreams he found
11
[238.3812997707337, 98.61475380652796, 228.0736019147494, 566.2619826811884, 139.66504223817162, 13.76076816437201, 336.6032969915413, 7.57353126274595, 13.76076816437201, 797.0634468580033, 65.11156476869738]
11


In [43]:
# Reverse lookup from TF-IDF score back to a word.
# WARNING: this mapping is lossy. Words that share exactly the same score
# collide on one key and only the last such word in tfidf_dict is kept, so
# decoding a score may return a different word than the one that produced it.
inverse_tfidf_dict = {score: term for term, score in tfidf_dict.items()}

In [45]:
# Decode the example sentence: look each score up in the reverse mapping and
# print the word stored for it, one per line.
for score in tokenized_sent:
    decoded_word = inverse_tfidf_dict[score]
    print(decoded_word)

one
hand
when
gregor
samsa
tram
from
destination
tram
he
playing


In [47]:
# Sklearn's TF-IDF
# Reference implementation to compare against the from-scratch version.
from sklearn.feature_extraction.text import TfidfVectorizer
# use_idf=True requests inverse-document-frequency weighting of the term counts.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Fit on the cleaned lines and transform them in one step; each entry of
# `lines` is passed as one document.
tfidf = tfidf_vectorizer.fit_transform(lines)

In [51]:
import pandas as pd

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the older
# get_feature_names(); use whichever one this installation provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# TF-IDF scores of the first line as a single column, one row per vocabulary
# term. toarray() yields a plain ndarray rather than the np.matrix returned by
# todense().
df = pd.DataFrame(tfidf[0].T.toarray(), index=feature_names, columns=["TF-IDF"])

# Highest-scoring terms first; terms absent from the line score 0.
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

              TF-IDF
troubled    0.433653
woke        0.411342
dreams      0.411342
found       0.326577
morning     0.299260
samsa       0.279395
when        0.238809
one         0.236467
from        0.210224
gregor      0.159426
he          0.129282
punished    0.000000
public      0.000000
pull        0.000000
pulled      0.000000
provoking   0.000000
pulling     0.000000
abandoned   0.000000
pure        0.000000
provincial  0.000000
pursue      0.000000
push        0.000000
pushed      0.000000
pushing     0.000000
put         0.000000
