In [1]:
# the idea of this very notebook is to get tags using tf-idf of the terms 
# we want to count a tf-idf value for each word in each document;
# top-5 words with the highest value will become tags (since they are the most valuable for the document)

import numpy as np
import pandas as pd
import json
import nltk
import string
import utils
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from collections import defaultdict

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

In [2]:
# the following downloads may be needed:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

nltk_stop_words = nltk.corpus.stopwords.words('english')
stop_words = list(set().union(nltk_stop_words, sklearn_stop_words))

In [3]:
with open('./data/videos_clean_tags.json') as json_file:
    data = json.load(json_file)

In [4]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

In [5]:
punctuation = ['``', '--', '""', "''", "?!", "...", "–"]
for p in string.punctuation:
    punctuation.append(p)

In [6]:
# here we tokenize our corpus, remove stopwords, lower the first letter of each sentence
# stemming is not used since we want our tags to be actual words, not stems
# lemmatization is used so as not to get many similar tags that have the same semantics but different forms
# n-grams are also not used here 

def text_tokenize(text, to_lemmatize=False):
    tokenized = []
    tokenized_text = nltk.tokenize.sent_tokenize(text)
    ignore = list(set().union(stop_words, punctuation))
    
    for s in tokenized_text:
        s = s[0].lower() + s[1:]
        tokenized_sent = tokenizer.tokenize(s)
        tokenized_sent = [w for w in tokenized_sent if w.lower() not in ignore]
        tokenized.append(tokenized_sent)
        
    tokenized = [item for sublist in tokenized for item in sublist]
    
    if to_lemmatize:
        tag_map = defaultdict(lambda : wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV

        lemmatized = []
        for token, tag in pos_tag(tokenized):
            lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
            lemmatized.append(lemma)
        tokenized = lemmatized
        
    return tokenized

In [7]:
for i in range (0, len(data)):
    data[i]['captions_tokenized'] = text_tokenize(data[i]['captions'], to_lemmatize=True)

In [8]:
# this is not the most effective implementation, but it's pretty clear in its steps

full_lexicon = {}

for document in data:
    doc_lexicon = {}
    
    for term in document['captions_tokenized']:
        if doc_lexicon.get(term) is None:
            doc_lexicon[term] = 1            
        else:
            doc_lexicon[term] = doc_lexicon[term] + 1
            
    norm_term_frequencies = {}
    doc_length = len(document)
    
    for term in doc_lexicon.keys():
        norm_term_frequencies[term] = doc_lexicon[term]/doc_length
        
        if full_lexicon.get(term) is None:
            full_lexicon[term] = 1            
        else:
            full_lexicon[term] = full_lexicon[term] + 1
        
    document['norm_term_frequencies'] = norm_term_frequencies       

In [9]:
# let's see what we can achieve with normalized term frequencies only;
# actually, the results really help us understand the topic of the text even without the idf part

sorted_dict = sorted(data[1]['norm_term_frequencies'].items(), key=lambda x: x[1])
sorted_dict.reverse()
sorted_dict[:10]

[("'s", 0.625),
 ('trauma', 0.4375),
 ('story', 0.3125),
 ('government', 0.25),
 ('happen', 0.25),
 ('Adayanci', 0.25),
 ('girl', 0.25),
 ('child', 0.25),
 ('family', 0.1875),
 ('violence', 0.1875)]

In [10]:
num_of_docs = len(data)

for document in data:
    tf_idf = {}
    for term in document['norm_term_frequencies'].keys():
        idf = np.log(num_of_docs/full_lexicon[term])
        tf_idf[term] = document['norm_term_frequencies'][term] * idf
        
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda x: x[1])
    tf_idf_sorted.reverse()
    document['tf_idf_predicted_tags'] = tf_idf_sorted[:5]
    document.pop('norm_term_frequencies')
    document.pop('captions_tokenized')

In [11]:
# however, now it's even better; though the actual tags are usually different and more abstract

n = 427
data[n]['tf_idf_predicted_tags'], data[n]['tags']

([('supernova', 6.016755770621052),
  ('atom', 3.8321503245419897),
  ('star', 3.506738791982303),
  ('oxygen', 3.2485690110858716),
  ('explosion', 3.2465079603977616)],
 [{'tag': 'TEDTalk', 'type': 'garbage'},
  {'tag': 'TEDTalks', 'type': 'garbage'},
  {'tag': 'Astronomy', 'type': 'normal'},
  {'tag': 'Science', 'type': 'normal'},
  {'tag': 'Universe', 'type': 'normal'},
  {'tag': 'Human Origins', 'type': 'normal'},
  {'tag': 'Human Body', 'type': 'normal'},
  {'tag': 'Cosmos', 'type': 'normal'},
  {'tag': 'Humanity', 'type': 'normal'},
  {'tag': 'Visualizations', 'type': 'normal'},
  {'tag': 'Space', 'type': 'normal'},
  {'tag': 'Solar System', 'type': 'normal'}])

In [12]:
with open('./data/tfidf_predicted_tags.json', 'w') as file:
    json.dump(data, file)

utils.upload_to_googledrive('tfidf_predicted_tags.json')