In [1]:
import ast
import json

import numpy as np
import pandas as pd
import textacy
from tqdm import tqdm

%config InlineBackend.figure_format = 'retina'

In [2]:
def combine_list_of_dicts(arr):
    """Index a sequence of records by their ``"id"`` value.

    Args:
        arr: iterable of mappings, each of which has an ``"id"`` key.

    Returns:
        dict mapping each distinct ``"id"`` to the first record that carried
        it; later records with an already-seen id are ignored. Keys appear in
        order of first occurrence.
    """
    first_by_id = {}
    for record in arr:
        # setdefault only stores the record when the id is not present yet
        first_by_id.setdefault(record["id"], record)
    return first_by_id


def getTEDRating(string):
    """Return the ``id`` of the rating record with the highest ``count``.

    Args:
        string: text of one ``ratings`` cell, i.e. a list of rating records.
            Each record must have ``"id"`` and ``"count"`` keys. The list may
            be written as a Python literal (single-quoted strings) or as JSON.

    Returns:
        The ``"id"`` value of the record with the largest ``"count"``. Records
        are first de-duplicated by id (first occurrence wins); on a tie in
        count, the record that appears first is chosen.

    Raises:
        IndexError: if the parsed list contains no records.
    """
    try:
        # Parse the Python-literal form directly. Blindly swapping every ' for "
        # breaks any record whose text contains an apostrophe.
        arr = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. JSON true/false/null): use the original
        # quote-swapping JSON parse so previously accepted input still works.
        arr = json.loads(string.replace("'", '"'))
    d = combine_list_of_dicts(arr)
    if not d:
        raise IndexError("ratings list is empty; there is no top rating to return")
    # max() returns the first maximal record, which is the same record a stable
    # descending sort would place first.
    return max(d.values(), key=lambda rating: rating["count"])["id"]

### Combine Datasets

In [3]:
# Read the transcripts table and the main TED table, then join them on the
# shared "url" column (pandas' default inner join).
df1 = pd.read_csv("data/transcripts.csv")
df2 = pd.read_csv("data/ted_main.csv")
merged = pd.merge(df1, df2, on="url")

In [4]:
# Derive one label per row from its "ratings" cell, attach it as a new
# "label" column, and write the labelled table to disk.
labels = list(map(getTEDRating, merged['ratings'].tolist()))
merged['label'] = labels
merged.to_csv("data/Merged_dataset.csv", index=False)

### Textacy analysis

In [5]:
# Parse every transcript into a textacy Doc. This is the slow step of the
# notebook, so tqdm is used to report progress.
docs = merged['transcript'].tolist()
corpus = [textacy.Doc(content=doc, lang='en') for doc in tqdm(docs)]

100%|██████████| 2467/2467 [34:34<00:00,  1.19it/s]


In [6]:
# Wrap the already-parsed Doc objects in a single textacy Corpus.
new_corpus = textacy.corpus.Corpus(lang='en', docs=corpus)

In [7]:
# Corpus size as (documents, sentences, tokens).
(new_corpus.n_docs, new_corpus.n_sents, new_corpus.n_tokens)

(2467, 343476, 5999227)

In [8]:
# TF-IDF weighting (smoothed idf, L2-normalised rows) with document-frequency
# bounds min_df=3 and max_df=0.95.
vectorizer = textacy.Vectorizer(
    apply_idf=True,
    idf_type='smooth',
    norm='l2',
    min_df=3,
    max_df=0.95,
)

# Lazily produce one term list per document (unigrams plus named entities, as
# strings). This is a generator, so it can be consumed only once.
tokenized_docs = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in new_corpus
)

doc_term_matrix = vectorizer.fit_transform(tokenized_docs)

In [9]:
# Fit a 10-topic NMF model on the document-term matrix and project every
# document onto the learned topics.
model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
# A bare expression is only echoed when it is the last statement of a cell,
# so print the shape explicitly to make it show up in the output.
print('doc_topic_matrix shape:', doc_topic_matrix.shape)

# Show the top 10 terms of each topic.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print('topic', topic_idx, ':', '   '.join(top_terms))

topic 0 : so   people   go   think   know   thing   say   have   but   want
topic 1 : ♫   ♫ ♫   song   sing   da   music   video   heh   oh   la
topic 2 : cell   dna   stem   gene   genome   tissue   drug   organ   disease   virus
topic 3 : country   africa   government   world   china   percent   people   economy   economic   global
topic 4 : cancer   patient   tumor   drug   doctor   disease   health   treatment   breast   blood
topic 5 :    woman   ebola   autistic   drug   satellite   antibiotic   people   brazil   pete
topic 6 : planet   earth   water   ocean   universe   mars   star   galaxy   energy   sea
topic 7 : brain   neuron   so   memory   cortex   disorder   control   body   region   signal
topic 8 : city   building   design   space   architecture   build   car   neighborhood   street   urban
topic 9 : robot   robotic   so   machine   ai   build   leg   human   locomotion   intelligence
