In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm 

In [3]:
a = pd.read_csv("data/transcripts.csv")

In [16]:
# a["url"]

In [4]:
a["transcript"].apply(lambda x: len(x.split(" "))).describe()

count    2467.000000
mean     2041.959465
std       956.478815
min         1.000000
25%      1336.000000
50%      2029.000000
75%      2707.500000
max      9044.000000
Name: transcript, dtype: float64

1. Average number of words in ted talk: 2000 words
2. Max TED Talk has 9000 words in it

In [17]:
import textacy

### Clean up text to only include unicode, and convert to textacy document

In [25]:
ex = textacy.Doc(content=a["transcript"][0].decode('utf-8'),lang=u'en')

In [None]:
from textacy.keyterms import sgrank

### Get Topics of a Document

In [46]:
textrank(ex,n_keyterms=10)

[(u'education', 0.018482597870771732),
 (u'thing', 0.015298506952932835),
 (u'people', 0.014231480092299567),
 (u'way', 0.011517267298331973),
 (u'kid', 0.010263765982329335),
 (u'child', 0.00890355757567635),
 (u'year', 0.008815724232791696),
 (u'school', 0.008628570304400627),
 (u'human', 0.008134166700086131),
 (u'what', 0.008100449128440587)]

## Create list of textacy Documents that will be used to create BOW Matrix

In [66]:
corpus=[]
docs = a['transcript'].tolist()[:200]
for doc in tqdm(docs):
    corpus.append(textacy.Doc(content=doc.decode('utf-8'),lang=u'en'))

100%|██████████| 200/200 [02:20<00:00,  1.43it/s]


In [69]:
tokenized_docs = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True) for doc in corpus)
vectorizer = textacy.Vectorizer(apply_idf=True, norm='l2',min_df=3, max_df=0.95)
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
doc_term_matrix

<200x6241 sparse matrix of type '<type 'numpy.float64'>'
	with 90633 stored elements in Compressed Sparse Row format>

## Get Labels

In [181]:
metadata = pd.read_csv("data/ted_main.csv")

In [78]:
import json

In [100]:
# help(json.loads)

In [109]:
json.loads("{\"a\":1}")

{u'a': 1}

json uses " as a formatting character, Json escapes quotes inside strings as \" so replace ' with "

In [127]:
arr = json.loads(metadata['ratings'][0].replace("'",'\"'))

Want to be in one dictionary so I can sort and retrieve best rating

In [140]:
help(sorted)

Help on built-in function sorted in module __builtin__:

sorted(...)
    sorted(iterable, cmp=None, key=None, reverse=False) --> new sorted list



In [None]:
def combine_list_of_dicts(arr):
    dict_ratings = {}
    for a in arr:
        if a["id"] not in dict_ratings:
            dict_ratings[a["id"]]=a
            
    return dict_ratings

d = combine_list_of_dicts(arr)

In [149]:
sorted(d.items(), key= lambda x:x[1]["count"],reverse=True)

[(10, {u'count': 24924, u'id': 10, u'name': u'Inspiring'}),
 (7, {u'count': 19645, u'id': 7, u'name': u'Funny'}),
 (24, {u'count': 10704, u'id': 24, u'name': u'Persuasive'}),
 (22, {u'count': 10581, u'id': 22, u'name': u'Fascinating'}),
 (8, {u'count': 7346, u'id': 8, u'name': u'Informative'}),
 (9, {u'count': 6073, u'id': 9, u'name': u'Ingenious'}),
 (1, {u'count': 4573, u'id': 1, u'name': u'Beautiful'}),
 (23, {u'count': 4439, u'id': 23, u'name': u'Jaw-dropping'}),
 (3, {u'count': 3253, u'id': 3, u'name': u'Courageous'}),
 (25, {u'count': 1174, u'id': 25, u'name': u'OK'}),
 (11, {u'count': 387, u'id': 11, u'name': u'Longwinded'}),
 (21, {u'count': 300, u'id': 21, u'name': u'Unconvincing'}),
 (2, {u'count': 242, u'id': 2, u'name': u'Confusing'}),
 (26, {u'count': 209, u'id': 26, u'name': u'Obnoxious'})]

## sort ratings by highest count,use first one as highest rating

Note: May have bias that TED Talks are more inclined to be inspiring

In [164]:
def getTEDRating(string):
    arr = json.loads(string.replace("'",'\"'))
    d = combine_list_of_dicts(arr) 
#     print(d.items())
    return sorted(d.items(), key= lambda x:x[1]["count"],reverse=True)[0][1]['id']
getTEDRating(metadata['ratings'][0])

10

# Create array of labels, where the label is the highest rating for that TED Talk

In [182]:
labels = [getTEDRating(x) for x in metadata['ratings'].tolist()]

In [183]:
labels

[10,
 7,
 7,
 10,
 8,
 10,
 7,
 9,
 24,
 10,
 10,
 10,
 10,
 23,
 10,
 23,
 23,
 10,
 10,
 8,
 8,
 7,
 11,
 8,
 10,
 24,
 22,
 8,
 22,
 22,
 8,
 10,
 24,
 10,
 10,
 8,
 8,
 10,
 10,
 10,
 8,
 9,
 8,
 7,
 8,
 24,
 10,
 10,
 7,
 8,
 8,
 24,
 10,
 10,
 22,
 10,
 8,
 10,
 23,
 22,
 8,
 10,
 10,
 10,
 10,
 3,
 10,
 10,
 8,
 22,
 9,
 8,
 10,
 10,
 10,
 10,
 1,
 10,
 7,
 21,
 10,
 8,
 1,
 23,
 1,
 22,
 7,
 1,
 8,
 10,
 24,
 8,
 1,
 1,
 1,
 3,
 3,
 7,
 1,
 8,
 1,
 8,
 7,
 24,
 10,
 21,
 22,
 8,
 23,
 10,
 10,
 9,
 10,
 11,
 24,
 1,
 23,
 23,
 23,
 24,
 22,
 23,
 8,
 23,
 7,
 23,
 7,
 10,
 24,
 10,
 10,
 10,
 1,
 10,
 1,
 1,
 10,
 23,
 7,
 24,
 23,
 8,
 24,
 3,
 8,
 7,
 8,
 9,
 23,
 23,
 10,
 22,
 7,
 8,
 22,
 8,
 3,
 10,
 24,
 22,
 24,
 24,
 10,
 22,
 22,
 7,
 8,
 8,
 23,
 10,
 1,
 24,
 10,
 7,
 22,
 7,
 22,
 8,
 11,
 1,
 10,
 10,
 7,
 7,
 10,
 8,
 10,
 1,
 10,
 10,
 1,
 22,
 10,
 7,
 7,
 10,
 22,
 22,
 10,
 23,
 22,
 10,
 11,
 10,
 10,
 10,
 10,
 22,
 7,
 1,
 22,
 8,
 21,
 22,
 10,
 9,
 10,
 

In [184]:
a['transcript']

0       Good morning. How are you?(Laughter)It's been ...
1       Thank you so much, Chris. And it's truly a gre...
2       (Music: "The Sound of Silence," Simon & Garfun...
3       If you're here today — and I'm very happy that...
4       About 10 years ago, I took on the task to teac...
5       Thank you. I have to tell you I'm both challen...
6       On September 10, the morning of my seventh bir...
7       I'm going to present three projects in rapid f...
8       It's wonderful to be back. I love this wonderf...
9       I'm often asked, "What surprised you about the...
10      I'm going to take you on a journey very quickl...
11      I can't help but this wish: to think about whe...
12      I'm the luckiest guy in the world. I got to se...
13      I'm really excited to be here today. I'll show...
14      I've been at MIT for 44 years. I went to TED I...
15      (Music)(Music ends)(Applause)(Applause ends)Hi...
16      (Music)(Music ends)(Applause)Thank you!(Applau...
17      In ter

In [185]:
seriess = {'transcripts': pd.Series(a['transcript']),
    'labels': pd.Series(labels)}
dataset = pd.DataFrame(seriess)

In [186]:
dataset

Unnamed: 0,labels,transcripts
0,10,Good morning. How are you?(Laughter)It's been ...
1,7,"Thank you so much, Chris. And it's truly a gre..."
2,7,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,10,If you're here today — and I'm very happy that...
4,8,"About 10 years ago, I took on the task to teac..."
5,10,Thank you. I have to tell you I'm both challen...
6,7,"On September 10, the morning of my seventh bir..."
7,9,I'm going to present three projects in rapid f...
8,24,It's wonderful to be back. I love this wonderf...
9,10,"I'm often asked, ""What surprised you about the..."
