In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib as plt
from scipy import stats

In [2]:
#create a workable dataframe by combining the main and transcript csvs. 
#get rid of the url column while we're at it. Shouldn't need that. 
df1 = pd.read_csv('ted_main.csv')
df2 = pd.read_csv('transcripts.csv')
df = pd.concat([df1, df2], axis =1)
df.drop(columns = ['url'], inplace = True)

In [37]:
views = df['views']

In [4]:
views.describe() ##we could define popular as falling above the 50th percentile

count    2.550000e+03
mean     1.698297e+06
std      2.498479e+06
min      5.044300e+04
25%      7.557928e+05
50%      1.124524e+06
75%      1.700760e+06
max      4.722711e+07
Name: views, dtype: float64

In [5]:
ratings = df['ratings']

In [6]:
ratings[1] #These would probably be useful for defining sentiment, if we wanted to go down that road

"[{'id': 7, 'name': 'Funny', 'count': 544}, {'id': 3, 'name': 'Courageous', 'count': 139}, {'id': 2, 'name': 'Confusing', 'count': 62}, {'id': 1, 'name': 'Beautiful', 'count': 58}, {'id': 21, 'name': 'Unconvincing', 'count': 258}, {'id': 11, 'name': 'Longwinded', 'count': 113}, {'id': 8, 'name': 'Informative', 'count': 443}, {'id': 10, 'name': 'Inspiring', 'count': 413}, {'id': 22, 'name': 'Fascinating', 'count': 132}, {'id': 9, 'name': 'Ingenious', 'count': 56}, {'id': 24, 'name': 'Persuasive', 'count': 268}, {'id': 23, 'name': 'Jaw-dropping', 'count': 116}, {'id': 26, 'name': 'Obnoxious', 'count': 131}, {'id': 25, 'name': 'OK', 'count': 203}]"

In [7]:
popular = df[df['views'] > 1124524]

In [8]:
unpopular = df[df['views'] < 1124524]

In [9]:
popular.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,47227110,Good morning. How are you?(Laughter)It's been ...
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,3200520,"Thank you so much, Chris. And it's truly a gre..."
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,1697550,If you're here today — and I'm very happy that...
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,12005869,"About 10 years ago, I took on the task to teac..."


In [130]:
pop_names = popular['title'].to_list()
unpop_names = unpopular['title'].to_list()

Do sentiment analysis of tags, transcripts, and titles

In [10]:
# Sentiment: the good, the bad, the very bad
#This is mainly just to illustrate the limits of this sentiment analyzer
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
climate = sia.polarity_scores('Averting the climate crisis') #ok
stats = sia.polarity_scores("The best stats you've ever seen") #good job
oh_no = sia.polarity_scores("My adorable puppy is on fire") #very bad

In [131]:
##First tries: categorizing popularity based on sentiment of TED title##
labeled_names = ([(name, 'popular') for name in pop_names]+
                [(name, 'unpopular') for name in unpop_names])

In [132]:
import random
random.shuffle(labeled_names)

In [133]:
def sentim_features(names):
    return {'polarity' : sia.polarity_scores(names)['compound']}

In [134]:
featuresets = [(sentim_features(n), pop) for (n, pop) in labeled_names]
featuresets[:2]

[({'polarity': 0.1779}, 'unpopular'), ({'polarity': 0.5267}, 'popular')]

In [135]:
train_set, test_set = featuresets[2040:], featuresets[:2040]

In [136]:
classifier = nltk.NaiveBayesClassifier.train(train_set) #wild student uses train!

In [137]:
print(nltk.classify.accuracy(classifier, test_set)) #it's not very effective!

0.5294117647058824


Part of the problem here might be that a substantial number of the titles evaluate to neutral in terms of 
sentiment polarity

Let's try again with transcripts! Note: A couple of things. For one thing, when we do this for real, this is
going to require a touch more finesse. There's notation for music, applause and laughter that will need to be removed from the transcripts (although I'm going to argue that we leave the laughter tags in, as those might be important for determining sentiment). Also, Vader is not wonderful at dealing with longer sentences. It was made for shorter things, like tweets and such. This is just a preliminary sort of experiment. 

In [155]:
pop_transcripts = popular['transcript'].to_list()
unpop_transcripts = unpopular['transcript'].to_list()
labeled_transcripts = ([(transcript, 'popular') for transcript in pop_transcripts]+
                      [(transcript, 'unpopular') for transcript in unpop_transcripts])
random.shuffle(labeled_transcripts)

In [161]:
#This cell will take a while. Depending on your hardware, I guess. It took my lil macbook a couple of minutes. 
#Patience, grasshopper
def sentim_features_t(transcript):
    return{'polarity' : sia.polarity_scores(str(transcript))['compound']}
featuresets = [(sentim_features_t(n), pop) for (n, pop) in labeled_transcripts]

In [162]:
train_set, test_set = featuresets[2040:], featuresets[:2040]

In [163]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [164]:
print(nltk.classify.accuracy(classifier, test_set)) #Spoiler Alert: This works less well. 
                                                    #Again, that might be because Vader works less well 
                                                    #with longer sentences. It might be because this isn't a good
                                                    #predictor for popularity. Further science must be done. 

0.49950980392156863


In [165]:
##Last but not least: tags!
pop_tags = popular['tags'].to_list()
unpop_tags = unpopular['tags'].to_list()

In [166]:
labeled_tags = ([(tags, 'popular') for tags in pop_tags]+
                [(tags, 'unpopular') for tags in unpop_tags])

In [167]:
import random
random.shuffle(labeled_tags)

In [168]:
def sentim_features(tags):
    return {'polarity' : sia.polarity_scores(tags)['compound']}

In [180]:
featuresets2 = [(sentim_features(n), pop) for (n, pop) in labeled_tags]
featuresets[:2]

[({'polarity': 0.9908}, 'unpopular'), ({'polarity': -0.9994}, 'popular')]

In [181]:
train_set, test_set = featuresets2[2040:], featuresets2[:2040]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [182]:
print(nltk.classify.accuracy(classifier, test_set)) #Works about as well as the transcripts. Still, though, we're
                                                    #doing about as well as a coin toss. 

0.4980392156862745
