#### Brigham Young University
    Bailey Smith
    Juan S. Rodriguez

In [23]:
from matplotlib import pyplot as plt
from datetime import datetime
import pandas as pd
import numpy as np
import collections
import re
import glob
import time
import codecs
import pickle
import wikipedia
import statsmodels.api as sm
import statsmodels.formula.api as smf

#import visualizations
%matplotlib inline

from tqdm import tqdm
from scipy.spatial import KDTree
from collections import Counter
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from xgboost import XGBClassifier, XGBRegressor

In [2]:
# Matplotlib customizations.
# Newer matplotlib releases renamed the bundled seaborn style to "seaborn-v0_8";
# prefer that name when it is available and fall back to the legacy one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
plt.rcParams["figure.figsize"] = [5, 3]  # Set the figure size.
plt.rcParams["figure.dpi"] = 200  # Raise figure quality.

# Pandas customizations
pd.set_option("display.max_rows", 5)  # Number of rows displayed.
pd.set_option("display.max_columns", 5)  # Number of columns displayed.
# Use the fully-qualified option name: the short key "precision" can match
# more than one option on newer pandas versions and is then rejected.
pd.set_option("display.precision", 3)  # Display floats with 3 decimal places.

# Introduction
## Abstract
TED talks have become a standard for how to deliver informational talks. All of these talks are presented in a digital format online for public viewing. We use the transcripts and website data for all TED talks given up to September 2017. Each video on the website is provided with suggested similar videos and informative topic tags. We present an improvement to the suggested videos, and we consolidate a base of more useful tags, by using bisecting k-means and LDA-GS respectively.

## TED Talks
Since 1984, TED has become an iconic conference in which experts of the world in different fields present their ideas and analysis in the fields of technology, entertainment and design (T.E.D.). Since 2006, the conference platform decided to make every talk public and free by publishing them on their website. Given its history and prestige, TED talks have become a standard for quality when it comes to delivering an informational talk to an audience.

## Motivation
Our purpose is to analyze every talk that has been published online up to September 2017 in order to suggest improved methods to classify them. We will create a method that allows automatic classification/labelling of the TED talks just by reading in the transcript. 

An automatic classification method would allow better suggestions to the online audience, would remove the need for **biased human classification**, and would generalize to other fields and problems. This sort of classification method would allow a computer to sort through thousands of legal documents, health documents, speech transcripts, or even books, and allow scholars to approach them seamlessly.

## Literature Review

We based the clustering techniques on some papers:

A Comparison of Document Clustering Techniques - Michael Steinbach

http://mlwiki.org/index.php/Latent_Semantic_Analysis#Problems_with_Text

http://mlwiki.org/index.php/Document_Clustering#Semi-Supervised_Clustering

## Approach (AKA: to-do):
For tagging, we use the LDA algorithm with Gibbs sampling and provide 21 different labels that the videos can have. We generate a clustering based on bisecting K-means in order to determine the main clusters of videos based on their transcripts. We attempt several different algorithms, or an ensemble of them, to get the best possible clustering. We predict the audience interested in a talk based on the cluster they have watched.

We suggest a new network of related videos by computing the highest similarity (PCA and TFIDF) and clustering the given video with those in that cluster or in any cluster.

In [60]:
# Upload cluster words
# Each CSV has no header row; every row is read as one set of cluster words.
# DataFrame.as_matrix() has been removed from pandas; `.values` returns the
# same NumPy array and also works on the older releases.
A21 = pd.read_csv('WordSets/df_NLP_21a.csv',header=None).values
B21 = pd.read_csv('WordSets/df_NLP_21b.csv',header=None).values
C21 = pd.read_csv('WordSets/df_NLP_21c.csv',header=None).values

In [4]:
# Print every row of A21 on its own line.
print(*A21,sep='\n')

['kids' 'school' 'children' 'students' 'education' 'teachers' 'learning'
 'schools' 'child' 'teacher']
[nan 'people' 'time' 'life' 'years' 'day' 'back' 'world' 'thing' 'make']
['music' 'play' 'sound' 'game' 'games' 'song' 'video' 'playing' 'hear'
 'sounds']
['water' 'energy' 'earth' 'planet' 'climate' 'carbon' 'years' 'oil' 'air'
 'mars']
['data' 'internet' 'information' 'online' 'media' 'phone' 'digital'
 'people' 'google' 'web']
['virus' 'hiv' 'disease' 'flu' 'malaria' 'vaccine' 'polio' 'infected'
 'epidemic' 'vaccines']
['people' 'human' 'social' 'life' 'love' 'good' 'happiness' 'compassion'
 'feel' 'god']
['language' 'books' 'laughter' 'english' 'book' 'words' 'word' 'read'
 'stories' 'film']
['ocean' 'fish' 'sea' 'animals' 'water' 'species' 'boat' 'coral' 'oceans'
 'ice']
['universe' 'light' 'space' 'stars' 'theory' 'physics' 'earth' 'planets'
 'particles' 'black']
['city' 'cities' 'car' 'cars' 'urban' 'street' 'york' 'public' 'map'
 'streets']
['bees' 'poem' 'poetry' 'bee' 'soap'

In [5]:
# Fetch the Wikipedia summary for a sample term (shown as the cell output).
wikipedia.summary('farmers')

'A farmer (also called an agriculturer) is a person engaged in agriculture, raising living organisms for food or raw materials. The term usually applies to people who do some combination of raising field crops, orchards, vineyards, poultry, or other livestock. A farmer might own the farmed land or might work as a laborer on land owned by others, but in advanced economies, a farmer is usually a farm owner, while employees of the farm are known as farm workers, or farmhands. However, in the not so distant past, a farmer was a person who promotes or improves the growth of (a plant, crop, etc.) by labor and attention, land or crops or raises animals (as livestock or fish).'

In [6]:
# Load the two CSV files (paths are relative to the working directory):
# the main talk dataset and the transcripts dataset.
mainData = pd.read_csv('ted_main.csv')
transcripts = pd.read_csv('transcripts.csv')

# TED Talks Data
## Data Description
### Main Dataset
The main dataset contains descriptive information for 2550 TED Talks.

    title:            (str)     The title of the talk
    description:      (str)     A blurb of what the talk is about
    main_speaker:     (str)     The first named speaker of the talk
    speaker_occ:      (str)     The occupation of the main speaker
    num_speaker:      (int)     The number of speakers in the talk
    duration:         (int)     The duration of the talk in seconds
    film_date:        (int)     The date of filming Unix timestamp
    published_date:   (int)     The online publication Unix timestamp
    comments:         (int)     The number of comments made on the talk
    languages:        (int)     Number of languages available for talk
    ratings:          (dict)    The various ratings given to the talk
    url:              (url)     The URL of the talk
    views:            (int)     The number of views on the talk
    related_talks:    (dict)    List of dict of 6 related talks
    tags:             (list)    The themes associated with the talk
    
We will describe this data more thoroughly as we develop the visualizations, but we would like to highlight some information that helps us understand this data, and with that, some interesting facts worth mentioning. The most common speaker occupation is "Writer" with 45 occurrences, followed by "Designer" with a total of 34 occurrences. The total number of occupations among speakers is 1458. The average talk is 13.7 minutes long, with the shortest being 2.25 minutes and the longest being about 1.5 hours, which was given by the author of "The Hitchhiker's Guide to the Galaxy". The average TED talk has been translated to 27 languages, and there are 86 talks that have no assigned language. These talks have a mean number of views significantly lower than average, and they are mainly musical presentations. The average number of views is 1.6 million and the talk with highest number of views has been seen almost 50 million times and it's called "Do schools kill creativity?".

An important variable we must understand is the one named "ratings". This variable is a categorical description of the talk. It is opinion-based and given by the online audience: after watching the video, the viewer is asked: *"How would you describe this talk? Tell us by choosing up to three words. (If you choose just one, it will count three times.)".*  Afterwards, the viewer is given 14 possible adjectives to describe the talk with, and can only choose 3 of them. The same 14 possibilities are given to all viewers.

With regard to the video tags, TED gives each video a number of possible tags to link a talk with different topics. The average video has 7.56 tags, with some videos having over 30 tags and some having just one. The other important variable that will be useful for us to observe is the related talks. Every video is given a connection to 6 other videos that are suggested for the viewer to watch. We propose to improve both of these: providing specific tags that might be more useful for the viewer, and testing whether there is any link between the related talks and the number of views the talks have.

### Transcript Dataset
The transcript dataset contains the transcripts for 2467 TED talks. In this database we found three duplicates. We decided to analyze only the talks that are found on both databases in order to have homogeneous data. The 86 talks for which there is no transcript data are the most recent ones. Therefore, we discarded the data that was duplicated and for which there were no transcripts.

##### Source:
The data has been scraped from the official TED Website and is available under
the Creative Commons License. It was retrieved from the Kaggle featured data sets in October 2017.

## Data Preparation
### Data Munging

The two datasets were merged in order to consolidate all of the information. A RegEx was implemented in order to formally count the number of words in each talk, given that the information contained was in a string with str symbols (i.e. contained symbols representing newlines, tabs, etc.). The duration of the videos was changed to minutes to enable a words per minute (WPM) analysis of the talks. This allows us to analyze the pace of each talk.

Most of the data originally had a list or dictionary structure. When the data was scraped, these were all saved as strings. A function 'g' was created in order to convert the strings back to their original data structures (dictionaries and lists). In the case of the urls, the last part of the string was erased in order to have functional links. All dates contained in the datasets were changed from Unix timestamps to "datetime" objects.

In [7]:
def _index_by_clean_url(frame):
    """Index ``frame`` by its cleaned URL and drop rows with a repeated URL."""
    # Drop the last character of each URL so the links are usable.
    frame['url'] = frame['url'].apply(lambda link: link[:-1])
    frame.index = frame['url']
    frame.drop_duplicates('url', inplace=True)
    # The URL now lives in the index, so the column itself is redundant.
    return frame.drop('url', axis=1)


mainData = _index_by_clean_url(mainData)
transcripts = _index_by_clean_url(transcripts)

# Keep only the talks present in both datasets (inner join on the URL index).
mainData = pd.concat([mainData, transcripts], axis=1, join='inner')

# Format the date variable
def change_t(timestamp):
    """Convert a Unix timestamp (seconds) into a naive UTC ``datetime``.

    ``datetime.utcfromtimestamp`` is deprecated in recent Python versions, so
    the conversion goes through the timezone-aware API. The tzinfo is then
    stripped so the result is the same naive UTC value as before.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    aware = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return aware.replace(tzinfo=None)
# Replace the Unix timestamps in both date columns with datetime objects.
for date_column in ("film_date", "published_date"):
    mainData[date_column] = mainData[date_column].apply(change_t)

# Convert the data from strings into their original data structure
import ast  # imported here because it is only needed for this conversion


def g(x):
    """Parse the string form of a Python literal (list, dict, ...) back into an object.

    ``ast.literal_eval`` only accepts literal syntax, so unlike ``eval`` it
    cannot execute arbitrary expressions found in the CSV text. It raises
    ``ValueError`` or ``SyntaxError`` for anything that is not a literal.
    """
    return ast.literal_eval(x)
# These columns hold stringified lists/dicts; parse each one back with `g`.
for literal_column in ('ratings', 'tags', 'related_talks'):
    mainData[literal_column] = mainData[literal_column].apply(g)

### Data Engineering

One of the questions that we desire to answer is how the audience receives a talk. For this reason we have classified the possible categorical ratings that the audience can give to a talk as positive or negative ratings. 

- Positives = ['Funny', 'Beautiful', 'Ingenious', 'Courageous', 'Informative', 'Fascinating', 'Persuasive', 'Jaw-dropping', 'Inspiring']

- Negatives = ['Obnoxious', 'OK', 'Confusing', 'Unconvincing', 'Longwinded']

In order to compare the ratings for different talks we added a variable that gives the ratio of total number of negative ratings over the total number of ratings given for each individual talk. From here we derived the positive ratio.

We also created a column for the total number of ratings, since this can give some insight into how impactful the talk was, either positively or negatively, given that comments tend to be made when talks are skewed toward the positive or negative side of the spectrum.

A variable called WPM was created that corresponds to the number of words per minute (i.e. pace of the speech). Another variable that was created was "original ted" which marks the talks that are given in an original TED conference.

In [8]:
# Word count: every run of word characters in the transcript is one word.
def f(x):
    """Return the number of ``\\w+`` matches in the string ``x``."""
    return len(re.findall(r'\w+', x))


mainData['word_count'] = mainData['transcript'].apply(f)

# Talk length in minutes (duration is stored in seconds), placed as column 3.
Duration_Minutes = mainData['duration'] / 60
mainData.insert(3, 'Duration_Minutes', Duration_Minutes)

# Speaking pace: words per minute.
mainData['wpm'] = mainData['word_count'].div(mainData['Duration_Minutes'])

# Categorize the ratings with a binary variable
def neg_ratings(list_dict):
    """Return the share of rating votes that fall in a negative category.

    Parameters
    ----------
    list_dict : iterable of dict
        One dict per rating category, each with a 'name' and a 'count' key.

    Returns
    -------
    float
        negative votes / all votes. NaN when there are no votes at all
        (the original code raised ZeroDivisionError in that case).
    """
    negative = {'Obnoxious', 'OK', 'Confusing', 'Unconvincing', 'Longwinded'}
    neg_count = 0
    total = 0
    for rating in list_dict:
        total += rating['count']
        if rating['name'] in negative:
            neg_count += rating['count']
    if total == 0:
        # The ratio is undefined without any votes.
        return float('nan')
    return neg_count / total
def totalcount(list_dict):
    """Return the sum of the 'count' values over all rating dicts."""
    return sum(rating['count'] for rating in list_dict)
# Total number of rating votes per talk.
mainData['Tots_Ratings'] = mainData['ratings'].apply(totalcount)
# Share of votes in the negative categories.
mainData['Neg_Ratio'] = mainData['ratings'].apply(neg_ratings)
# Every category that is not negative counts as positive, so this is the complement.
mainData['Pos_Ratio'] = 1 - mainData['Neg_Ratio']
# Binary variable to determine if it is original TED or other (e.g. TEDx)
def original_ted(x):
    """Return 1 when the event name is 'TED' followed by exactly four digits, else 0."""
    pattern = '^TED[0-9]{4}$'
    return 1 if re.search(pattern, x) else 0
# Flag each talk by applying original_ted to its event name.
mainData['Real Ted'] = mainData["event"].apply(original_ted)

### (Appendix A) Proof that categorizing and regressing are unprofitable

In [9]:
# Bin the view counts into four quartile classes labelled "0".."3".
describe_view = mainData["views"].describe()
# The outer edges are widened by 1 so the minimum and maximum fall inside a bin.
quartile_edges = [
    describe_view["min"] - 1,
    describe_view["25%"],
    describe_view["50%"],
    describe_view["75%"],
    describe_view["max"] + 1,
]
views = pd.cut(mainData["views"], quartile_edges, labels=["0", "1", "2", "3"])
mainData["cut_views"] = views

# Feature matrix and the two targets (view class and raw view count).
feature_columns = ['Duration_Minutes', 'num_speaker', 'word_count',
                   'wpm', 'Neg_Ratio', 'Pos_Ratio']
Independent = mainData[feature_columns]
Target1 = mainData.cut_views
Target2 = mainData.views

# Standardize the features; keep a separate copy for the second model.
Independent = StandardScaler().fit_transform(Independent)
Independent2 = np.copy(Independent)

In [10]:
# Regressor for the raw view counts (Target2), tuned by grid search on
# negative mean absolute error.
TED_Regress2 = XGBRegressor(
    silent=False,
    n_jobs=-1,
    reg_lambda=10,
    gamma=0,
    learning_rate=.1,
)
# new_score = -np.mean(cross_val_score(TED_Regress,Independent, Target, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error'))
parameters = {
    'learning_rate': [.1, .2, .4, .5, .6, .7],
    'gamma': [0, 5, 100],
    'reg_lambda': [100, 50, 10, 1, .5],
}
GCV2 = GridSearchCV(
    TED_Regress2,
    parameters,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
GCV2.fit(Independent, Target2)

GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
       silent=False, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.2, 0.4, 0.5, 0.6, 0.7], 'gamma': [0, 5, 100], 'reg_lambda': [100, 50, 10, 1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [None]:
# Classifier for the view classes (Target1), tuned by grid search with the
# estimator's default scoring. Despite its name, TED_Regress1 is a classifier.
TED_Regress1 = XGBClassifier(
    silent=False,
    n_jobs=-1,
    reg_lambda=10,
    gamma=0,
    learning_rate=.1,
)
# new_score = -np.mean(cross_val_score(TED_Regress,Independent, Target, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error'))
parameters = {
    'learning_rate': [.1, .2, .4, .5, .6, .7],
    'gamma': [0, 5, 100],
    'reg_lambda': [100, 50, 10, 1, .5],
}
GCV1 = GridSearchCV(
    TED_Regress1,
    parameters,
    scoring=None,
    n_jobs=-1,
)
GCV1.fit(Independent2, Target1)

In [None]:
# Report the outcome of the classifier grid search (GCV1).
print(GCV1.best_estimator_)
print('Best Index         ',GCV1.best_index_)
print('Best Parameters    ',GCV1.best_params_)
print('Accuracy           ',GCV1.best_score_)

In [None]:
# Report the outcome of the regressor grid search (GCV2).
print(GCV2.best_estimator_)
print('Best Index         ',GCV2.best_index_)
print('Best Parameters    ',GCV2.best_params_)
# The search scored with 'neg_mean_absolute_error', so the sign is flipped back here.
print('Best Mean Abs Score',-GCV2.best_score_)

In [41]:
def wikipedia_searcher(word, stop_words, num_searched=21):
    """Return the most frequent non-stop-words of a term's Wikipedia summary.

    The term is looked up with ``wikipedia.summary``. If Wikipedia reports a
    disambiguation page, the listed options are tried in order and the first
    one whose summary can be fetched is used. The term itself does not say
    which meaning is intended, so this is a best-effort choice.

    The summary is stripped of punctuation, lower-cased, split on whitespace
    and filtered against ``stop_words``. The name of the page actually used
    is printed.

    Parameters
    ----------
    word : str
        Anything that can be searched on Wikipedia. Null values are accepted.
    stop_words : iterable of str
        Words to leave out of the result.
    num_searched : int
        Maximum number of words to return.

    Returns
    -------
    list of str
        Up to ``num_searched`` words, most frequent first. Empty when
        ``word`` is null or when no disambiguation option could be resolved.

    Raises
    ------
    Any error other than ``DisambiguationError`` raised by the first lookup
    (for example a missing page) is propagated unchanged.
    """
    if pd.isnull(word):
        return []

    summary = None
    definite_word = word
    try:
        summary = wikipedia.summary(word)
    except wikipedia.exceptions.DisambiguationError as e:
        for desperate_word in e.options:
            try:
                summary = wikipedia.summary(desperate_word)
            except Exception:
                # Best effort: skip this option, but let KeyboardInterrupt
                # and SystemExit through (a bare `except` would eat them).
                continue
            definite_word = desperate_word
            break

    if summary is None:
        # No option worked. The original code failed here with an
        # UnboundLocalError on `definite_word`.
        return []

    print(definite_word)
    excluded = set(stop_words)  # O(1) membership tests
    # split() without an argument drops the empty strings that
    # split(' ') produced for consecutive spaces.
    tokens = re.sub(r'[^\w\s]', '', summary).lower().split()
    counts = Counter(token for token in tokens if token not in excluded)
    return [token for token, _ in counts.most_common(num_searched)]


In [51]:
# Read one stop word per line. The `with` block closes the file, and only the
# line terminator is stripped, so a last line without a trailing newline
# keeps its final character.
with open('stop_words.txt') as stop_file:
    stopwords = [line.rstrip('\r\n') for line in stop_file]

In [None]:
# For every word set in A21, count the words returned by wikipedia_searcher
# across all words in the set. The result is one Counter per set.
wiki_dicts = []
for j in A21:
    all_words = []
    for i in j:
        print(i)
        all_words += wikipedia_searcher(i,stopwords)
    # Append to wiki_dicts; the original appended to `all_dicts`, which this
    # cell never defines, leaving wiki_dicts empty.
    wiki_dicts.append(Counter(all_words))
    print('\n\n')

In [97]:
# Save the per-set Counters to disk, then load them straight back.
with open('Wikipedia_dict.pickle', 'wb') as f:
    pickle.dump(wiki_dicts, f)  

# NOTE: pickle.load runs code embedded in the file; only load pickles you wrote yourself.
with open('Wikipedia_dict.pickle', 'rb') as f:
    wiki_dicts = pickle.load(f)

In [None]:
# Show each A21 word set next to the five most common words collected for it.
# Uses wiki_dicts (the list this notebook builds and pickles) instead of the
# undefined `all_dicts`. most_common(5) also copes with Counters that hold
# fewer than five words.
for cluster_words, counts in zip(A21, wiki_dicts):
    print(cluster_words)
    print(*[top_word for top_word, _ in counts.most_common(5)],'\n')

In [None]:
# Print every row of A21 on its own line.
print(*A21,sep='\n')

In [10]:
import os
# Show the current working directory; the data files are opened with relative paths.
os.getcwd()

'/acmeshare/juan9310/TedProject'

In [12]:
# Load the GloVe vectors into a dict mapping word -> float32 vector.
datadict = {}
# Sort so the chosen file does not depend on directory listing order.
FILES = sorted(glob.glob('glove*.txt'))
print(FILES)
if not FILES:
    raise FileNotFoundError("no file matching 'glove*.txt' in the working directory")
file = FILES[0]
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(file, encoding='utf-8') as f:
    for l in tqdm(f):
        # Each line is: word, then the vector components, separated by single spaces.
        line = l.rstrip('\n').split(' ')
        word, vec = line[0], np.array(line[1:]).astype(np.float32)
        datadict[word] = vec

325it [00:00, 3248.20it/s]

['glove.txt']


400000it [00:54, 7308.94it/s]


In [13]:
# Vocabulary and vectors as parallel arrays: row k of glove_A belongs to glove_words[k].
glove_words = np.array(list(datadict.keys()))
glove_A = np.array(list(datadict.values()))

#By experimentation, keeping 27 dimensions keeps 70% of the variance, which should be ok.
# NOTE(review): the comment above says 27 dimensions but n_components is 30 — confirm which is intended.
pca = PCA(n_components=30) 
PCA_A = pca.fit_transform(glove_A)

In [14]:
# Shape of the full GloVe matrix.
glove_A.shape

(400000, 300)

In [15]:
# Shape of the PCA-reduced matrix.
PCA_A.shape

(400000, 30)

In [39]:
# KD-tree over the GloVe vectors, used for nearest-neighbour word lookups.
# NOTE(review): the tree is built on the full glove_A, not on the reduced PCA_A — confirm this is intended.
glove_tree = KDTree(glove_A)

In [64]:
def glove_searcher(word, num_searched = 21):
    if pd.isnull(word):
        return []
    word_itself = [word]
    word_itself.extend(word.split(' '))
    word_itself.extend([i+'s' for i in word_itself]) #Add plurals too; we don't want those.
    
    word_vector = glove_A[glove_words==word][0] #accessing the 0th entry is necessary.
    dists, vec_locs = glove_tree.query(word_vector, 2*num_searched+1) #There may be up to 2 words in here from the input word.
    associated_words = glove_words[vec_locs]
    truncated = [i for i in associated_words if i not in word_itself]
    
    return np.array([i for i in truncated if i not in stopwords])[:num_searched] #Up to 5 words.

In [65]:
# Example lookup: the words glove_searcher returns for 'child'.
word = 'child'
glove_searcher(word)

array(['children', 'parents', 'mother', 'infant', 'girl', 'daughter',
       'boy', 'newborn', 'instance', 'father', 'baby', 'kids', 'victim',
       'siblings', 'birth', 'teenage', 'couple', 'young', 'daughters',
       'teenagers', 'husband'], dtype='<U10')

In [None]:
# For every word set in A21, count the neighbours glove_searcher returns
# across all words in the set. The result is one Counter per set.
glove_dicts = []
for cluster in A21:
    related = []
    for term in cluster:
        print(term)
        related.extend(glove_searcher(term))
    glove_dicts.append(Counter(related))
    print('\n\n')

In [None]:
# Display the per-set Counters.
glove_dicts

In [70]:
# Save the GloVe Counters to disk, then load them straight back.
with open('Glove_dict.pickle', 'wb') as f:
    pickle.dump(glove_dicts, f)  

# NOTE(review): the data is loaded into `Glove_dicts` (capital G) while the list saved above
# is `glove_dicts` — confirm the different name is intentional.
with open('Glove_dict.pickle', 'rb') as f:
    Glove_dicts = pickle.load(f)