In [None]:
import numpy as np 
import pandas as pd 

%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns

import ast

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

from wordcloud import WordCloud

In [None]:
# Shell command: downloads the en_core_web_md spaCy model each time this cell is run.
!python -m spacy download en_core_web_md
import spacy
# Shared spaCy pipeline; the cells below call it to parse text and read word vectors.
nlp =  spacy.load('en_core_web_md')

In [None]:
# Talk metadata; the cells below use its title, main_speaker, tags and description columns.
df_metadata = pd.read_csv('../data/ted_main.csv')
# Transcripts are loaded here but not referenced by the other cells in this part of the notebook.
df_transcripts = pd.read_csv('../data/transcripts.csv') 

In [None]:
df_metadata.head()

In [None]:
# `duplicated('main_speaker').sum()` counts every repeat talk (a speaker with three
# talks contributes two), which is not what the message says. Count the distinct
# speakers that appear more than once instead.
n_repeat_speakers = (df_metadata['main_speaker'].value_counts() > 1).sum()

print("There are {} talks, with {} duplicate titles, and {} speakers that have more than one talk.".format(
        len(df_metadata),
        df_metadata.duplicated('title').sum(),
        n_repeat_speakers
     ))

# 1 Data Exploration

Let's focus on:

* tags.
* description
* (maybe title? It is the same as name, except that name also includes the name of the speaker)

But keep in mind that there are other very interesting columns, for example those related to popularity (ratings, comments) and the list of talks that a particular talk is related to.

## 1.1 Tags

Most talks have 4-7 tags.

In [None]:
# Every `tags` entry is parsed with ast.literal_eval before its length is taken.
df_metadata['number_tags'] = [len(ast.literal_eval(raw_tags)) for raw_tags in df_metadata.tags]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 3))
sns.countplot(data=df_metadata, x='number_tags', ax=ax)

What are the most popular tags?

In [None]:
def flatten_list_of_list_of_words(l):
    """Flatten stringified lists of words into a single flat list of words.

    Parameters
    ----------
    l : iterable of str
        Each item is the string form of a Python list of strings (this is how
        the ``tags`` column is stored), e.g. ``"['children', 'creativity']"``.

    Returns
    -------
    list of str
        Every word of every parsed list, in order, duplicates kept.

    Notes
    -----
    The nested comprehension loops over whole words rather than characters; see
    https://stackoverflow.com/questions/52981376/flatten-list-of-list-of-strings
    """
    # Iterate over the argument: the previous version ignored `l` and always
    # read the global df_metadata.tags.
    return [inner for item in l for inner in ast.literal_eval(item)]

# One entry per tag occurrence across all talks (duplicates kept), so value_counts() gives per-tag usage.
df_all_tags = pd.Series(flatten_list_of_list_of_words(df_metadata.tags))

In [None]:
###(
# Just to make the issue clear:
# `tags_test` is the raw column (one string per talk), `tags` is the flattened list of
# individual tags, so both the first element and the number of unique values differ.
tags_test = df_metadata.tags
tags = flatten_list_of_list_of_words(df_metadata.tags)
print(tags[0],'vs',tags_test[0])
print(len(np.unique(tags)),'vs',len(np.unique(tags_test)))
###)

In [None]:
df_all_tags.value_counts().sort_values(ascending=False).head(10)

A mostly useless plot, but one that people seem to enjoy because it's kind of cool:

In [None]:
# Build the cloud directly from the tag counts. Joining all tags into one
# space-separated string would split any multi-word tag into separate words and
# let WordCloud's own tokenizer (stop-word removal, collocation detection)
# change how often each tag appears to occur.
tag_frequencies = df_all_tags.value_counts().to_dict()

plt.figure(figsize=(20,6))
wc = WordCloud(background_color="white",width=1800,height=600).generate_from_frequencies(tag_frequencies)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off");

Is there a group of tags that are way more used than others (that we might ignore)? 

In [None]:
# Count the usage of each tag once and reuse it for both the plot and the quantiles.
tag_counts = df_all_tags.value_counts()

fig, ax = plt.subplots(1,1,figsize=(16,3))
# Pass the counts explicitly as `x` so the violin is horizontal and matches the
# x-axis label, independently of how seaborn interprets positional arguments.
sns.violinplot(x=tag_counts,ax=ax,color='C1')
ax.set_xlabel('Number of times a tag was used')

q25,q50,q75 = np.quantile(tag_counts,[0.25,0.50,0.75])

print('25% of the tags are used at most {} times in the entire data set, 50% at most {} times and 75% at most {} times.'.format(q25,q50,q75))

### Tags analysis I: K-means

Can we group these tags into broader groups (and end up with just a couple of tags)?

   1. Take a pre-trained embedding (from spaCy; the `en_core_web_md` model loaded above provides 300-dimensional word vectors).
   2. Use PCA/t-SNE/... to reduce the dimensionality of the embedding vectors.
   3. See if we can group it using k-means.

In [None]:
# One row per distinct tag with the number of times it is used.
df_tags = df_all_tags.value_counts().rename_axis('tags').reset_index(name='counts')
# Parse every tag once and reuse the resulting Doc for both columns
# (previously nlp was run twice per tag).
tag_docs = [nlp(tag) for tag in df_tags.tags]
df_tags['has_vector'] = [doc.has_vector for doc in tag_docs]
df_tags['word_vector'] = [doc.vector for doc in tag_docs]
df_tags.head()

All words exist in the vocab

In [None]:
df_tags.has_vector.value_counts()

**PCA**

I randomly choose to keep 5 components.

In [None]:
# Stack the per-tag vectors row-wise into one (number of tags, vector length) matrix.
# np.stack does this directly, without flattening every value into a Python list first.
X = np.stack(df_tags.word_vector.to_list())
X.shape

Quick check to make sure I didn't swap the dimensions of the X matrix...

In [None]:
# Check every row, and compare element by element: a summed difference can cancel
# out to zero even when individual entries differ, and one row proves little.
assert all(np.array_equal(vec, row) for vec, row in zip(df_tags.word_vector, X)), \
    "rows of X do not match df_tags.word_vector"

In [None]:
# Project the tag vectors in X onto 5 principal components.
pca = PCA(n_components=5)
X_PCA = pca.fit_transform(X)
# Share of the total variance captured by each of the kept components.
print(pca.explained_variance_ratio_)

For k-means, we have to choose the number of clusters before running the algorithm. One way to find a good number of clusters (k) is the elbow method: plot k against the mean distance of the points to the centroid they belong to, and look for the "elbow" where the curve flattens.

In [None]:
# Adapted from: https://pythonprogramminglanguage.com/kmeans-elbow-method/

distortions = []
K = range(1,50,2)
for k in K:
    # Fix the seed and the number of restarts: k-means starts from random
    # centroids, so without them the curve changes from run to run.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_PCA)
    # Distortion = mean Euclidean distance from each point to its nearest centroid.
    nearest_centroid_distance = cdist(X_PCA, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centroid_distance.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k (PCA-reduced vectors)')
plt.show()

Actually, we might skip the PCA altogether

In [None]:
distortions = []
K = range(1,50,2)
for k in K:
    # Fix the seed and the number of restarts: k-means starts from random
    # centroids, so without them the curve changes from run to run.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    # Distortion = mean Euclidean distance from each point to its nearest centroid.
    nearest_centroid_distance = cdist(X, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centroid_distance.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k (full word vectors)')
plt.show()

There doesn't seem to be a neat, smallish number of clusters of words that would each point towards one topic (e.g. children, education, play vs. technology, computers, AI...). So **k-means doesn't seem to be that interesting for clustering TED talks**.

### Tags analysis II: Network analysis

Here I'm basing this work on this post about analysing [entity relations in the news](https://towardsdatascience.com/building-a-social-network-from-the-news-using-graph-theory-by-marcell-ferencz-9155d314e77f).

The original article says:

> The fundamental premise behind building our social network will be two-fold and quite simple:
> 1. If two people are mentioned in the same article, they are friends.
> 2. The more articles mention the same two people, the closer they are as friends.


Which I'm going to adapt for this example as follows:

1. If two TED talks have the same tag, these talks are related (aka friends, but that's weird for talks...)
2. The more tags talks have in common, the stronger their relation is (which is still kind of weird for talks)

Notice here that I'm ignoring the actual content of the tags.

In [None]:
df_all_tags

In [None]:
df_metadata.head()

In [None]:
len(df_all_tags.unique())

In [None]:
from tqdm import tqdm
from itertools import combinations

In [None]:
def connect_videos_with_same_tag(df):
    """Link every pair of talks that share a tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``tags`` column whose entries are lists of tag strings, or the
        string form of such lists (e.g. ``"['art', 'design']"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``TED1`` and ``TED2`` hold the positional row numbers (in ``df``)
        of two talks sharing a tag, with ``TED1 < TED2``. A pair appears once per
        shared tag, so counting duplicate rows gives the number of common tags.
        The frame is empty (but has both columns) when no two talks share a tag.
    """
    # Parse the tags and match whole tags. Testing `tag in <raw string>` is a
    # substring test, so a short tag would also match inside longer tags.
    # The tags come from `df` itself rather than from a notebook-level global.
    videos_by_tag = {}
    for i, raw_tags in enumerate(df.tags):
        tag_list = ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags
        # dict.fromkeys drops repeats within one talk while keeping the order.
        for tag in dict.fromkeys(tag_list):
            videos_by_tag.setdefault(tag, []).append(i)

    flat_links = []
    for idx_videos_with_tag in tqdm(videos_by_tag.values()):
        flat_links.extend(combinations(idx_videos_with_tag, 2))

    # Naming the columns explicitly also covers the case of no links at all.
    return pd.DataFrame(flat_links, columns=['TED1', 'TED2'])


def get_links_weight(df):
    """Count how often each pair of talks is linked and keep the repeated links.

    Adapted from the medium article. Check the original:
    https://towardsdatascience.com/building-a-social-network-from-the-news-using-graph-theory-by-marcell-ferencz-9155d314e77f

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``TED1`` and ``TED2``, one row per link between two talks.

    Returns
    -------
    pandas.DataFrame
        Columns ``TED1``, ``TED2`` and ``weight`` (number of rows of ``df`` for
        that pair). Only pairs with ``weight > 1`` are kept, sorted from the
        heaviest to the lightest link, with a fresh 0..n-1 index.
    """
    df_links = df.groupby(['TED1', 'TED2']).size().reset_index(name='weight')
    df_links = df_links[df_links['weight'] > 1]
    # sort_values returns a new frame; its result used to be discarded, so the
    # links were never actually sorted. mergesort keeps ties in a stable order.
    df_links = df_links.sort_values('weight', ascending=False, kind='mergesort')
    return df_links.reset_index(drop=True)

def add_title_for_human_readability(df_links,df):
    """Add ``Title1``/``Title2`` columns holding the titles of the linked talks.

    Parameters
    ----------
    df_links : pandas.DataFrame
        Columns ``TED1`` and ``TED2`` with positional row numbers into ``df``.
        The frame is modified in place.
    df : pandas.DataFrame
        Frame with a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df_links`` object, with the two title columns added.
    """
    titles = df['title']
    # Look all titles up in one positional selection instead of building a full
    # row Series for every link; this is fast enough not to need a progress bar.
    for idx_column, title_column in (('TED1', 'Title1'), ('TED2', 'Title2')):
        positions = df_links[idx_column].astype(int).to_numpy()
        df_links[title_column] = titles.iloc[positions].to_numpy()
    return df_links

# Link talks that share tags, weight each pair by how many times it is linked,
# then attach the talk titles so the rows are readable.
df_links = get_links_weight(connect_videos_with_same_tag(df_metadata))
df_links = add_title_for_human_readability(df_links,df_metadata)

In [None]:
df_links.sample(10).head(10)

In [None]:
import networkx as nx

In [None]:
# Keep only the strongest links. Filtering and re-indexing in one expression gives
# an independent frame instead of editing a slice of df_links in place.
df_plot = df_links[df_links['weight']>10].reset_index(drop=True)

G_plot = nx.Graph()
# Nodes are named by the talk index as a string. All edges are added in one pass
# over the columns rather than with three row lookups per edge.
G_plot.add_weighted_edges_from(
    (str(ted1), str(ted2), weight)
    for ted1, ted2, weight in zip(df_plot['TED1'], df_plot['TED2'], df_plot['weight'])
)

In [None]:
pos = nx.kamada_kawai_layout(G_plot)
nodes = G_plot.nodes()

fig, axs = plt.subplots(1, 1, figsize=(15,20))
el = nx.draw_networkx_edges(G_plot, pos, alpha=0.1, ax=axs)
# `with_labels` is not a parameter of draw_networkx_nodes; the labels are drawn
# by the separate draw_networkx_labels call below.
nl = nx.draw_networkx_nodes(G_plot, pos, nodelist=nodes, node_color='#FAA6FF', node_size=50, ax=axs)
# Draw the labels on the same axes as the edges and nodes.
ll = nx.draw_networkx_labels(G_plot, pos, font_size=10, font_family='sans-serif', ax=axs)

In [None]:
# The spring layout starts from random positions; seed it so the figure is reproducible.
pos = nx.spring_layout(G_plot, seed=42)
nodes = G_plot.nodes()

fig, axs = plt.subplots(1, 1, figsize=(15,20))
el = nx.draw_networkx_edges(G_plot, pos, alpha=0.1, ax=axs)
# `with_labels` is not a parameter of draw_networkx_nodes; the labels are drawn
# by the separate draw_networkx_labels call below.
nl = nx.draw_networkx_nodes(G_plot, pos, nodelist=nodes, node_color='#FAA6FF', node_size=50, ax=axs)
# Draw the labels on the same axes as the edges and nodes.
ll = nx.draw_networkx_labels(G_plot, pos, font_size=10, font_family='sans-serif', ax=axs)

### Tags analysis III: can we predict the tags using the description?

Since there are too many tags (about 400) the first step is to reduce this to about 50, by just keeping the most common ones.

In [None]:
# value_counts() already orders the tags from most to least used, so its index is
# the ranking; no extra sort / rename / reset_index is needed.
# The 50 kept here is also the output size of the classifiers defined further down.
most_common_tags = df_all_tags.value_counts().index[:50].tolist()
most_common_tags

In [None]:
# Parse each talk's tags into a set and test whole-tag membership. The raw column
# holds strings, so `tag in tags` was a substring test and a short tag would also
# match inside longer tags.
talk_tag_sets = [set(ast.literal_eval(raw_tags)) for raw_tags in df_metadata.tags]

# Multi-hot matrix: one row per talk, one column per tag in most_common_tags.
labels = np.array([[1 if tag in talk_tags else 0 for tag in most_common_tags]
                   for talk_tags in talk_tag_sets])
labels.shape

Make input data for the model:
    
    version 1: tf-idf unigrams
    version 2: tf-idf bigrams
    version 3: GloVe vectors using spacy
  

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Version 1: tf-idf over single words. Each vectorizer keeps its own name so its
# fitted vocabulary stays available afterwards.
unigram_vectorizer = TfidfVectorizer(stop_words='english',strip_accents='ascii',ngram_range=(1,1),max_features=5000)
unigrams = unigram_vectorizer.fit_transform(df_metadata.description)
# Sparse matrices expose .shape directly; no need to build a dense copy just to print it.
print(unigrams.shape)

# Version 2: tf-idf over pairs of consecutive words.
bigram_vectorizer = TfidfVectorizer(stop_words='english',strip_accents='ascii',ngram_range=(2,2),max_features=5000)
bigrams = bigram_vectorizer.fit_transform(df_metadata.description)
print(bigrams.shape)

# Version 3: one spaCy document vector per description. nlp.pipe processes the texts
# as a stream, and np.stack builds the 2-D array in one step.
vecs = np.stack([doc.vector for doc in nlp.pipe(df_metadata.description)])
print(vecs.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Every feature set is split with the same test fraction and the same seed.
split_options = dict(test_size=0.33, random_state=42)

X_train_uni, X_test_uni, y_train_uni, y_test_uni = train_test_split(unigrams, labels, **split_options)
X_train_bi, X_test_bi, y_train_bi, y_test_bi = train_test_split(bigrams, labels, **split_options)
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(vecs, labels, **split_options)

**Very simple model**

*unigrams*

In [None]:
import tensorflow as tf
from keras import metrics  # kept because other cells reference `metrics`

# The targets are multi-hot (a talk can carry several of the 50 tags at once), so
# each output unit is an independent sigmoid trained with binary cross-entropy.
# Softmax with CategoricalCrossentropy assumes exactly one class per sample, and
# from_logits=True contradicted an output layer that already applied softmax.
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(5000,)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(50,activation='sigmoid')
])

# Per-tag accuracy is dominated by the many zeros, so precision and recall are
# reported as well.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])

model.fit(X_train_uni.toarray(), y_train_uni, epochs=20)

In [None]:
model.evaluate(X_test_uni.toarray(),  y_test_uni, verbose=2)

**Bigrams**

In [None]:
# The targets are multi-hot (a talk can carry several of the 50 tags at once), so
# each output unit is an independent sigmoid trained with binary cross-entropy.
# Softmax with CategoricalCrossentropy assumes exactly one class per sample, and
# from_logits=True contradicted an output layer that already applied softmax.
model_bi = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(5000,)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(50,activation='sigmoid')
])

# Per-tag accuracy is dominated by the many zeros, so precision and recall are
# reported as well.
model_bi.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])

model_bi.fit(X_train_bi.toarray(), y_train_bi, epochs=20)

In [None]:
model_bi.evaluate(X_test_bi.toarray(),  y_test_bi, verbose=2)

**Glove vectors**

In [None]:
# The targets are multi-hot (a talk can carry several of the 50 tags at once), so
# each output unit is an independent sigmoid trained with binary cross-entropy.
# Softmax with CategoricalCrossentropy assumes exactly one class per sample, and
# from_logits=True contradicted an output layer that already applied softmax.
model_vec = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(300,)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(50,activation='sigmoid')
])

# Per-tag accuracy is dominated by the many zeros, so precision and recall are
# reported as well.
model_vec.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])

model_vec.fit(X_train_vec, y_train_vec, epochs=20)

In [None]:
model_vec.evaluate(X_test_vec,  y_test_vec, verbose=2)

In [None]:
# TODO: to be continued (kept as a comment so that running all cells does not stop on a SyntaxError)

## 1.3 Titles

I'll take a bit more of a manual approach here and use something quite handy that spacy has:

* [Part-of-speech (POS) tagging](https://spacy.io/api/annotation#pos-tagging), which says whether a word is a verb, noun, adjective, etc.
* [Named Entities Recognition](https://spacy.io/api/annotation#named-entities) that recognises some words as locations, people, events etc

And I'll just list the verbs, nouns, and proper nouns in the titles

In [None]:
def count_POS(text,pos='VERB',max_words=None):
    """Return the distinct lemmas of `text` with a given part of speech, most frequent first.

    Parameters
    ----------
    text : str
        Text to parse with the notebook-level spaCy pipeline ``nlp``.
    pos : str
        Part-of-speech tag to keep. Possibly interesting values: VERB, NOUN, PROPN.
    max_words : int or None
        Maximum number of lemmas to return; ``None`` returns all of them.

    Returns
    -------
    list of str
        Distinct lemmas ordered by decreasing number of occurrences.
    """
    doc = nlp(text)
    words = [token.lemma_ for token in doc if token.pos_ == pos]
    # value_counts() orders the lemmas from most to least frequent; dtype=object
    # keeps the empty case (no matching token) well defined.
    ranked_words = pd.Series(words, dtype=object).value_counts().index.tolist()
    # Honour max_words: the slice used to be [:None], which always returned everything.
    return ranked_words[:max_words]

# One new column per part of speech, each filled by applying count_POS to the talk title.
for column_name, pos_tag in (('verbs_in_title', 'VERB'),
                             ('nouns_in_title', 'NOUN'),
                             ('proper_nouns_in_title', 'PROPN')):
    df_metadata[column_name] = df_metadata.title.apply(count_POS, pos=pos_tag)

In [None]:
df_metadata[['title','verbs_in_title','nouns_in_title','proper_nouns_in_title']].sample(10).head(10)

## 1.2 Descriptions

Instead of just listing the most common words, I'll also have a look at the most common verbs and nouns. Hopefully, they'll be related to the ones in the titles.

In [None]:
# One new column per part of speech, each filled by applying count_POS to the talk description.
for column_name, pos_tag in (('verbs_in_description', 'VERB'),
                             ('nouns_in_description', 'NOUN'),
                             ('proper_nouns_in_description', 'PROPN')):
    df_metadata[column_name] = df_metadata.description.apply(count_POS, pos=pos_tag, max_words=None)

In [None]:
df_metadata[['description','verbs_in_description','nouns_in_description','proper_nouns_in_description']].sample(10).head(10)

## 1.4 Similarities between title and description

Quick look of the verbs and nouns

In [None]:
df_metadata[['title','verbs_in_title','verbs_in_description']].head()

I'll use the cosine similarity to compare the word vectors of verbs/nouns in the title to the ones in the description, just to have an idea of how close they are.

In [None]:
def similarity_lists(list_title,list_description):
    """Similarity between two lists of words, using the notebook-level spaCy pipeline ``nlp``.

    Each list is joined into one space-separated text and parsed; the result is
    whatever ``similarity`` returns for the two parsed documents.

    Parameters
    ----------
    list_title, list_description : list of str
        Words to compare (e.g. the verbs of a title and of a description).

    Returns
    -------
    float
        The similarity score, or NaN when either list is empty (there is
        nothing to compare in that case).
    """
    if not list_title or not list_description:
        return float('nan')
    # `similarity` compares two parsed documents, so the second text has to go
    # through nlp as well; a plain string was passed before.
    doc_title = nlp(' '.join(list_title))
    doc_description = nlp(' '.join(list_description))
    return doc_title.similarity(doc_description)

# The columns created earlier are named `*_in_title` / `*_in_description`;
# `verbs_title`, `nouns_title`, etc. do not exist and raised a KeyError.
df_metadata['similarity_verbs'] = df_metadata.apply(lambda row: similarity_lists(row['verbs_in_title'],row['verbs_in_description']),axis=1)
df_metadata['similarity_nouns'] = df_metadata.apply(lambda row: similarity_lists(row['nouns_in_title'],row['nouns_in_description']),axis=1)

In [None]:
# No `similarity_title_desc` column is ever created; show the two similarity
# columns computed above instead.
df_metadata[['title','similarity_verbs','similarity_nouns']].head()

In [None]:
# TO DO
#
# - put all verbs in the ground form - DONE
# - maybe tf-idf is useful, to see what words are more unique to each description
# - think of a way to relate the two, it is list vs list -- WITH SIMILARITY
#
# - see how to generate news headers from news


In [None]:
def count_most_common_words(text,include_stop_words=True,max_return=None):
    """Count the alphabetic words of `text`, most frequent first.

    Parameters
    ----------
    text : str
        Text to parse with the notebook-level spaCy pipeline ``nlp``.
    include_stop_words : bool
        When False, tokens flagged as stop words are left out.
    max_return : int or None
        ``None`` returns the full table; otherwise only the `max_return` most
        frequent words are returned, as a plain list.

    Returns
    -------
    pandas.DataFrame or list of str
        With ``max_return=None``: a frame with columns ``word`` and ``count``,
        ordered by decreasing count. Otherwise: the top words as a list.
    """
    doc = nlp(text)

    words = [w.text for w in doc if w.is_alpha and (include_stop_words or not w.is_stop)]

    # value_counts() already sorts by decreasing count; naming the count column in
    # reset_index gives the same column names whatever the pandas version.
    df = (pd.Series(words, dtype=object)
            .value_counts()
            .rename_axis('word')
            .reset_index(name='count'))

    if max_return is None:
        return df

    # A DataFrame has no to_list(); return the words themselves.
    return df['word'][:max_return].to_list()

def count_most_common_POS(text,pos='VERB'):
    """Count the lemmas of `text` that have a given part of speech.

    Parameters
    ----------
    text : str
        Text to parse with the notebook-level spaCy pipeline ``nlp``.
    pos : str
        Part-of-speech tag to keep. Possibly interesting values: VERB, NOUN, PROPN.

    Returns
    -------
    pandas.Series
        Number of occurrences per lemma, ordered by decreasing count.
    """
    doc = nlp(text)
    # The comprehension used an undefined name `w`. Lemmas are collected (as in
    # count_POS) so that different forms of the same word are counted together.
    words = [token.lemma_ for token in doc if token.pos_ == pos]
    # value_counts() already returns the counts in decreasing order.
    return pd.Series(words, dtype=object).value_counts()


# Quick check on the description at label 0, with stop words excluded.
df_test = count_most_common_words(df_metadata.description[0],include_stop_words=False)