In [None]:
import pandas as pd
import json
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from scipy.linalg import svd
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

%matplotlib inline

In [None]:
# Load the watch-later JSON file. JSON text is UTF-8, so the encoding is pinned
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open('data/watch-later.json', 'r', encoding='utf-8') as fin:
    wl = json.load(fin)
type(wl)

In [None]:
# Peek at the first record to see what a single entry looks like.
wl[0]

In [None]:
# Every entry carries a 'snippet' mapping; pull the two text fields out of it.
titles = []
descriptions = []
titles.extend(entry['snippet']['title'] for entry in wl)
descriptions.extend(entry['snippet']['description'] for entry in wl)

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from all titles joined into a single string.
wc = WordCloud().generate(
    " ".join(titles)
)

plt.figure(figsize=(10, 12))
plt.imshow(wc)
# Bind the return value to `_` so it is not echoed as the cell's output.
_ = plt.axis('off')

In [None]:
# Custom stop words (URL fragments, social-media and channel names) followed by
# scikit-learn's built-in English stop-word list.
stopwords = [
    'http', 'www', 'com', 'facebook', 'https', 'youtube', 'bit', 'ly', 'goo', 'gl',
    'watch', 'suggest', 'twitter', 'watchmojo', 'instagram',
]
stopwords.extend(ENGLISH_STOP_WORDS)

# All descriptions as one lower-cased string.
desc = ' '.join(descriptions).lower()

# Word cloud of the descriptions with the stop words removed.
wc = WordCloud(
    stopwords=stopwords,
    normalize_plurals=True,
).generate(desc)

plt.figure(figsize=(10, 12))
plt.imshow(wc)
# Bind the return value to `_` so it is not echoed as the cell's output.
_ = plt.axis('off')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer that ignores every token in the `stopwords` list.
vect = TfidfVectorizer(stop_words=stopwords)

In [None]:
# Learn the vocabulary from the descriptions and build their TF-IDF matrix.
X = vect.fit_transform(descriptions)

In [None]:
# (number of descriptions, number of vocabulary terms)
X.shape

## Latent Semantic Analysis (LSA)

![](https://raw.githubusercontent.com/fastai/course-nlp/aabfeddf61fea29b18c72f841d057b56a216b7eb/images/svd_fb.png)

Source: [A Code-First Introduction to NLP](https://www.fast.ai/2019/07/08/fastai-nlp/)

In [None]:
# Dense SVD of the TF-IDF matrix. `toarray()` yields a plain ndarray, whereas
# `todense()` returns the legacy `np.matrix` type that NumPy no longer recommends.
# NOTE: scipy returns the right singular vectors already transposed, so each
# row of `V` is one component expressed over the vocabulary terms.
U, S, V = svd(X.toarray(), full_matrices=False)

In [None]:
# How many of the highest-weighted terms to list per topic.
num_top_words = 10

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older installations.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

def show_topics(a):
    """Summarise each row of `a` by its highest-weighted vocabulary terms.

    Reads the module-level `vocab` (index -> term) and `num_top_words`.

    Parameters
    ----------
    a : iterable of 1-D weight vectors, one per topic; position ``i`` in a
        vector is looked up as ``vocab[i]``.

    Returns
    -------
    list of str
        One space-separated string per row, terms ordered from the largest
        weight downwards.
    """
    summaries = []
    for weights in a:
        # argsort is ascending, so walk it backwards from the end to take the
        # `num_top_words` largest entries, biggest first.
        ranked = np.argsort(weights)[:-num_top_words - 1:-1]
        summaries.append(' '.join(vocab[idx] for idx in ranked))
    return summaries

In [None]:
# Top words for each of the first 20 rows of V.
show_topics(V[:20])

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Project the TF-IDF rows down to two dimensions with t-SNE.
# * init="random": X is a sparse matrix, and the PCA initialisation (the
#   default since scikit-learn 1.2) rejects sparse input.
# * random_state: t-SNE is stochastic; a fixed seed keeps the embedding, and the
#   coordinate threshold applied to it further down, reproducible across runs.
tsne = TSNE(n_components=2, init="random", random_state=42)
x_red = tsne.fit_transform(X)

In [None]:
# Scatter of the 2-D embedding: first coordinate on x, second on y.
plt.scatter(x_red[:, 0], x_red[:, 1])

In [None]:
# Row indices of the points whose first embedding coordinate exceeds 10.
is_outlier = x_red[:, 0] > 10
outliers = np.arange(len(x_red))[is_outlier]

# All points first, then the selected ones drawn over them in red.
plt.scatter(x_red[:, 0], x_red[:, 1])
plt.scatter(x_red[outliers, 0], x_red[outliers, 1], c="r")

In [None]:
# Keep the titles under their own name. Rebinding `outliers` from row indices to
# strings made this cell raise a TypeError when it was run a second time.
# `outliers` therefore still holds the row indices after this cell.
outlier_titles = [titles[i] for i in outliers]

# Word cloud of the outlier titles, lower-cased, with the stop words removed.
wc = WordCloud(stopwords=stopwords, normalize_plurals=True).generate(' '.join(outlier_titles).lower())
plt.figure(figsize=(10, 12))
plt.imshow(wc)
# Bind the return value to `_` so it is not echoed as the cell's output.
_ = plt.axis('off')