In [1]:
from __future__ import print_function
import time

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [2]:
# Load the comment dataset from the CSV at `path` and show its column names.
path = "youtube_comments_climate_change/cleaned_data.csv"
data = pd.read_csv(
    path,
    sep=",",
    encoding="utf-8",
)

data.columns.tolist()

['Unnamed: 0', 'publishedAt', 'authorName', 'text', 'isReply', 'cleaned']

In [3]:
import tensorflow as tf
import tensorflow_hub as hub

# Universal Sentence Encoder v4 from TF Hub, wrapped as a Keras layer.
# NOTE: an `OSError: SavedModel file does not exist at .../tfhub_modules/...`
# raised here usually means the locally cached download is incomplete;
# deleting that cache folder (or pointing TFHUB_CACHE_DIR at a fresh
# directory) forces a clean re-download.
hub_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.KerasLayer(hub_url)

# pd.read_csv turns empty cells into NaN (a float), which a text encoder
# cannot consume, so coerce the column to plain strings first. The explicit
# tf.string dtype also keeps an empty dataset from becoming a float tensor.
sentences = data['cleaned'].fillna("").astype(str).tolist()
embeddings = embed(tf.constant(sentences, dtype=tf.string))

print(embeddings)
print("The embeddings vector is of fixed length {}".format(embeddings.shape[1]))

OSError: SavedModel file does not exist at: /var/folders/vh/gybrjtm15llb7q0vbplqdvs80000gn/T/tfhub_modules/063d866c06683311b44b4992fd46003be952409c/{saved_model.pbtxt|saved_model.pb}

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim(input_vectors):
    """Compute cosine similarities for `input_vectors`.

    Thin wrapper: forwards `input_vectors` to scikit-learn's
    `cosine_similarity` (single-argument form) and returns its result
    unchanged.
    """
    return cosine_similarity(input_vectors)


# Similarity matrix over all embeddings, converted to a NumPy array first.
cosine_similarity_matrix = cos_sim(
    np.array(embeddings)
)
print(cosine_similarity_matrix)

### PCA

#### Choosing dimensions by looking at cumulative explained variance ratio

In [None]:
# Convert the embeddings to one 2-D NumPy array in a single call instead of
# converting row by row in Python and then re-stacking the rows.
X = np.array(embeddings)
# Kept so any later cell that expects the per-row list of 1-D vectors still
# works; the entries are row views into X.
X_embed = list(X)

In [None]:
import matplotlib.pyplot as plt

# Fit a PCA that keeps all components (default n_components) and plot the
# running total of the explained-variance ratio against a 95% reference line.
pca_cev = PCA().fit(X)
cumulative_ratio = np.cumsum(pca_cev.explained_variance_ratio_)

ax = plt.gca()
ax.plot(cumulative_ratio)
ax.axhline(y=0.95, color='r', linestyle='-')
ax.text(0.5, 0.85, '95% cut-off threshold', color='red', fontsize=12)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')
ax.grid(axis='x')

In [None]:
SUBSPACE_DIM = 2

# Fit a PCA with SUBSPACE_DIM components on X and project X onto them.
pca = PCA(n_components=SUBSPACE_DIM)
X_new = pca.fit(X).transform(X)
pca_dim = X_new.shape[1]

# Same projected data as a list with one Python list per row of X_new.
X_list = [list(row) for row in X_new]

print("Dimensions of embeddings matrix after PCA: ({}, {})".format(*X_new.shape))

In [None]:
# Principal axes of the fitted PCA (one row per retained component).
print(pca.components_)

In [None]:
# Variance explained by each retained component of the fitted PCA.
print(pca.explained_variance_)

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point `v0` to point `v1`.

    Parameters
    ----------
    v0 : start point of the arrow (passed as the annotation's text position).
    v1 : end point of the arrow (passed as the annotated point).
    ax : axes-like object providing `annotate`; when falsy (default `None`)
        the current pyplot axes from `plt.gca()` are used.
    """
    target = ax if ax else plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,
        'shrinkB': 0,
    }
    # Empty label: only the arrow itself is drawn.
    target.annotate('', xy=v1, xytext=v0, arrowprops=arrow_style)

# Plot the data in the 2-D PCA space together with the principal axes.
# X, pca.mean_ and pca.components_ live in the original feature space, so the
# arrow end points are projected with pca.transform before drawing: mixing raw
# feature columns of X with projected coordinates is not meaningful, and
# annotate() only accepts 2-D points.
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.2)
origin = pca.transform(pca.mean_.reshape(1, -1))[0, :2]
for length, vector in zip(pca.explained_variance_[:2], pca.components_[:2]):
    v = vector * 3 * np.sqrt(length)  # axis scaled to 3 standard deviations
    tip = pca.transform((pca.mean_ + v).reshape(1, -1))[0, :2]
    draw_vector(origin, tip)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.axis('equal');

In [25]:
# Peek at the first entry of X_list (built from the first row of X_new).
X_list[0]

[0.54168385,
 -0.17364876,
 0.0008559711,
 0.093810365,
 0.023223635,
 0.007853197,
 -0.070139945,
 0.028013334,
 -0.03132332,
 -0.09416608,
 -0.13835822,
 0.11158867,
 0.03708365,
 0.027428862,
 -0.058646075,
 -0.09104265,
 -0.052695625,
 -0.06313127,
 0.07095427,
 -0.14024389,
 0.056206703,
 -0.04549577,
 0.07889472,
 -0.06101137,
 -0.054316536,
 -0.062499855,
 0.07329381,
 0.0012182854,
 -0.10612603,
 0.17370722,
 0.04733481,
 0.011612965,
 0.010314325,
 -0.05374921,
 0.044733837,
 0.073102884,
 0.1602329,
 -0.034588274,
 0.032931533,
 0.0870486,
 -0.14825329,
 0.020095598,
 -0.03243103,
 -0.1055067,
 0.021270528,
 -0.07841529,
 0.012200127,
 0.098992966,
 -0.05140899,
 0.015557879,
 0.05289026,
 0.08090312,
 -0.053625673,
 0.044009477,
 -0.0033516958,
 0.08318426,
 0.061527856,
 0.04777445,
 -0.06371616,
 0.027852593,
 0.022721538,
 -0.049247,
 0.03613004,
 0.06550949,
 0.0029577157,
 0.009952769,
 0.038182817,
 -0.0673822,
 0.037381224,
 0.03192152,
 -0.014384659,
 -0.030634873,
 

In [22]:
# Check the container type of a single X_list entry.
type(X_list[0])

list

In [26]:
from sklearn.preprocessing import StandardScaler

# Standardise each column of X_list with a default StandardScaler.
scaler = StandardScaler()
X_pre = scaler.fit_transform(X_list)

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the standardised, PCA-reduced data. X_pre is built by the
# StandardScaler cell just above and was otherwise never used; fitting on the
# raw X would bypass both the dimensionality reduction and the scaling.
db = DBSCAN(eps=0.3, min_samples=10).fit(X_pre)