In [2]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [5]:
import spacy

In [3]:
music_df = pd.read_csv('./data/clean/music_df.csv')

In [None]:
# !python -m spacy download en_core_web_lg

In [7]:
nlp = spacy.load('en_core_web_lg')

In [8]:
spacy.__version__

'3.0.3'

In [4]:
import spacy
import en_core_web_lg
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import string

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

nlp = en_core_web_lg.load()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jabru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jabru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jabru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NotImplementedError: [E957] Writing directly to `Language.factories` isn't needed anymore in spaCy v3. Instead, you can use the `@Language.factory` decorator to register your custom component factory or `@Language.component` to register a simple stateless function component that just takes a Doc and returns it.

### Online Example

In [7]:
body = [
  "the quick brown fox",
    "the slow brown dog",
    "the quick red dog",
    "the lazy yellow fox"  
]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [9]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(body)

In [10]:
bag_of_words

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [11]:
bag_of_words.todense()

matrix([[1, 0, 1, 0, 1, 0, 0, 1, 0],
        [1, 1, 0, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 1, 1, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 1, 1]], dtype=int64)

In [13]:
svd = TruncatedSVD(n_components = 2)

In [14]:
lsa = svd.fit_transform(bag_of_words)

In [15]:
lsa

array([[ 1.69490493,  0.29952405],
       [ 1.51585111, -0.76911037],
       [ 1.51585111, -0.76911037],
       [ 1.26618606,  1.44058513]])

In [17]:
topic_encoded_df = pd.DataFrame(lsa, columns=['topic_1', 'topic_2'])
topic_encoded_df['body'] = body
topic_encoded_df[['body', 'topic_1', 'topic_2']]

Unnamed: 0,body,topic_1,topic_2
0,the quick brown fox,1.694905,0.299524
1,the slow brown dog,1.515851,-0.76911
2,the quick red dog,1.515851,-0.76911
3,the lazy yellow fox,1.266186,1.440585


In [18]:
dictionary = vectorizer.get_feature_names()
dictionary

['brown', 'dog', 'fox', 'lazy', 'quick', 'red', 'slow', 'the', 'yellow']

In [21]:
encoding_matrix = pd.DataFrame(svd.components_, index=['topic_1', 'topic_2'], columns=dictionary).T
encoding_matrix

Unnamed: 0,topic_1,topic_2
brown,0.353937,-0.140256
dog,0.334199,-0.459436
fox,0.326416,0.519736
lazy,0.139578,0.430274
quick,0.353937,-0.140256
red,0.1671,-0.229718
slow,0.1671,-0.229718
the,0.660615,0.0603
yellow,0.139578,0.430274


In [22]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])

In [23]:
encoding_matrix

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
brown,0.353937,-0.140256,0.353937,0.140256
dog,0.334199,-0.459436,0.334199,0.459436
fox,0.326416,0.519736,0.326416,0.519736
lazy,0.139578,0.430274,0.139578,0.430274
quick,0.353937,-0.140256,0.353937,0.140256
red,0.1671,-0.229718,0.1671,0.229718
slow,0.1671,-0.229718,0.1671,0.229718
the,0.660615,0.0603,0.660615,0.0603
yellow,0.139578,0.430274,0.139578,0.430274


In [24]:
encoding_matrix.sort_values('abs_topic_2', ascending=False)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
fox,0.326416,0.519736,0.326416,0.519736
dog,0.334199,-0.459436,0.334199,0.459436
lazy,0.139578,0.430274,0.139578,0.430274
yellow,0.139578,0.430274,0.139578,0.430274
red,0.1671,-0.229718,0.1671,0.229718
slow,0.1671,-0.229718,0.1671,0.229718
quick,0.353937,-0.140256,0.353937,0.140256
brown,0.353937,-0.140256,0.353937,0.140256
the,0.660615,0.0603,0.660615,0.0603


### Pitchfork LSA

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD

In [14]:
music_df_subset = music_df.sample(n=100_000, replace=False)
#drop any null values - only dropping less than 20 out of the large amount of data
music_df_subset.dropna(subset=['content'], inplace=True)

In [15]:
body = music_df_subset['content']

In [16]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(body)

In [17]:
svd = TruncatedSVD(n_components = 10)
lsa = svd.fit_transform(bag_of_words)

In [18]:
topic_encoded_df = pd.DataFrame(lsa, columns=['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 'topic_10'])
topic_encoded_df['body'] = body
topic_encoded_df[['body', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 'topic_10']]

Unnamed: 0,body,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10
0,,3.159855,-2.284082,-0.535413,-0.697249,-0.083225,-1.513064,0.674506,-1.830428,-0.579353,-0.266522
1,,3.585974,1.193033,0.292489,1.199899,-0.326768,-1.684984,0.018063,0.268632,0.473227,-0.171835
2,I received this CD as a gift a few weeks ago f...,2.349984,0.276742,0.061650,0.853682,-0.185161,0.312554,0.498205,0.991497,-0.176372,0.181252
3,I am a beginner and have tried a couple of med...,7.959311,1.377752,0.267833,0.796150,2.046997,0.860893,1.738382,-0.309600,0.582057,-0.697527
4,,16.854707,-1.990994,-0.500710,4.690206,0.853345,0.612717,-0.313789,0.182246,-0.573742,1.479144
...,...,...,...,...,...,...,...,...,...,...,...
99985,,3.143810,-2.018101,-0.742453,-1.194682,-2.344113,0.150810,-0.432356,-0.341746,1.088740,-0.608507
99986,,5.713905,-0.192928,-0.108106,1.096568,-0.674055,1.559267,1.735372,0.857223,-0.174702,-0.802876
99987,,6.385014,-0.440039,-0.425217,0.015250,0.675124,0.864244,1.378276,0.429728,-1.022491,-1.530973
99988,,0.634869,-0.715221,0.174425,-0.590555,-0.364004,-0.368967,0.674571,-0.310687,-0.681022,0.495200


In [19]:
dictionary = vectorizer.get_feature_names()

In [20]:
encoding_matrix = pd.DataFrame(svd.components_, index=['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 'topic_10'], columns=dictionary).T
encoding_matrix

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10
00,3.957877e-04,9.216036e-06,-2.417616e-04,3.918832e-04,-3.221933e-04,-6.103195e-04,5.875856e-04,-4.693084e-04,-1.195689e-03,-1.422940e-03
000,2.810022e-04,-2.856530e-04,-9.776862e-05,4.166059e-04,-3.125092e-04,2.746143e-04,8.035636e-04,-4.742324e-04,-7.876783e-04,3.155442e-04
0000,2.023326e-06,-4.168056e-06,-3.334536e-06,3.660791e-06,7.413700e-06,6.541652e-06,1.387844e-05,-3.530956e-06,-1.265257e-05,-5.319585e-06
00000,1.220965e-06,-6.973136e-06,-2.327762e-06,8.261153e-06,7.384200e-06,-3.227586e-07,7.576487e-06,1.562976e-06,-1.207539e-05,5.869230e-07
000ish,8.097626e-07,3.053329e-07,-6.265388e-07,3.285783e-06,-9.006352e-06,2.479756e-06,6.950250e-06,-4.332150e-07,3.445247e-06,6.230581e-06
...,...,...,...,...,...,...,...,...,...,...
zzzz,1.380520e-08,-4.890240e-07,-1.291952e-07,2.656525e-07,2.053721e-06,-6.825911e-07,-1.062871e-06,-1.037899e-06,-3.389787e-07,-4.497948e-07
zzzzzz,1.609450e-06,2.296324e-06,-1.132625e-06,6.417883e-06,-2.385782e-05,2.336717e-06,7.324374e-06,5.916243e-06,-2.322920e-06,6.479479e-06
zzzzzzzzz13,9.759453e-07,-2.533342e-06,-9.536243e-07,6.078310e-06,4.111070e-06,-1.069157e-07,-3.453208e-06,-2.423595e-07,-3.550171e-06,2.746440e-05
zzzzzzzzzzzzthere,3.852522e-07,-5.467566e-06,-1.482800e-06,1.246955e-06,7.472599e-06,-7.456893e-06,6.242388e-07,-1.577412e-05,5.520094e-06,-5.348028e-06


In [21]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
encoding_matrix['abs_topic_3'] = np.abs(encoding_matrix['topic_3'])
encoding_matrix['abs_topic_4'] = np.abs(encoding_matrix['topic_4'])
encoding_matrix['abs_topic_5'] = np.abs(encoding_matrix['topic_5'])
encoding_matrix['abs_topic_6'] = np.abs(encoding_matrix['topic_6'])
encoding_matrix['abs_topic_7'] = np.abs(encoding_matrix['topic_7'])
encoding_matrix['abs_topic_8'] = np.abs(encoding_matrix['topic_8'])
encoding_matrix['abs_topic_9'] = np.abs(encoding_matrix['topic_9'])
encoding_matrix['abs_topic_10'] = np.abs(encoding_matrix['topic_10'])

In [22]:
encoding_matrix.sort_values('abs_topic_2', ascending=False)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,abs_topic_1,abs_topic_2,abs_topic_3,abs_topic_4,abs_topic_5,abs_topic_6,abs_topic_7,abs_topic_8,abs_topic_9,abs_topic_10
the,6.899443e-01,5.241296e-01,1.250478e-01,3.681611e-01,-1.263407e-01,-1.866898e-01,-1.489069e-02,-5.985193e-02,6.612971e-02,-6.931982e-02,6.899443e-01,5.241296e-01,1.250478e-01,3.681611e-01,1.263407e-01,1.866898e-01,1.489069e-02,5.985193e-02,6.612971e-02,6.931982e-02
it,1.512734e-01,-3.723956e-01,-6.466710e-02,2.763091e-01,-7.461160e-02,-7.748773e-02,9.923701e-02,7.180832e-01,5.126224e-02,-2.716155e-01,1.512734e-01,3.723956e-01,6.466710e-02,2.763091e-01,7.461160e-02,7.748773e-02,9.923701e-02,7.180832e-01,5.126224e-02,2.716155e-01
this,1.410599e-01,-3.063057e-01,-5.342550e-02,1.276419e-01,3.387929e-01,2.237021e-03,1.180180e-01,-3.124212e-01,-3.649946e-01,-2.448304e-01,1.410599e-01,3.063057e-01,5.342550e-02,1.276419e-01,3.387929e-01,2.237021e-03,1.180180e-01,3.124212e-01,3.649946e-01,2.448304e-01
to,2.281518e-01,-2.804460e-01,-1.013596e-01,5.333523e-02,-3.463369e-01,4.868300e-01,-3.623657e-01,-1.953604e-01,2.995365e-01,3.457685e-02,2.281518e-01,2.804460e-01,1.013596e-01,5.333523e-02,3.463369e-01,4.868300e-01,3.623657e-01,1.953604e-01,2.995365e-01,3.457685e-02
you,7.380370e-02,-2.173452e-01,1.262419e-03,1.380324e-01,-7.395792e-02,2.828362e-02,1.708770e-01,-3.337019e-01,3.968781e-02,-1.770915e-01,7.380370e-02,2.173452e-01,1.262419e-03,1.380324e-01,7.395792e-02,2.828362e-02,1.708770e-01,3.337019e-01,3.968781e-02,1.770915e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sftwett,2.289566e-12,-3.419872e-12,9.687121e-11,7.715965e-11,-3.591586e-10,-4.207731e-11,8.640492e-10,2.232504e-10,-1.077135e-09,-4.691704e-10,2.289566e-12,3.419872e-12,9.687121e-11,7.715965e-11,3.591586e-10,4.207731e-11,8.640492e-10,2.232504e-10,1.077135e-09,4.691704e-10
sfsfsf,2.289566e-12,-3.419872e-12,9.687121e-11,7.715965e-11,-3.591586e-10,-4.207731e-11,8.640492e-10,2.232504e-10,-1.077135e-09,-4.691704e-10,2.289566e-12,3.419872e-12,9.687121e-11,7.715965e-11,3.591586e-10,4.207731e-11,8.640492e-10,2.232504e-10,1.077135e-09,4.691704e-10
sfdsffsfdf,2.289566e-12,-3.419872e-12,9.687121e-11,7.715965e-11,-3.591586e-10,-4.207731e-11,8.640492e-10,2.232504e-10,-1.077135e-09,-4.691704e-10,2.289566e-12,3.419872e-12,9.687121e-11,7.715965e-11,3.591586e-10,4.207731e-11,8.640492e-10,2.232504e-10,1.077135e-09,4.691704e-10
fsdf,2.289566e-12,-3.419872e-12,9.687121e-11,7.715965e-11,-3.591586e-10,-4.207731e-11,8.640492e-10,2.232504e-10,-1.077135e-09,-4.691704e-10,2.289566e-12,3.419872e-12,9.687121e-11,7.715965e-11,3.591586e-10,4.207731e-11,8.640492e-10,2.232504e-10,1.077135e-09,4.691704e-10


### Example From Class

In [25]:
tf_vect = TfidfVectorizer(stop_words='english', max_features=200)

In [26]:
X = music_df_subset['content']

In [27]:
dtm = tf_vect.fit_transform(X)

In [28]:
lda = LatentDirichletAllocation(n_components=10)

In [29]:
lda.fit(dtm)

LatentDirichletAllocation()

In [30]:
dtm

<99990x200 sparse matrix of type '<class 'numpy.float64'>'
	with 1347561 stored elements in Compressed Sparse Row format>

In [31]:
# import pickle

# pickle.dump(lda, 'lda.pk')
# # # then reload it with
# # lda_model = pickle.load('lda_model.pk')

In [32]:
# lda.save('lda.model')

In [None]:
# !pip install pyLDAvis

In [33]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [34]:
pyLDAvis.sklearn.prepare(lda, dtm, tf_vect)

  and should_run_async(code)
