In [None]:
# Execute the local helper script; IPython's %run loads the names it defines
# into this notebook's namespace.
# NOTE(review): get_dataframe (used in a later cell) is presumably defined in
# pkg/db.py - confirm.
%run ./pkg/db.py

In [None]:
import nltk
from nltk.corpus import stopwords, words, wordnet
from dotenv import load_dotenv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from stemming.porter2 import stem

In [None]:
# Fetch every NLTK corpus that the import cell pulls from nltk.corpus.
for corpus_name in ('stopwords', 'words', 'wordnet'):
    nltk.download(corpus_name)

In [None]:

# `os` is not imported in any of the visible notebook cells; import it here
# instead of relying on it leaking in from the %run'd helper script.
import os

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()
# os.getenv returns None when POSTGRES_CONNECTION_STRING is not set.
conn_string = os.getenv('POSTGRES_CONNECTION_STRING')

In [None]:
# Load the lyrics data by handing the SQL file name and the connection string
# to get_dataframe (not defined in this notebook; presumably comes from
# pkg/db.py - confirm). Later cells read the 'track_id', 'word' and 'count'
# columns of the result.
df = get_dataframe('lyrics_limit.sql', conn_string)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Drop every row whose word is an NLTK English stopword.
stops = set(stopwords.words('english'))
is_stopword = df['word'].isin(stops)
df_stop = df.loc[~is_stopword]

In [None]:
# Print a summary of the frame after stopword removal (shows remaining rows).
df_stop.info()

In [None]:
# Remove non-English words: keep only rows whose word matches the Porter2 stem
# of some WordNet entry.
# NOTE(review): matching against *stemmed* vocabulary assumes df['word'] is
# already stemmed - confirm against the data source.
eng_words = set(wordnet.words())
# Set comprehension deduplicates stems as they are produced instead of first
# materializing a throwaway list of every stem.
eng_words_stem = {stem(word) for word in eng_words}
df_stop_eng = df_stop[df_stop['word'].isin(eng_words_stem)]

In [None]:
# Print a summary of the frame after the English-word filter.
df_stop_eng.info()

In [None]:
# Assemble the list of documents for count vectorizer.
# Each track becomes one space-separated string in which every word is repeated
# `count` times, so that counting tokens reproduces the stored word counts.
doc_list = []
grouped = df_stop_eng.groupby('track_id')
for _, group in grouped:
    tokens = []
    # Iterate the two needed columns directly; DataFrame.iterrows() would build
    # a full Series object for every row, which is far slower.
    for word, count in zip(group['word'], group['count']):
        tokens.extend([word] * count)
    doc_list.append(' '.join(tokens))

In [None]:
# convert document list to count matrix (one row per document in doc_list,
# one column per vocabulary term).
# With default settings CountVectorizer lowercases the input and its default
# token pattern only keeps tokens of two or more word characters, so
# single-character words are dropped from the vocabulary.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(doc_list)

In [None]:
# Apply NMF to the matrix of word counts
num_topics = 10
# Pass an explicit seed so that re-running the notebook yields the same
# topics; without it the estimator draws from the global random state.
model = NMF(n_components=num_topics, init='nndsvd', random_state=0)
model.fit(matrix)

In [None]:
# Show the ten highest-weighted vocabulary terms of every topic, strongest first.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(model.components_):
    print('Topic #%d:' % topic_idx)
    # argsort is ascending: reverse it, then keep the first ten indices.
    strongest = weights.argsort()[::-1][:10]
    print(' '.join(feature_names[i] for i in strongest))