In [14]:
import pandas as pd

data = pd.read_csv('song_data.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data=data[data['Hip-Hop/Rap']==1]
data.reset_index(drop=True, inplace=True)

In [15]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# docs=data_old.index.to_list()
docs=data.index.to_list()

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in data.index.to_list():
    docs[idx] = data.at[idx, 'LYRICS'].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

In [16]:
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one or two characters.
docs = [[token for token in doc if len(token) > 2] for doc in docs]

In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [19]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [20]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [21]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [22]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1067
Number of documents: 754


In [23]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 3
passes = 30
iterations = 500
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [24]:
top_topics = model.top_topics(corpus, topn=10) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.2166.
[([(0.011779952, 'youre'),
   (0.009827721, 'way'),
   (0.0092895795, 'man'),
   (0.008502136, 'need'),
   (0.00821081, 'tell'),
   (0.008146796, 'hey'),
   (0.007946965, 'never'),
   (0.0077284407, 'ill'),
   (0.0076186634, 'give'),
   (0.00749161, 'gotta')],
  -0.8982348488138864),
 ([(0.023906128, 'bitch'),
   (0.018651875, 'shit'),
   (0.01701901, 'fuck'),
   (0.01380316, 'ayy'),
   (0.012625215, 'money'),
   (0.007863798, 'hey'),
   (0.0077253003, 'ooh'),
   (0.0074824123, 'gon'),
   (0.007017716, 'life'),
   (0.006418251, 'big')],
  -1.0816973436097568),
 ([(0.016623303, 'boy'),
   (0.015663745, 'hot'),
   (0.013390103, 'woo'),
   (0.012220253, 'walk'),
   (0.01077965, 'ride'),
   (0.0100454, 'low'),
   (0.009333121, 'watch'),
   (0.00891482, 'said'),
   (0.008875465, 'dat'),
   (0.008820154, 'hit')],
  -1.6697709286855444)]


In [61]:
topic_dist = list(model.get_document_topics(corpus))[6]
topic_dist

[(1, 0.2821305), (2, 0.7169467)]

In [65]:
topic_dist[0][1]

0.2821305

In [93]:
df_new = pd.DataFrame(data)
df_new['Topic 0'] = 0.0
df_new['Topic 1'] = 0.0
df_new['Topic 2'] = 0.0

for i in range(len(topic_dist)):
  topic_dist = list(model.get_document_topics(corpus[i]))

  for j in range(len(topic_dist)):
    if j == 0:
      df_new.at[i, 'Topic 0'] = topic_dist[0][1]
    elif j == 1:
      df_new.at[i, 'Topic 1'] = topic_dist[1][1]
    elif j == 2:
      df_new.at[i, 'Topic 2'] = topic_dist[2][1]



In [104]:
df_new.drop(columns=0, inplace=True)

In [109]:
X = df_new[['Topic 0', 'Topic 1', 'Topic 2']]
X

Unnamed: 0,Topic 0,Topic 1,Topic 2
0,0.998095,0.000000,0.000000
1,0.997421,0.000000,0.000000
2,0.997994,0.000000,0.000000
3,0.997478,0.000000,0.000000
4,0.997993,0.000000,0.000000
...,...,...,...
3236,0.350898,0.391917,0.257184
3237,0.108732,0.666010,0.225258
3238,0.953490,0.044945,0.000000
3239,0.437119,0.561114,0.000000


In [122]:
#### Do the actual clustering ####
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=5, verbose=False, )
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=3, n_init=5, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=False)

In [125]:
print(km.cluster_centers_)

[[0.24749983 0.71896821 0.03275805]
 [0.91485214 0.07618759 0.00728217]
 [0.2830022  0.24443722 0.47256058]]


In [127]:
import matplotlib.pyplot as plt

# Plotting the cluster centers and the data points on a 2D plane
plt.scatter(X[:, 0], X[:, -1])
    
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x')
    
plt.title('Data points and cluster centroids')
plt.show()

TypeError: ignored