In [None]:
#pip install pyspark

In [1]:
# Import dependencies
import os
import findspark
import pandas as pd

In [35]:
# Encode category column and save df
from sklearn.preprocessing import LabelEncoder
music_df = pd.read_csv('music_df.csv')
le = LabelEncoder()
music_df['category'] = le.fit_transform(music_df['category']) 
# Clean the lyrics column of the DataFrame
music_df['lyrics'] = music_df['lyrics'].str.replace('  ', ' ')
music_df['lyrics'] = music_df['lyrics'].str.replace('  ', ' ')
music_df['lyrics'] = music_df['lyrics'].str.replace('-', ' ')
music_df['lyrics'] = music_df['lyrics'].str.replace(',', '')
music_df['lyrics'] = music_df['lyrics'].str.replace(')', '')
music_df['lyrics'] = music_df['lyrics'].str.replace('’', '')
music_df.to_csv('encoded_df.csv', index=False)

In [36]:
# Read in DataFrame
from pyspark import SparkFiles
music_df = spark.read.csv('encoded_df.csv', sep=",", header=True)
# Remove songs without lyrics
music_df = music_df.na.drop()
music_df.show()

+--------------------+-------------+--------+--------------------+
|                song|       artist|category|              lyrics|
+--------------------+-------------+--------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|
|           positions|Ariana Grande|      34| Heaven sent you ...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|
|              Dakiti|    Bad Bunny|      34| Baby ya yo me en...|
|             Errbody|     Lil Baby|      34| Flyer than every...|
|             Whoopty|           CJ|      34| Loyalty over roy...|
|    Drankin N Smokin|       Future|      34|" Saggin out the ...|
|               On Me|     Lil Baby|      34|" Whats happenin ...|
|            PRACTICE|       DaBaby|      34|" Okay Boom boom ...|
|          Regardless|         RAYE|      34| Oh regard— la la...|
|Save My Life (fea...| David Guetta|      34|One last time One

In [37]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [38]:
# Tokenize lyrics
tokenizer = Tokenizer(inputCol='lyrics', outputCol='words')
tokenized_df = tokenizer.transform(music_df)
tokenized_df.show()

+--------------------+-------------+--------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|
+--------------------+-------------+--------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|
|              Dakiti|    Bad Bunny|      34| Baby ya yo me en...|[, baby, ya, yo, ...|
|             Errbody|     Lil Baby|      34| Flyer than every...|[, flyer, than, e...|
|             Whoopty|           CJ|      34| Loyalty over roy...|[, loyalty, over,...|
|    Drankin N Smokin|       Future|      34|" Saggin out the ...|[", saggin, out, ...|
|               On Me|     Lil B

In [39]:
# Remove stop words
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
removed_df = remover.transform(tokenized_df)
removed_df.show()

+--------------------+-------------+--------+--------------------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|            filtered|
+--------------------+-------------+--------+--------------------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|[, put, pedestal,...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|[, heaven, sent, ...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|[, im, friend, an...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|[, billboard, bab...|
|              Dakiti|    Bad Bunny|      34| Baby ya yo me en...|[, baby, ya, yo, ...|[, baby, ya, yo, ...|
|             Errbody|     Lil Baby|      34| Flyer than every...|[, flyer, than, e...|[, flyer, everybo...|
|             Whoop

In [40]:
# TF-IDF
hashing = HashingTF(inputCol='filtered', outputCol='hashedValues', numFeatures=pow(2,18))
hashed_df = hashing.transform(removed_df)
idf = IDF(inputCol='hashedValues', outputCol='features')
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)
#rescaledData.select('words', 'features').show()
rescaledData.show()

+--------------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|            filtered|        hashedValues|            features|
+--------------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|[, put, pedestal,...|(262144,[3386,392...|(262144,[3386,392...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|[, heaven, sent, ...|(262144,[7231,218...|(262144,[7231,218...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|[, im, friend, an...|(262144,[18176,22...|(262144,[18176,22...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|[, bill

In [41]:
# Save DataFrame to CSV file
rescaledData.toPandas().to_csv('../preliminary_dataframes/nlp_df.csv')