In [None]:
#pip install pyspark

In [1]:
# Import dependencies
from sklearn.preprocessing import LabelEncoder
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import pandas as pd
import findspark

In [2]:
# Add encoded category column
song_lyrics_df = pd.read_csv('../preliminary_dataframes/song_lyrics2.csv')
le = LabelEncoder()
new_column = le.fit_transform(song_lyrics_df['category']) 
song_lyrics_df.insert(4, 'category_id', new_column)

# Remove non_alpha_words column
song_lyrics_df = song_lyrics_df.drop('non_alpha_words', axis=1)

# Save DataFrame to CSV
song_lyrics_df.to_csv('encoded_df.csv', index=False)
song_lyrics_df.head()

Unnamed: 0,song,artist,artist_id,category,category_id,genres,filterd_genres,lyrics
0,positions,Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,toplists,35,"['dance', 'pop', 'post-teen']","['dance', 'pop']",Heaven sent you to me Im just hopin I dont re...
1,Therefore I Am,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,toplists,35,"['pop', 'electropop']",['pop'],Im not your friend Or anything damn You think...
2,Monster (Shawn Mendes & Justin Bieber),Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,toplists,35,"['dance', 'viral', 'pop', 'post-teen', 'canadi...","['dance', 'viral', 'pop', 'canadian']",You put me on a pedestal and tell me Im the b...
3,HOLIDAY,Lil Nas X,7jVv8c5Fj3E9VhNjxT4snq,toplists,35,"['queer', 'lgbtq+', 'pop', 'country']","['pop', 'country']",T T Tay Keith Took it to ten Hey Ayy its a ...
4,On Me,Lil Baby,5f7VJjfbwm532GiveGC0ZK,toplists,35,"['rap', 'atl']","['rap', 'atl']",Whats happenin Chi Chi Fill the bando up wi...


In [3]:
# Start Spark session
spark = SparkSession.builder.appName('lyrics_NLP').getOrCreate()

OSError: [Errno 62] Too many levels of symbolic links: '/opt/spark/./bin/spark-submit'

In [4]:
# Read in DataFrame
music_df = spark.read.csv('encoded_df.csv', sep=",", header=True)

# Remove songs without lyrics
#music_df = music_df.na.drop()
music_df.shape()

NameError: name 'pyspark' is not defined

In [37]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [38]:
# Tokenize lyrics
tokenizer = Tokenizer(inputCol='lyrics', outputCol='words')
tokenized_df = tokenizer.transform(music_df)
tokenized_df.show()

+--------------------+-------------+--------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|
+--------------------+-------------+--------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|
|              Dakiti|    Bad Bunny|      34| Baby ya yo me en...|[, baby, ya, yo, ...|
|             Errbody|     Lil Baby|      34| Flyer than every...|[, flyer, than, e...|
|             Whoopty|           CJ|      34| Loyalty over roy...|[, loyalty, over,...|
|    Drankin N Smokin|       Future|      34|" Saggin out the ...|[", saggin, out, ...|
|               On Me|     Lil B

In [39]:
# Remove stop words
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
removed_df = remover.transform(tokenized_df)
removed_df.show()

+--------------------+-------------+--------+--------------------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|            filtered|
+--------------------+-------------+--------+--------------------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|[, put, pedestal,...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|[, heaven, sent, ...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|[, im, friend, an...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|[, billboard, bab...|
|              Dakiti|    Bad Bunny|      34| Baby ya yo me en...|[, baby, ya, yo, ...|[, baby, ya, yo, ...|
|             Errbody|     Lil Baby|      34| Flyer than every...|[, flyer, than, e...|[, flyer, everybo...|
|             Whoop

In [40]:
# TF-IDF
hashing = HashingTF(inputCol='filtered', outputCol='hashedValues', numFeatures=pow(2,18))
hashed_df = hashing.transform(removed_df)
idf = IDF(inputCol='hashedValues', outputCol='features')
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)
#rescaledData.select('words', 'features').show()
rescaledData.show()

+--------------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                song|       artist|category|              lyrics|               words|            filtered|        hashedValues|            features|
+--------------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Monster (Shawn Me...| Shawn Mendes|      34| You put me on a ...|[, you, put, me, ...|[, put, pedestal,...|(262144,[3386,392...|(262144,[3386,392...|
|           positions|Ariana Grande|      34| Heaven sent you ...|[, heaven, sent, ...|[, heaven, sent, ...|(262144,[7231,218...|(262144,[7231,218...|
|      Therefore I Am|Billie Eilish|      34| Im not your frie...|[, im, not, your,...|[, im, friend, an...|(262144,[18176,22...|(262144,[18176,22...|
|Levitating (feat....|     Dua Lipa|      34| Billboard Baby D...|[, billboard, bab...|[, bill

In [41]:
# Save DataFrame to CSV file
rescaledData.toPandas().to_csv('../preliminary_dataframes/nlp_df.csv')