In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

#### Import our sentiment dataset and reduce its dimensionality
We do this to avoid overfitting on sentiment: we only have a few audio features and want the model to fit primarily on audio, so the sentiment scores are compressed to two principal components.

In [3]:
# Load the pickled sentiment frame from the shared data directory.
sent_df = pd.read_pickle('../Data/combined_sentiment_scores_df')

# Keep the song ids in their own one-column frame so they can be re-attached
# to the reduced features later.
sent_index = sent_df[['spotify_song_id']]

# The six right-most columns are the inputs handed to PCA in the next cell.
sent = sent_df.iloc[:, -6:]

In [4]:
# Project the selected sentiment columns onto two principal components.
pca = PCA(n_components=2)
sent_components = pca.fit_transform(sent)

# Attach the components to the song ids positionally. Wrapping the PCA output
# in a fresh DataFrame and merging on the index only lines up when `sent_df`
# carries a default 0..n-1 index; `assign` with raw arrays keeps row i of the
# ids paired with row i of the components whatever that index is.
# NOTE: this cell rebinds `sent`, so re-run the loading cell before re-running it.
sent = sent_index.assign(sentiment_feature_0=sent_components[:, 0],
                         sentiment_feature_1=sent_components[:, 1])

#### Import our n-grammed lyrics dataset, perform TF-IDF, and reduce its dimensionality
Again, we do this to avoid overfitting on lyric features: we only have a few audio features and want the model to fit primarily on audio.

In [2]:
# Load the pickled lyrics frame and discard its stored index in favour of a
# fresh 0..n-1 one, so later index-based merges line up by row position.
df = (
    pd.read_pickle('../Data/n-gramed_lyrics')
    .reset_index(drop=True)
)

In [5]:
# Learn the TF-IDF vocabulary and build the document-term matrix in a single
# pass. `max_df=0.95` ignores terms found in more than 95% of the lyrics.
tf_idf = TfidfVectorizer(max_df=0.95)
# NOTE: `.toarray()` materialises the full dense (documents x terms) matrix,
# which is memory-hungry for a large vocabulary.
tf_idf_array = tf_idf.fit_transform(df.lyrics).toarray()

# `fit` returns the vectorizer itself, so this alias is exactly what the former
# separate `fit` call produced; it is kept in case other cells refer to it.
tf_idf_vectorizer = tf_idf

# `get_feature_names` was removed in scikit-learn 1.2. Use its replacement and
# fall back only on releases that predate `get_feature_names_out`.
if hasattr(tf_idf, 'get_feature_names_out'):
    tf_idf_terms = tf_idf.get_feature_names_out()
else:
    tf_idf_terms = tf_idf.get_feature_names()

tf_idf_df = pd.DataFrame(tf_idf_array, columns=tf_idf_terms)
tf_idf_df.shape

(15340, 22298)

In [6]:
# Compress the TF-IDF matrix to three principal components.
# With the default `svd_solver='auto'`, scikit-learn may pick the randomized
# SVD for an input this large, so the seed is pinned to make the features
# reproducible across kernel restarts.
pca = PCA(n_components=3, random_state=0)

# Read from `tf_idf_array` (the dense matrix built in the TF-IDF cell) rather
# than from `tf_idf_df`, which this cell overwrites. That keeps the cell
# idempotent: re-running it no longer applies PCA to its own 3-column output.
tf_idf_df = pd.DataFrame(pca.fit_transform(tf_idf_array),
                         columns=['tfidf_feature_0', 'tfidf_feature_1', 'tfidf_feature_2'])

#### We can now combine all of our lyric features

In [7]:
# Attach the reduced TF-IDF features by row position (both frames share the
# same index), then bring in the reduced sentiment features by song id.
# NOTE: this rebinds `df`, so re-running the cell merges the features a second time.
df = (
    df
    .merge(tf_idf_df, left_index=True, right_index=True)
    .merge(sent, on='spotify_song_id')
)

In [8]:
# Persist the combined lyric-feature frame, then preview its first two rows.
features_path = '../Data/tf_idf_sent_pca_features'
df.to_pickle(features_path)
df.head(n=2)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,...,unaltered_artist_name,lyrics,genius_song_id,genius_song_url,genius_artist_id,tfidf_feature_0,tfidf_feature_1,tfidf_feature_2,sentiment_feature_0,sentiment_feature_1
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,...,Suburban Kids With Biblical Names,old chance get gonna marry marry get act toget...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,-0.038104,-0.035694,0.01979,-0.566828,-0.140975
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,...,Suburban Kids With Biblical Names,bigger everything ever done x found reason sta...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,-0.03318,0.000825,-0.03385,-0.650236,0.045182
