In [None]:
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.cache_handler import CacheFileHandler

# Spotify Web API client authenticated through SpotifyClientCredentials.
# The client id and secret are read from the environment, and the credentials
# manager is given a CacheFileHandler created with username='keatonconrad'.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=os.environ['SPOTIFY_CLIENT_ID'],
        client_secret=os.environ['SPOTIFY_CLIENT_SECRET'],
        cache_handler=CacheFileHandler(username='keatonconrad'),
    )
)

In [None]:
# Read the complete project dataset from the working directory, list its
# columns, then display summary statistics as the cell output.
full_df = pd.read_csv(filepath_or_buffer='./complete_project_data.csv')
print(full_df.columns)
full_df.describe()

In [None]:
# Rows whose Label equals 1, followed by their summary statistics.
hit_df = full_df[full_df['Label'].eq(1)]
hit_df.describe()

In [None]:
# Rows whose Label equals 0, followed by their summary statistics.
no_hit_df = full_df[full_df['Label'].eq(0)]
no_hit_df.describe()

In [None]:
# `ttest_ind` was previously used without being imported, which raises a
# NameError (not the ValueError handled below) on a fresh kernel.
from scipy.stats import ttest_ind

# Independent two-sample t-test of every column, comparing the Label == 1 rows
# (hit_df) with the Label == 0 rows (no_hit_df). One line is printed per column
# that can be tested.
for column in full_df.columns:
    try:
        result = ttest_ind(
            hit_df[column].astype(float),
            no_hit_df[column].astype(float),
        )
    except ValueError:
        # Raised when the column cannot be converted to float (or the test
        # rejects the input); such columns are skipped.
        continue
    print(f'{column} - T: {result.statistic}, p: {result.pvalue}')

In [None]:
# Work on the first 500 rows only (positional slice; the same rows that
# head(500) returns).
df = full_df.iloc[:500]

In [None]:
import lyricsgenius

# Genius API client used for the lyric lookups below.
# The access token is read from the GENIUS_ACCESS_TOKEN environment variable,
# mirroring how the Spotify credentials are loaded, so that no secret is stored
# in the notebook. A missing variable raises KeyError here instead of failing
# later on the first API call.
# NOTE(review): a token that was previously committed in this cell should be
# treated as leaked and revoked in the Genius API client settings.
genius = lyricsgenius.Genius(os.environ['GENIUS_ACCESS_TOKEN'])

In [None]:
import re
from textblob import TextBlob

# One entry per row of `df`, in row order. A row whose lyrics could not be
# fetched or analysed gets None in all three lists.
polarity = []
subjectivity = []
lyrics = []

for _, row in df.iterrows():
    row_polarity = row_subjectivity = row_lyrics = None
    try:
        # Keep the search result in its own name instead of rebinding the row.
        match = genius.search_song(row['Track'], row['Artist'])
        if match is not None:
            # Drop bracketed tag lines ('[...]' followed by a newline) and
            # flatten the remaining text onto a single line.
            cleaned = re.sub(r'\[.*?\]\n', '', match.lyrics).replace('\n', ' ')
            sentiment = TextBlob(cleaned).sentiment
            row_polarity = sentiment[0]
            row_subjectivity = sentiment[1]
            row_lyrics = cleaned
    except Exception as exc:
        # Best-effort lookup: report the failure and continue with the next
        # row. Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt still stop this long-running loop.
        print(f"Skipping {row.get('Track')!r} by {row.get('Artist')!r}: {exc!r}")
        row_polarity = row_subjectivity = row_lyrics = None

    # A single append site keeps the three lists the same length as `df`.
    polarity.append(row_polarity)
    subjectivity.append(row_subjectivity)
    lyrics.append(row_lyrics)

In [None]:
# Rebuild `df` as a DataFrame and attach the per-row sentiment scores and
# lyric text collected by the lookup loop.
df = pd.DataFrame(df).assign(
    Polarity=polarity,
    Subjectivity=subjectivity,
    Lyrics=lyrics,
)

In [None]:
# Discard every row that has a missing value in any column, then summarise
# what is left.
df = df.dropna()
df.describe()

In [None]:
import sklearn as sk
from sklearn.model_selection import train_test_split

# Target: the Label column. Features: every column except the ones dropped
# below.
y_data = df['Label']
x_data = df.drop(
    columns=['Label', 'Artist', 'Track', 'Year', 'Month', 'ArtistScore', 'Lyrics']
)

# 85/15 split without shuffling: the last 15% of rows (in their current order)
# become the test set.
x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    x_data,
    y_data,
    test_size=0.15,
    shuffle=False,
)
print(len(y_data_train))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same scaling
# to both splits so nothing is learned from the test set.
scaler_model = MinMaxScaler().fit(x_data_train)

# transform() returns a bare array, so wrap each result back into a DataFrame
# carrying the original column names and row index.
x_data_train, x_data_test = (
    pd.DataFrame(
        scaler_model.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )
    for frame in (x_data_train, x_data_test)
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the scaled features; the cell output is the test-set
# score rounded to four decimals.
LR = LogisticRegression(random_state=0)
LR.fit(x_data_train, y_data_train)
round(LR.score(x_data_test, y_data_test), ndigits=4)

In [None]:
# Import the estimator explicitly: `import sklearn as sk` alone does not
# guarantee that the `sklearn.svm` submodule has been loaded, so `sk.svm` could
# fail with AttributeError depending on which other imports ran first.
from sklearn.svm import LinearSVC

# Linear support-vector classifier on the scaled features. random_state is
# fixed, as for the other models in this notebook, so reruns are reproducible.
# The cell output is the test-set score rounded to four decimals.
SVM = LinearSVC(random_state=0)
SVM.fit(x_data_train, y_data_train)
round(SVM.score(x_data_test, y_data_test), 4)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees limited to depth 2, with a fixed seed; the cell
# output is the test-set score rounded to four decimals.
RF = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(x_data_train, y_data_train)
round(RF.score(x_data_test, y_data_test), ndigits=4)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with hidden layers of 5 and 2 units, trained with the
# L-BFGS solver and a fixed seed; the cell output is the test-set score rounded
# to four decimals.
NN = MLPClassifier(
    solver='lbfgs',
    alpha=1e-1,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(x_data_train, y_data_train)
round(NN.score(x_data_test, y_data_test), ndigits=4)

In [1]:
from platform import python_version

# Print the version of the running Python interpreter.
print(python_version())

# Dependencies of the EmbeddingGenerator class defined below.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import boto3

# Module-level S3 resource; EmbeddingGenerator.load_pretrained_embedding uses it
# to read an embedding file from a bucket when no local filename is given.
s3 = boto3.resource('s3')


class EmbeddingGenerator:
    """
    Tokenizes texts and builds an embedding matrix from pretrained vectors.

    Attributes:
        word_index: Mapping of token -> integer index (from the tokenizer)
        embeddings_index: Mapping of token -> pretrained vector (numpy array)
        embedding_matrix: 2D numpy array of shape
            (len(word_index) + 1, embedding_dimension) once generated;
            an empty dict until then
        vocab_size: len(word_index) + 1 (index 0 is left for padding)
        embedding_dimension: Length of the pretrained vectors
        max_sequence_len: Length every sequence is padded/truncated to
        tokenizer: The underlying Keras Tokenizer
    """

    def __init__(self, max_sequence_len=50, texts=None, filename=None,
                 embedding_s3_bucket=None, embedding_s3_key=None, num_words=None,
                 char_level=False):
        """
        Sets up the tokenizer and, depending on the arguments, eagerly builds
        the word index, the embeddings index and the embedding matrix.

        Args:
            max_sequence_len: Length sequences are padded/truncated to
            texts: Optional array of strings to build the word index from
            filename: Optional path to a local pretrained embedding file
            embedding_s3_bucket: Optional S3 bucket holding the embedding file
            embedding_s3_key: Optional S3 key of the embedding file
            num_words: Passed through to the Keras Tokenizer
            char_level: Passed through to the Keras Tokenizer
        """
        self.word_index = {}
        self.embeddings_index = {}
        # Stays an empty dict until generate_embedding_matrix() replaces it
        # with a numpy array
        self.embedding_matrix = {}
        self.vocab_size = 0
        self.embedding_dimension = 0
        self.max_sequence_len = max_sequence_len
        self.tokenizer = Tokenizer(
            num_words=num_words,
            filters='\t\n',
            char_level=char_level,
            lower=False,
            oov_token='<unknown>'  # Sets words it doesn't know to this value
        )
        if texts is not None:
            self.generate_word_index(texts)
        if filename or (embedding_s3_bucket and embedding_s3_key):
            self.load_pretrained_embedding(filename, embedding_s3_bucket,
                                           embedding_s3_key)
            # The matrix needs both the word index and the embeddings index
            if texts is not None:
                self.generate_embedding_matrix()

    def generate_word_index(self, texts):
        """
        Creates the word index from the given texts.

        Args:
            texts: Array of strings
        Returns:
            The generated word index
        """

        self.tokenizer.fit_on_texts(texts)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index) + 1
        return self.word_index

    def generate_sequences(self, texts):
        """
        Transforms texts into sequences of word indices.
        Pads or truncates sequences (at the end) so that they all have
        length max_sequence_len.
        Only callable after word index has been generated.

        Args:
            texts: Array of strings
        Returns:
            The padded sequences as a 2D-array of integer word indices
        """

        sequences = self.tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=self.max_sequence_len,
                               padding='post', truncating='post')
        return padded

    def load_pretrained_embedding(self, filename=None, s3_bucket=None, s3_key=None):
        """
        Loads a pretrained embeddings index from a local file or from S3.
        Assumes one line per embedding, starting with the word and
        followed by coefficients (separated by whitespace). Blank lines
        are ignored. A local filename takes precedence over S3.

        Args:
            filename: Path to embedding
            s3_bucket: S3 Bucket where embedding is stored
            s3_key: Key of embedding file stored on S3
        Returns:
            The loaded embeddings index
        Raises:
            ValueError: If neither a filename nor a bucket/key pair is given
        """

        if filename:
            with open(filename, encoding='utf-8') as f:
                self._index_embedding_lines(f)
        elif s3_bucket and s3_key:
            body = s3.Object(s3_bucket, s3_key).get()['Body']
            # iter_lines() yields bytes; decode lazily, one line at a time
            self._index_embedding_lines(
                raw.decode('utf-8') for raw in body.iter_lines()
            )
        else:
            raise ValueError(
                'Provide either filename or both s3_bucket and s3_key'
            )
        return self.embeddings_index

    def _index_embedding_lines(self, lines):
        """
        Parses embedding lines into embeddings_index and records the
        vector length of the last parsed line as embedding_dimension.

        Args:
            lines: Iterable of text lines ("word c1 c2 ... cn")
        """

        coefs = []
        for line in lines:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file)
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[values[0]] = coefs
        self.embedding_dimension = len(coefs)

    def generate_embedding_matrix(self):
        """
        Creates embedding matrix from word and embeddings indices.
        Row i holds the pretrained vector of the word with index i.

        Returns:
            Generated embedding matrix as 2D numpy array
        """

        embedding_matrix = np.zeros((len(self.word_index) + 1, self.embedding_dimension))
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # Words not found will stay all-zeros
                embedding_matrix[i] = embedding_vector
        self.embedding_matrix = embedding_matrix
        return self.embedding_matrix


3.9.4


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Keras symbols used by this cell. They were previously referenced without any
# import, so the cell raised NameError on a fresh kernel.
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    MaxPooling1D,
    concatenate,
)
from tensorflow.keras.models import Model

# NOTE(review): X_train, X_eval, seqs_train, seqs_eval, y_train, y_eval and
# `embedding` are not defined in any cell visible above. Presumably `embedding`
# is an EmbeddingGenerator and seqs_* come from its generate_sequences();
# confirm and define them before this cell.

print('Building model...')
# Branch 1: dense layer over the tabular features (one input per X_train column)
aux_input = Input(shape=(len(X_train.columns),))
aux = Dense(32, activation="relu")(aux_input)
aux = BatchNormalization()(aux)
aux = Model(inputs=aux_input, outputs=aux)

# Branch 2: embedding + 1D convolution over the padded token sequences.
# The embedding layer starts from the pretrained matrix and stays trainable.
emb_input = Input(shape=(embedding.max_sequence_len,))
emb = Embedding(embedding.vocab_size, embedding.embedding_dimension,
                input_length=embedding.max_sequence_len,
                weights=[embedding.embedding_matrix], trainable=True)(emb_input)
emb = Conv1D(filters=32, kernel_size=2, activation='relu')(emb)
emb = MaxPooling1D(4)(emb)
emb = Flatten()(emb)
emb = BatchNormalization()(emb)
emb = Model(inputs=emb_input, outputs=emb)

# Combined: concatenate both branch outputs and feed them through two
# dense/batch-norm/dropout stages into a single output unit
combined = concatenate([aux.output, emb.output])
z = Dense(32, activation="relu")(combined)
z = BatchNormalization()(z)
z = Dropout(.1)(z)
z = Dense(32, activation="relu")(z)
z = BatchNormalization()(z)
z = Dropout(.1)(z)
# NOTE(review): single ReLU output trained with mean absolute error; if the
# target is a 0/1 label, a sigmoid output with a cross-entropy loss may have
# been intended. Confirm against the definition of y_train.
z = Dense(1, activation="relu")(z)

model = Model(inputs=[aux.input, emb.input], outputs=z)

optimizer = keras.optimizers.Adam(.05)

model.compile(loss='mean_absolute_error', optimizer=optimizer)

# NOTE(review): both callbacks use patience=5 and training runs for epochs=1,
# so neither can trigger as configured.
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=.2, patience=5, min_lr=0.001)

print(X_train.shape)
print(seqs_train.shape)

history = model.fit(
    [X_train, seqs_train],
    y_train,
    validation_data=([X_eval, seqs_eval], y_eval),
    batch_size=512,
    epochs=1,
    shuffle=True,
    callbacks=[es, rlr]
)

# evaluate() is called on a model compiled with a loss and no metrics, so
# despite their names these presumably hold loss values, not predictions.
pred_train = model.evaluate([X_train, seqs_train], y_train)
pred_test = model.evaluate([X_eval, seqs_eval], y_eval)
# astype('int') truncates toward zero (e.g. 0.9 -> 0); it does not round.
y_pred = model.predict([X_eval, seqs_eval]).astype('int').flatten()