In [2]:
# git config --global credential.helper store

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import itertools
from collections import Counter
import ssl
import time
import string
import unicodedata

from urllib.request import Request, urlopen
from threading import Thread
from bs4 import BeautifulSoup

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

import nltk
nltk.download(["stopwords", "punkt", "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"])
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk import RegexpParser
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

from clean_features import clean_features
from clean_weeks import clean_weeks
from web_scraping import parse_page, store_lyrics, read_lyrics
from nlp_pipeline import clean_lyrics, lyrics_tokenize
from genre_helper_functions import get_bucket, contains_genre_type, create_genre_column
from make_plots import (make_frequency_plot, make_line_plot, make_dual_plot_same,
                        make_dual_plot_mixed, make_scatter)
import modeling_functions as mf

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_percept

In [None]:
features = clean_features()
weeks = clean_weeks()

In [None]:
'''
joined = weeks.merge(features, on='SongID')
#joined.to_csv("data/joined.csv", index=False)

# Expand genres into individual components
featureGenres = features.explode('spotify_genre')
featureGenres = featureGenres[featureGenres['spotify_genre'] != '']

joinedGenres = joined.explode('spotify_genre')
joinedGenres = joinedGenres[joinedGenres['spotify_genre'] != '']

explicitness = joined[['Year', 'spotify_track_explicit']]
explicitness = explicitness.groupby(['Year']).mean().reset_index()

numericalMetrics = joined.columns.tolist()[11:23]
numericals = joined[['Year'] + numericalMetrics].groupby(['Year']).mean().reset_index()
'''

In [None]:
'''
# Normalize numerical features not between 0 and 1
featureGenresNorm = featureGenres.copy()
scaled = ["track_duration", "loudness", "tempo"]
for metric in scaled:
    mms = MinMaxScaler()
    featureGenresNorm[metric] = mms.fit_transform(featureGenresNorm['track_duration']. \
                                to_numpy().reshape(-1, 1))

# Create grouped tables
genres = featureGenres.groupby(['spotify_genre'])['SongID'].count().reset_index()
genresJoined = joinedGenres.groupby(['spotify_genre'])['SongID'].count().reset_index()
genresJoinedDecade = joinedGenres.groupby(['spotify_genre', 'Decade'])['SongID'].count(). \
                        reset_index().sort_values(by="Decade")
genreFeatures = featureGenresNorm.groupby(['spotify_genre'])[numericalMetrics].mean().reset_index()
'''

In [None]:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [24]:
testpage = parse_page("Dance the Night Away", "Twice")

In [None]:
# Web scrape lyrics
featureScrape = features.loc[[contains_genre_type(genre, ["pop", "rock", "metal"]) for genre \
                             in features['spotify_genre']]].reset_index(drop=True)
lyricsMap = {}
threads = []
temp = 0
start = time.time()
# Write scraped lyrics to hashmap, parallelize to save time (thread safe because no unique keys)
#for i in range(temp, temp+50):
for i in range(len(featureScrape)):
    t = Thread(target=store_lyrics, args=(featureScrape['Song'][i],
                featureScrape['Performer'][i], lyricsMap))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
end = time.time()
print(end - start)
scrapedLyrics = pd.DataFrame(lyricsMap.items(), columns=["SongID", "Lyrics"])
scrapedLyrics.to_csv("data/scrapedLyrics.csv", index=False)

In [None]:
# Get list of all improperly formatted songs and save to file
problemSongs = []
for k, v in lyricsMap.items():
    if v[0][0] == "*":
        problemSongs.append([k] + v[2:5])
print(len(featureScrape), len(problemSongs))

In [None]:
with open("data/problemSongs.txt", "w") as file:
    for s in problemSongs:
        file.write("{}\n".format(s))

In [3]:
###################

In [4]:
# Read csv of previously outputted scraped lyrics and reformat to match original
allLyrics = read_lyrics()

In [5]:
def valid_lyrics(lyrics: str) -> bool:
    return lyrics[0][0] != "*"
allLyrics = allLyrics[[valid_lyrics(l) for l in allLyrics['Lyrics']]]
allLyrics['Lyrics'] = allLyrics['Lyrics'].map(clean_lyrics)
allLyrics.reset_index(drop=True, inplace=True)

In [10]:
testpage = parse_page("Dance the Night Away", "Twice")
testpage = [line.replace(",", "") for line in testpage]
testpage = clean_lyrics(testpage)
print(testpage)
testpage_tokenized = lyrics_tokenize(testpage)
print(testpage_tokenized)
testpage2 = parse_page("Boy with Luv", "BTS")
testpage2 = [line.replace(",", "") for line in testpage2]
testpage2 = clean_lyrics(testpage2)
print(testpage2)
testpage2_tokenized = lyrics_tokenize(testpage2)
print(testpage2_tokenized)

라라라라라라라 라라라라라라라 라라라라라라라 라라라라라라라 You and me in the moonlight ah 별 꽃 축제 열린 밤 ah 파도 소리를 틀고 춤을 추는 이 순간 이 느낌 정말 딱야 바다야 우리와 같이 놀아 ah 바람아 너도 이쪽으로 와 whoa 달빛 조명 아래서 너와 나와 세상과 다 같이 party all night long yeah it’s good If you wanna have some fun 짭짤한 공기처럼 이 순간의 특별한 행복을 놓치지마 One two three let’s go 저 우주 위로 날아갈 듯 춤추러 가 hey! Let’s dance the night away Let’s dance the night away Yeah One two three let’s go 저 바다 건너 들릴 듯 소리 질러 Let’s dance the night away Dance the night away Let’s dance the night away Dance the night away Let’s dance the night away You and me in this cool night ah 미소 짓는 반쪽 달 ah 그 언젠가 너와 나 저 달 뒷면으로 가 파티를 열기로 약속 yeah it’s good (hey!) If you wanna have some fun 은빛 모래알처럼 이 순간의 특별한 행복을 놓치지 마 One two three let’s go 저 우주 위로 날아갈 듯 춤추러 가 hey! Let’s dance the night away Let’s dance the night away (hey!) Yeah One two three let’s go 저 바다 건너 들릴 듯 소리 질러 (yeah!) Let’s dance the night away 오늘이 마지막인 듯 소리 질러 저 멀리 끝없이 날아오를 듯 힘껏 뛰어 더 높이 오늘이 마지막인 듯 소리 질러 저 멀리 쏟아지는 별빛과 Oh let’s dance the night away (yeah yeah y

In [11]:
# NLP pipeline to create tokens -> bag of words -> corpus
allLyrics['Lyrics_tokenized'] = list(map(lyrics_tokenize, allLyrics['Lyrics']))
allLyrics.to_csv("data/allLyricsTokenized.csv", index=False)



In [12]:
# TF-IDF
corpus = [testpage_tokenized, testpage2_tokenized]
tf = CountVectorizer()
tf_matrix = tf.fit_transform(corpus)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

AttributeError: 'list' object has no attribute 'lower'

In [88]:
print(tf.vocabulary_)
print(tf_matrix.todense()[0:10])

AttributeError: 'CountVectorizer' object has no attribute 'vocabulary_'

In [89]:
print(tfidf.vocabulary_)
print(tfidf_matrix.todense()[0:10])

AttributeError: 'TfidfVectorizer' object has no attribute 'vocabulary_'

In [None]:
model = Sequential()