# Data Collection

In [None]:
!pip install spotipy
!pip install numpy
!pip install pandas
!pip install pickle-mixin
!pip install tqdm

## Importing Needed Libraries

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import numpy as np
import csv
import pandas as pd
from tqdm.notebook import tqdm
import time
import pickle
import os
import random
import math

# Spotify API credentials are read from the environment instead of being
# committed with the notebook. Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel. Any secret that was ever
# stored in this notebook should be rotated in the Spotify developer dashboard.
missing_credentials = [
    name
    for name in ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET")
    if not os.environ.get(name)
]
if missing_credentials:
    raise RuntimeError(
        "Missing Spotify credentials; set these environment variables: "
        + ", ".join(missing_credentials)
    )
os.environ["SPOTIPY_REDIRECT_URI"] = "http://localhost:8080/"

# OAuth scope requested for the authenticated Spotify client used below.
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [None]:
# Look up a single known track; presumably a quick check that the
# authenticated client is working (the result is shown as the cell output).
sp.track(
    track_id="https://open.spotify.com/track/5pcjystBtalYeqaiXCcgEY?si=757f260f6e6d43bb"
)

## Read Data

In [None]:
# Billboard chart rows; keep every row whose date string does not compare
# below '2008-01-01' (string comparison on the 'date' column).
bb_data = pd.read_csv("data/original/billboard_top100.csv")
bb_data = bb_data[~bb_data["date"].lt("2008-01-01")]

# Tab-separated artist popularity table.
musico_data = pd.read_csv(
    "data/original/musico_artists_popularity.csv",
    sep="\t",
)

## Defining Helper Functions

In [None]:
def get_artist(raw):
    """Return the leading artist from a raw artist credit string.

    The credit is cut at the first "," if there is one, otherwise at the
    first "&". Credits containing neither separator are returned unchanged.

    Args:
        raw: Artist credit string, e.g. "Macklemore & Ryan Lewis".

    Returns:
        The text before the first separator. For "&", a single space directly
        before the "&" is dropped. If nothing precedes the "&", `raw` is
        returned unchanged rather than an empty or truncated name.
    """
    if "," in raw:
        return raw[:raw.index(",")]
    if "&" in raw:
        lead = raw[:raw.index("&")]
        # Only strip the separating space when it is really there; blindly
        # slicing one extra character would eat the last letter of names
        # written without a space ("Simon&Garfunkel").
        if lead.endswith(" "):
            lead = lead[:-1]
        return lead if lead else raw

    return raw

def gen_id(song, artist):
    """Build the track key: song title, the "%:%" separator, then the leading artist."""
    lead_artist = get_artist(artist)
    return "%:%".join((song, lead_artist))

def hit_song(track_hist, verbose=False, weights=[99, 95, 90, 80, 60, 10]):
    """Score a track from its chart history.

    Each history entry is counted once, in the first rank bucket
    (1, 5, 10, 20, 40, 100) whose cutoff is >= the entry's "rank"; entries
    ranked above 100 are not counted. The score is the dot product of the
    bucket counts with `weights`.

    Args:
        track_hist: Iterable of dicts, each with a "rank" value.
        verbose: When true, print the history and the bucket counts.
        weights: Six weights, one per bucket in ascending cutoff order.

    Returns:
        The weighted sum as computed by `np.dot`.
    """
    cutoffs = (1, 5, 10, 20, 40, 100)
    top = dict.fromkeys(cutoffs, 0)

    for entry in track_hist:
        rank = entry["rank"]
        # First cutoff that contains this rank, or None when rank > 100.
        bucket = next((cutoff for cutoff in cutoffs if rank <= cutoff), None)
        if bucket is not None:
            top[bucket] += 1

    if verbose:
        print(track_hist)
        print(top)

    counts = list(top.values())
    return np.dot(counts, weights)

## Get URLs from Spotify

### Utilize Spotify to get URLs

In [None]:
# Map each track key (see gen_id) to the Spotify URL of the first search hit.
urls = {}

for i in tqdm(range(bb_data.shape[0])):
    track = bb_data.iloc[i]
    key = gen_id(track["song"], track["artist"])
    # A song appears once per charting week; only search for it the first time.
    if urls.get(key) is None:
        try:
            query = "artist: " + get_artist(track["artist"]) + " track: " + track["song"]
            hits = sp.search(q=query, type="track", limit=1)["tracks"]["items"]
            urls[key] = hits[0]["external_urls"]["spotify"]
        except Exception:
            # Covers "no search hit" (IndexError) as well as API failures.
            # `Exception` rather than a bare except, so that interrupting the
            # kernel actually stops the loop instead of being recorded as "NA".
            urls[key] = "NA"

# Report every key that still needs a manual lookup.
for key, value in urls.items():
    if value == "NA":
        print("NOT FOUND: " + str(key))
    if value is None:
        print("ERROR: " + str(key))

### Manually Update Some URLs That Weren't Found

In [None]:
# Hand-picked Spotify URLs for tracks the search above did not resolve.
urls.update({
    "You All Over Me (Taylor's Version) (From The Vault)%:%Taylor Swift Featuring Maren Morris":
        "https://open.spotify.com/track/5pcjystBtalYeqaiXCcgEY?si=757f260f6e6d43bb",
    "It's Beginning To Look A Lot Like Christmas%:%Perry Como And The Fontane Sisters With Mitchell Ayres And His Orchestra":
        "https://open.spotify.com/track/2pXpURmn6zC5ZYDMms6fwa?si=4fddbdd48c8f422a",
    "We Are One (Ole Ola) [The 2014 FIFA World Cup Official Song]%:%Pitbull Featuring Jennifer Lopez":
        "https://open.spotify.com/track/1PCvKFPWnTXAe2oaReVUcr?si=1f57b36659ec428c",
})

### Manually Correct Incorrect URLs

In [None]:
man_corrected_urls = pd.read_csv("data/manually_corrected_urls.csv")

# Overlay the manual corrections: each row's "url" replaces (or adds) the
# entry stored under that row's "new_id".
for _, corrected in man_corrected_urls.iterrows():
    urls[corrected["new_id"]] = corrected["url"]

# Keep only entries whose URL is a string; anything else is dropped.
urls_clean = {key: url for key, url in urls.items() if type(url) == str}
urls = urls_clean

### Sample 10 Random Songs as a Check

In [None]:
# Draw 10 keys at random and print each with its URL for a manual spot check.
sample_keys = random.sample(list(urls.keys()), 10)

for sampled_key in sample_keys:
    print(sampled_key, urls[sampled_key])

### Write Pickle and CSV for URLs (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the URL mapping as a pickle for fast reloads.
with open('pickles/urls_dict.pickle', 'wb') as handle:
    pickle.dump(urls, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Also export it as CSV. newline='' is what the csv module expects for files
# it writes (otherwise blank rows appear on Windows); an explicit UTF-8
# encoding keeps non-ASCII titles portable across platforms.
with open('data/spotify_urls.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "url"])
    for key, value in urls.items():
        writer.writerow([key, value])

### Load URLs from Pickle

In [None]:
# Restore the URL mapping from its pickle cache.
with open("pickles/urls_dict.pickle", mode="rb") as urls_file:
    urls = pickle.load(urls_file)

## Finding Track Charting History

### Finding Track Charting History

In [None]:
# Group chart rows by track key; each row contributes one dict holding every
# column except "song", "artist" and "last-week".
tracks_hist = {}

for i in tqdm(range(bb_data.shape[0])):
    track = bb_data.iloc[i]
    new_id = gen_id(track["song"], track["artist"])
    chart_entry = {
        column: track[column]
        for column in track.keys()
        if column not in ["song", "artist", "last-week"]
    }
    tracks_hist.setdefault(new_id, []).append(chart_entry)

### Write Pickle and CSV for Track Histories (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the per-track chart histories as a pickle.
with open('pickles/tracks_hist_dict.pickle', 'wb') as handle:
    pickle.dump(tracks_hist, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export: one row per track, with the chart entries rendered via str()
# and joined by ";". newline='' is required by the csv module for files it
# writes; UTF-8 keeps non-ASCII titles portable.
with open('data/billboard_t100_filtered.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "track_hist"])
    for key, value in tracks_hist.items():
        writer.writerow([key, ";".join(str(entry) for entry in value)])

### Load Track Histories from Pickle

In [None]:
# Restore the per-track chart histories from their pickle cache.
with open("pickles/tracks_hist_dict.pickle", mode="rb") as hist_file:
    tracks_hist = pickle.load(hist_file)

## Find "Hitness" Rating for Each Track Based on Charting History

### Calculate Hitness of Each Song

In [None]:
# Score every track from its chart history, using this notebook's bucket
# weights rather than hit_song's defaults.
hitness = {
    track_id: hit_song(chart_history, weights=[100, 90, 80, 60, 40, 20])
    for track_id, chart_history in tracks_hist.items()
}

### Write Pickle and CSV for Hitness (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the hitness scores as a pickle.
with open('pickles/hitness_dict.pickle', 'wb') as handle:
    pickle.dump(hitness, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export; newline='' is required by the csv module for files it writes,
# and UTF-8 keeps non-ASCII track keys portable.
with open('data/billboard_hitness.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "hitness"])
    for key, value in hitness.items():
        writer.writerow([key, value])

### Load Hitness from Pickle

In [None]:
# Restore the hitness scores from their pickle cache.
with open("pickles/hitness_dict.pickle", mode="rb") as hitness_file:
    hitness = pickle.load(hitness_file)

## Get Audio Features of Each Track Using Spotipy

### Get Audio Features

In [None]:
# Fetch Spotify audio features for every track key.
features = {}

for key in tqdm(tracks_hist.keys()):
    if features.get(key) is None:
        try:
            features[key] = sp.audio_features(urls[key])[0]
        except Exception as exc:
            # A missing URL or a failed request should not abort the whole
            # run; record None so the "missing values" check reports the key.
            print("FAILED: " + str(key) + " (" + repr(exc) + ")")
            features[key] = None

### Check For Missing Values

In [None]:
# List every track key for which no audio features were stored.
for key in features:
    if features[key] is None:
        print("NOT FOUND: " + key)

### Manually Entering Missing Value(s)

In [None]:
# Point the track at a hand-picked Spotify URL and store its audio features.
# The assignment into `features` was previously commented out, so the fetched
# result was only displayed and the entry stayed missing.
memories_key = "Memories%:%David Guetta Featuring Kid Cudi"
urls[memories_key] = "https://open.spotify.com/track/5xYC48nOppVemY6U5GRGTb?si=7ba7d8a23d5e4ded"
features[memories_key] = sp.audio_features(urls[memories_key])[0]
features[memories_key]

### Write Pickle and CSV for Features (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the audio features (previously this dumped `urls` by mistake).
with open('pickles/features_dict.pickle', 'wb') as handle:
    pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Column names come from the first track that actually has features, so a
# leading None entry cannot break the header.
feature_columns = next(
    (list(value.keys()) for value in features.values() if value is not None),
    [],
)

# newline='' is required by the csv module for files it writes.
with open('data/spotify_features.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id"] + feature_columns + ["url"])
    for key, value in features.items():
        if value is None:
            # Report and skip tracks without features instead of crashing on
            # `None.values()`.
            print(key, value)
            continue
        writer.writerow([key] + list(value.values()) + [urls[key]])

### Load Features from Pickle

In [None]:
# Restore the audio features from their pickle cache. This used to assign the
# result to `hitness`, which silently replaced the hitness scores.
# NOTE(review): a features pickle written before the write cell was corrected
# may hold the URL mapping instead of features; regenerate it if so.
with open('pickles/features_dict.pickle', 'rb') as handle:
    features = pickle.load(handle)

## Get Track Names

### Get Track Names with Spotify API

In [None]:
# Look up the Spotify track name for every track key.
tracks = {}

for key in tqdm(tracks_hist.keys()):
    if key not in tracks:
        try:
            tracks[key] = sp.track(urls[key])["name"]
        except Exception as exc:
            # `Exception` rather than a bare except so a kernel interrupt
            # stops the loop; include the key so the failure can be traced.
            print("Error", key, repr(exc))
            tracks[key] = None

### Write Pickle and CSV for Track Names (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the track names as a pickle.
with open('pickles/tracks_dict.pickle', 'wb') as handle:
    pickle.dump(tracks, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export; newline='' is required by the csv module for files it writes.
with open('data/spotify_tracks.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # NOTE(review): the second column holds track names, yet the header says
    # "artist_id". Left unchanged in case downstream readers rely on it —
    # confirm and rename.
    writer.writerow(["new_id", "artist_id"])
    for key, value in tracks.items():
        writer.writerow([key, value])

### Load Track Names from Pickle

In [None]:
# Restore the track names from their pickle cache.
with open("pickles/tracks_dict.pickle", mode="rb") as tracks_file:
    tracks = pickle.load(tracks_file)

## Getting Artist IDs of Artists on Each Song

### Get Artists IDs from Spotify API

In [None]:
# Collect the Spotify artist IDs credited on every track.
artists = {}

for key in tqdm(tracks_hist.keys()):
    try:
        artists[key] = [artist["id"] for artist in sp.track(urls[key])["artists"]]
    except Exception:
        # None marks a failed lookup. `Exception` rather than a bare except
        # so a kernel interrupt stops the loop.
        artists[key] = None

### Write Pickle and CSV for Artist IDs (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the artist ID lists as a pickle.
with open('pickles/artists_dict.pickle', 'wb') as handle:
    pickle.dump(artists, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export; each artist ID list is written as a single cell.
# newline='' is required by the csv module for files it writes.
with open('data/spotify_artists.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "artist_id"])
    for key, value in artists.items():
        writer.writerow([key, value])

### Load Artist IDs from Pickle

In [None]:
# Restore the artist ID lists from their pickle cache.
with open("pickles/artists_dict.pickle", mode="rb") as artists_file:
    artists = pickle.load(artists_file)

## Getting Charting Years of Each Track

### Find Charting Years for Each Track

In [None]:
# Year (first four characters of "date") of the first and last chart entry
# stored for each track. The entries live in `tracks_hist`; `tracks` only
# holds track-name strings, so indexing it with ["date"] raised a TypeError.
# NOTE(review): which of the two is the earlier year depends on the row order
# of the Billboard data — confirm.
charting_years = {}

for key in tqdm(tracks_hist.keys()):
    history = tracks_hist[key]
    charting_years[key] = [history[0]["date"][:4], history[-1]["date"][:4]]

### Write Pickle and CSV for Charting Years (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the charting years as a pickle.
with open('pickles/dates_dict.pickle', 'wb') as handle:
    pickle.dump(charting_years, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export; newline='' is required by the csv module for files it writes.
with open('data/spotify_charting_years.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "chart_start", "chart_end"])
    for key, value in charting_years.items():
        # NOTE(review): chart_start is taken from index 1 and chart_end from
        # index 0, while the popularity cell treats index 0 as the lower year
        # bound — confirm the row order of the Billboard data.
        writer.writerow([key, value[1], value[0]])

### Load Charting Years from Pickle

In [None]:
# Restore the charting years from their pickle cache.
with open("pickles/dates_dict.pickle", mode="rb") as years_file:
    charting_years = pickle.load(years_file)

## Getting Popularity of Artists

### Calculate Max Artist Popularity for Each Song

In [None]:
# For each track, take the highest mean "year_end_score" among its artists
# within the track's year window; -1 means no score was found.
#
# The popularity table is loaded as `musico_data` in the "Read Data" cell.
# The previous reference to an undefined `musico_popularity_data` raised a
# NameError that a bare `except: pass` swallowed, leaving every score at -1.
popularity = {}

for key in tqdm(tracks_hist.keys()):
    popularity[key] = -1
    try:
        year_from = int(charting_years[key][0])
        year_to = max(2018, int(charting_years[key][1]))
    except (KeyError, IndexError, TypeError, ValueError):
        # No usable charting years for this track: keep the -1 sentinel.
        continue

    # The year window is the same for every artist on the track; filter once.
    in_window = musico_data.loc[
        (musico_data["year"] >= year_from) & (musico_data["year"] <= year_to)
    ]
    # `artists` stores None for tracks whose lookup failed; treat as no artists.
    for artist in artists.get(key) or []:
        scores = in_window.loc[in_window["artist_id"] == artist, "year_end_score"]
        if scores.empty:
            continue
        # A NaN mean never compares greater, so it leaves the current value.
        popularity[key] = max(popularity[key], scores.mean())

### Write Pickle and CSV for Artist Popularity (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the popularity scores (previously this dumped `urls` by mistake).
with open('pickles/popularity_dict.pickle', 'wb') as handle:
    pickle.dump(popularity, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export; newline='' is required by the csv module for files it writes.
with open('data/musico_popularity.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "popularity"])
    for key, value in popularity.items():
        writer.writerow([key, value])

### Load Artist Popularity from Pickle

In [None]:
# Restore the popularity scores from their pickle cache.
with open("pickles/popularity_dict.pickle", mode="rb") as popularity_file:
    popularity = pickle.load(popularity_file)

## Getting Lyrics

In [None]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the
# original author's machine; consider a path relative to the project.
dataset = pd.read_csv(
    filepath_or_buffer="/Users/hyungjaekim/Data Science Academy/music/group1_hit_songs_dataset_raw.csv"
)

### Install lyricsgenius Package

In [None]:
!pip install lyricsgenius

### Setup Genius API

In [None]:
from lyricsgenius import Genius

# The Genius API token is read from the environment instead of being committed
# with the notebook. Export GENIUS_ACCESS_TOKEN before starting the kernel.
# Any token that was ever stored in this notebook should be revoked and
# regenerated.
token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not token:
    raise RuntimeError(
        "Missing Genius API token; set the GENIUS_ACCESS_TOKEN environment variable."
    )
genius = Genius(token, verbose=False)

### Get Lyrics from Genius

In [None]:
# Fetch lyrics text from Genius for every track, searching by the Spotify
# track name and the name of the first credited artist.
lyrics = {}

for key in tqdm(urls.keys()):
    if lyrics.get(key) is None:
        try:
            track = tracks[key]
            artist = sp.artist(artists[key][0])["name"]
            found_song = genius.search_song(title=track, artist=artist)
            # Store the lyrics text itself (as the older, disabled version
            # below did), because the sentiment step calls .splitlines() on
            # each stored value. None marks "no match found".
            lyrics[key] = found_song.lyrics if found_song is not None else None
        except Exception:
            # `Exception` rather than a bare except so a kernel interrupt
            # stops the loop; the printed key identifies the failed track.
            print(key)
            lyrics[key] = None

# The string literal below is an earlier, disabled implementation kept for
# reference; it is never executed.
"""
#run through every song in the database
for i in tqdm(range(data.shape[0])):
    #retrieve all song information
    track = data.iloc[i]["new_id"]
    #extract song title
    song = track.partition("%:%")[0]
    #extract artist
    artist = track.partition("%:%")[2]
    #If it's featuring, change the artist to just the name of the main artist
    if "Featuring" in artist:
      artist = artist.partition("Featuring")[0]
      # if track not in lyrics.keys() or lyrics[track] == None:   
    try:
        #Search the genius API for the song by artist and song name and extract lyrics
        lyrics[track] = genius.search_artist(artist, max_songs=1, sort="title").song(song).lyrics
        #Put into a pickle
        with open('lyrics.pickle', 'wb') as handle:
          pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except:
        #If error raised make it NA
        lyrics[track] = "NA"
"""

### Write Pickle for Lyrics (DO NOT RUN IF YOU ARE LOADING FROM PICKLE)

In [None]:
# Cache the fetched lyrics as a pickle.
with open("pickles/lyrics.pickle", mode="wb") as lyrics_file:
    pickle.dump(lyrics, lyrics_file, protocol=pickle.HIGHEST_PROTOCOL)

### Load Lyrics from Pickle

In [None]:
# Restore the lyrics from their pickle cache.
with open("pickles/lyrics.pickle", mode="rb") as lyrics_file:
    lyrics = pickle.load(lyrics_file)

### Write Incorrect URLs in Need of Manual Search

In [None]:
# List every track without lyrics, leaving the url column empty so it can be
# filled in by hand. newline='' is required by the csv module for files it
# writes.
with open('data/incorrect_urls.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "url"])
    for key, value in lyrics.items():
        if value is None:
            writer.writerow([key, None])

### Count Number of Incorrect URLs

In [None]:
# Print every track key whose lyrics entry is None, followed by how many
# such keys there are.
keys_without_lyrics = [song_id for song_id, found in lyrics.items() if found == None]
for song_id in keys_without_lyrics:
    print(song_id)
count = len(keys_without_lyrics)
print(count)

## Getting Sentiment Analysis

### Installing VADER Sentiment Analysis Package

In [None]:
!pip install vaderSentiment

### Running Sentiment Analysis on Lyrics

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_sentiment(lyric):
    """Return the VADER polarity scores for a lyrics string.

    Args:
        lyric: Text to score.

    Returns:
        Whatever `SentimentIntensityAnalyzer.polarity_scores` returns for the
        text; callers in this notebook read its "compound" entry.
    """
    # Build the analyzer once and reuse it: this function is called once per
    # track, and constructing a fresh analyzer each time is repeated setup
    # work that does not depend on the input.
    if not hasattr(get_sentiment, "_analyzer"):
        get_sentiment._analyzer = SentimentIntensityAnalyzer()
    return get_sentiment._analyzer.polarity_scores(lyric)
          
# Reuse results from an earlier run in this kernel when present; otherwise
# start from an empty dict so the cell also works on a fresh kernel.
try:
    sentiments
except NameError:
    sentiments = {}

for new_id, lyric in tqdm(lyrics.items()):
    if new_id in sentiments:
        continue
    # Accept plain lyrics text or an object exposing a `.lyrics` attribute;
    # skip tracks with no lyrics at all.
    lyric_text = lyric if isinstance(lyric, str) else getattr(lyric, "lyrics", None)
    if not lyric_text:
        continue
    cleaned_lyric = ""
    # Drop the first line, blank lines and lines starting with "[".
    for sentence in lyric_text.splitlines()[1:]:
        if sentence == "" or sentence[0] == "[":
            continue
        cleaned_lyric += sentence + "\n"
    print(new_id)
    # `sentiment_scores` was never defined; the helper is `get_sentiment`.
    sentiments[new_id] = get_sentiment(cleaned_lyric)

### Writing Sentiment

In [None]:
# Cache the sentiment scores (the dict is named `sentiments`; the previous
# `sentiment` was undefined and raised a NameError).
with open('pickles/sentiment.pickle', 'wb') as handle:
    pickle.dump(sentiments, handle, protocol=pickle.HIGHEST_PROTOCOL)

# CSV export of the compound score only; newline='' is required by the csv
# module for files it writes.
with open('data/genius_vader_sentiments.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["new_id", "sentiment"])
    for key, value in sentiments.items():
        writer.writerow([key, value["compound"]])