 # TDDE16 - Project

### Description
Retrieve song lyrics, detect whether they contain bad words, and if so mark the song as “Explicit”. The explicit flag that Spotify assigns to songs is used as the gold standard. Each explicit word is then replaced with a suitable non-explicit word.

Uses the Genius and Spotify APIs :)


## Data retrieval

### Spotify - Track Retriever

In [None]:
import requests, json, base64

class TrackRetriever():
    """Retrieve track metadata from Spotify playlists via the Spotify Web API.

    On construction an OAuth token is requested with the client-credentials
    flow and stored in ``self.token``; the other methods use that token.
    """

    # NOTE(security): these credentials are committed in source. They are kept
    # as defaults so existing ``TrackRetriever()`` calls keep working, but they
    # should be rotated and passed in via the constructor instead.
    client_id = '055a803e06f848a9a6b18ec76aafbe6e'
    client_secret = '756b412f52d54b49bd4e2fa0b2f0df15'

    def __init__(self, client_id=None, client_secret=None):
        """Store optional credential overrides and request an OAuth token.

        client_id / client_secret: Spotify application credentials. When
        omitted, the class-level defaults are used.
        """
        if client_id is not None:
            self.client_id = client_id
        if client_secret is not None:
            self.client_secret = client_secret
        self.request_token()

    # Requests an OAuth token required by the Spotify API
    def request_token(self):
        """Request an OAuth token and store it in ``self.token``.

        Raises RuntimeError if the response carries no access token.
        """
        # Encode client ID and secret in base64
        string = ':'.join([self.client_id, self.client_secret])
        encoded = base64.b64encode(string.encode())
        # Add encoding to header
        header = {"Authorization": "Basic " + encoded.decode('utf-8')}
        data = {"grant_type": "client_credentials"}
        # Url for requesting OAuth token for Spotify API
        url = "https://accounts.spotify.com/api/token"
        response = requests.post(url, data=data, headers=header)
        payload = response.json()
        # Use .get(): an error payload has no 'access_token' key, and indexing
        # it directly would raise KeyError before the failure could be reported.
        self.token = payload.get('access_token')
        if not self.token:
            raise RuntimeError(
                "Error receiving token! Response: {}".format(payload))

    @staticmethod
    def _playlist_tracks_url(spotify_uri):
        """Build the playlist-tracks endpoint URL for a Spotify playlist URI.

        Accepts the legacy form ``spotify:user:<user>:playlist:<id>`` and the
        short form ``spotify:playlist:<id>``.
        Raises ValueError for any other shape.
        """
        parts = spotify_uri.split(":")
        if len(parts) >= 5:
            return "https://api.spotify.com/v1/users/{}/playlists/{}/tracks".format(
                parts[2], parts[4])
        if len(parts) == 3 and parts[1] == "playlist":
            return "https://api.spotify.com/v1/playlists/{}/tracks".format(parts[2])
        raise ValueError("Not a Spotify playlist URI: {!r}".format(spotify_uri))

    # Gets the track objects of all songs in a playlist.
    # Input is an Spotify URI of a playlist
    def get_tracks_from_playlist(self, spotify_uri):
        """Return the list of track objects (dicts) contained in a playlist.

        Follows the ``next`` link of each response page so that playlists
        spanning several pages are returned completely.
        Raises ValueError for a malformed URI and requests.HTTPError when the
        API answers with an error status.
        """
        track_list = []
        header = {"Authorization": "Bearer " + self.token}
        url = self._playlist_tracks_url(spotify_uri)
        while url:
            response = requests.get(url, headers=header)
            response.raise_for_status()
            json_res = response.json()
            for item in json_res['items']:
                track = item.get('track')
                # Skip items without a track object; extract_info() cannot
                # process them.
                if track:
                    track_list.append(track)
            # The last page has no 'next' link, which ends the loop.
            url = json_res.get('next')
        return track_list

    # Extracts track title, artist name and explicit bool from a track json
    def extract_info(self, track):
        """Return a dict with keys 'id', 'artist', 'title' and 'explicit'.

        All artist names of the track are joined into one string separated
        by a single space.
        """
        artists = " ".join(str(art['name']) for art in track['artists'])
        return {
            'id': track['id'],
            'artist': artists,
            'title': track['name'],
            'explicit': track['explicit'],
        }

    # Starting method for retrieving songs from a playlist
    def get_track_info(self, uri):
        """Return the extracted info dict for every track of the playlist ``uri``."""
        return [self.extract_info(t) for t in self.get_tracks_from_playlist(uri)]
            
        
# spotify:user:spotify:playlist:37i9dQZF1E9RVkbMAXdy3v        

### Genius - Lyrics Catcher

In [None]:
import requests, json, re
from bs4 import BeautifulSoup

class LyricsCatcher():
    """Look up songs through the Genius API and scrape their lyrics pages."""

    def __init__(self, token):
        """Store the Genius OAuth ``token`` and prepare the request headers."""
        self.oauth_token = token
        # HTTPS so the bearer token is not sent in clear text.
        self.api_url = "https://api.genius.com"
        # Use the constructor argument, not a module-level global.
        self.api_headers = {'Authorization': 'Bearer ' + token}
        self.web_url = "https://genius.com"

    def search_song(self, song_title, artist_name):
        """Search Genius for a song and return the first hit by ``artist_name``.

        Returns the hit dict whose primary artist name equals ``artist_name``
        exactly, or None when no hit matches.
        """
        search_url = "{}/search".format(self.api_url)
        # Passing the query via ``params`` URL-encodes it; title and artist are
        # separated by a space so they do not merge into one word.
        query = {'q': "{} {}".format(song_title, artist_name)}
        response = requests.get(search_url, params=query, headers=self.api_headers)
        response.raise_for_status()
        result = response.json()
        for hit in result["response"]["hits"]:
            if hit["result"]["primary_artist"]["name"] == artist_name:
                return hit
        return None

    def get_url(self, song_id):
        """Return the genius.com path of the song with the given ID."""
        endpoint = "/songs/{}".format(song_id)
        # Get song info from API using the song ID
        response = requests.get(self.api_url + endpoint, headers=self.api_headers)
        response.raise_for_status()
        return response.json()["response"]["song"]["path"]

    @staticmethod
    def _clean_lyrics(text):
        """Strip bracketed annotations and collapse newlines into spaces."""
        # Non-greedy so that two tags on one line ("[a] text [b]") do not
        # swallow the text between them.
        text = re.sub(r'\[.+?\]', '', text)
        return re.sub(r'\n+', ' ', text)

    def get_lyrics(self, endpoint):
        """Fetch the lyrics page at ``endpoint`` and return the cleaned lyrics.

        Returns None when the page has no ``<div class="lyrics">`` element.
        """
        # Fetch page containing the lyrics
        page = requests.get(self.web_url + endpoint)
        # Extract HTML source code from page
        html = BeautifulSoup(page.text, "html.parser")
        # Remove script tags that they put in the middle of the lyrics
        for script in html('script'):
            script.extract()
        # Get the div-tag where the lyrics are and extract text inside
        container = html.find("div", class_="lyrics")
        if container is None:
            return None
        return self._clean_lyrics(container.get_text())

    def fetch_lyrics(self, title, artist):
        """Return the lyrics for ``title`` by ``artist``, or None if not found."""
        song = self.search_song(title, artist)
        if not song:
            return None
        song_id = str(song["result"]["id"])
        path = self.get_url(song_id)
        return self.get_lyrics(path)
        

## Data gathering
Use the Spotify API to get songs and their explicit status, then use the Genius API to search for the lyrics.

In [None]:
import time
import winsound
# Method for printing statistics of the tracks
def evaluate(tracks):
    """Print how many tracks are explicit and non-explicit.

    tracks: a sized collection of dicts, each with a truthy/falsy 'explicit'
    entry. An empty collection prints zero counts and 0.0% rather than
    dividing by zero.
    """
    total = len(tracks)
    explicit_counter = sum(1 for track in tracks if track['explicit'])
    non_explicit_counter = total - explicit_counter

    def percentage(count):
        # Guard against an empty track list (total == 0).
        return (count / total) * 100 if total else 0.0

    print("Total number of lyrics: " + str(total))
    print("Explicit tracks:\t {0}% ({1})".format(percentage(explicit_counter), explicit_counter))
    print("Non-Explicit tracks:\t {0}% ({1})".format(percentage(non_explicit_counter), non_explicit_counter))
    
def play_sound():
    """Emit ten short 440 Hz beeps, pausing half a second after each one."""
    tone_hz = 440   # pitch of each beep
    tone_ms = 100   # length of each beep in milliseconds
    remaining = 10
    while remaining > 0:
        winsound.Beep(tone_hz, tone_ms)
        time.sleep(0.5)
        remaining -= 1

In [None]:
# NOTE(security): the Genius token is committed in source; prefer loading it
# from an environment variable or an untracked config file.
oauth_token = "O8rraozwWRK1i66ofhUJSZ9EDCBLkxg-gfoDzBB9_XH3Vjsm0qYoewK8C7lMOWkh"
catcher = LyricsCatcher(oauth_token)
tr = TrackRetriever()
# Playlists to harvest; uncomment entries to include them.
playlists = [
    'spotify:user:heekzz:playlist:5IeIXRXPSkRpBX9bNo5onK', # Test
    #'spotify:user:heekzz:playlist:6nL0gtJLo2xqCWIirZ4inF', # Rockify
    #'spotify:user:spotify:playlist:37i9dQZF1DXcF6B6QPhFDv', # Rock Classics
    #'spotify:user:spotify:playlist:37i9dQZF1DX0XUsuxWHRQd', # Rapcaviar 
    #'spotify:user:spotify:playlist:37i9dQZF1DWVA1Gq4XHa6U', # Gold School (Hiphop)
]
tic = time.time()
tracks_with_lyrics = []
# IDs of tracks already processed, so a track that appears in several
# playlists is only looked up once.
track_id_set = set()
for pl in playlists:
    tracks = tr.get_track_info(pl)
    for t in tracks:
        if t['id'] in track_id_set:
            continue
        track_id_set.add(t['id'])
        lyrics = catcher.fetch_lyrics(t['title'], t['artist'])
        # Tracks without lyrics are dropped from the data set.
        if lyrics is None:
            continue
        data = {'id': t['id'], 'artist': t['artist'], 'title': t['title'], 'explicit': t['explicit'], 'lyrics': lyrics}
        tracks_with_lyrics.append(data)
elapsed = (time.time() - tic)
#print(tracks_with_lyrics)

In [None]:
# Print the explicit/non-explicit statistics and the time the gathering took,
# then beep to signal that the cell has finished.
evaluate(tracks_with_lyrics)
duration_text = str(elapsed)
print("Duration: " + duration_text)
play_sound()

### Pre-processing

In [None]:
import nltk
#nltk.download()
#tokens = nltk.word_tokenize(tracks_with_lyrics[0]['lyrics'])
# Pretty-print the gathered tracks as JSON, then beep when done.
pretty_tracks = json.dumps(tracks_with_lyrics, indent=2)
print(pretty_tracks)
play_sound()