## 0.0 Intro
This notebook scrapes song data from Genius, saves it under DATASET_PATH, and then removes rappers who have fewer than POPULARITY_OFFSET songs.

Global imports, setting variables

In [1]:
from pprint import pprint
import json
import re
import os
from tqdm import tqdm

In [2]:
# Base URL of the Genius website (HTML pages) and of the Genius REST API.
GENIUS_LINK = "http://genius.com"
API_LINK = "http://api.genius.com/"
# Bearer token for the Genius API. Set the GENIUS_ACCESS_TOKEN environment
# variable to supply your own; the literal is kept only as a backward-compatible
# fallback.
# NOTE(review): a token stored in a notebook should be treated as leaked -
# rotate it and remove the fallback.
client_access_token = (
    os.environ.get("GENIUS_ACCESS_TOKEN")
    or "vJuRYtZOmqJwepQq0mAH5fqaxO3Yw9hM0b9wzguKf40yLii05QLVjTwC4o50XH4G"
)
# Root directory of the dataset; override with POLISH_RAP_DATASET_PATH.
# The rest of the notebook builds paths by plain string concatenation, so the
# value must end with a path separator - os.path.join(..., "") guarantees that.
DATASET_PATH = os.path.join(
    os.environ.get("POLISH_RAP_DATASET_PATH") or "/home/jack/datasets/polish_rap/",
    "",
)
# Default threshold used by cut_out_popular(): rappers with fewer songs than
# this are removed.
POPULARITY_OFFSET = 5

## 0.1 Pre-crawling
Collect rapper names from the Hot16 Challenge list, then look up each rapper's Genius ID (needed for the Genius API), then collect the links to their songs.

In [3]:
from bs4 import BeautifulSoup as bs
import requests
import urllib.request

In [4]:
def get_songs(artist_id):
    """Return the Genius paths of every song the API lists for an artist.

    Requests ``artists/<artist_id>/songs`` page by page (50 entries per page)
    until a page comes back with no songs.

    Args:
        artist_id: Genius artist id; converted with ``str()`` before use.

    Returns:
        A list with the ``"path"`` value of every returned song, in the order
        the API returned them. An empty list is returned (and a message is
        printed) as soon as any page cannot be fetched or decoded.
    """
    artist_id = str(artist_id)
    paths = []
    page = 0
    while True:
        page += 1
        rq = urllib.request.Request(API_LINK + "artists/" + artist_id +
                                    "/songs?per_page=50&page=" + str(page))
        rq.add_header("Authorization", "Bearer " + client_access_token)
        rq.add_header("User-Agent",
                      "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")
        try:
            # The context manager closes the HTTP response even when reading
            # or decoding fails.
            with urllib.request.urlopen(rq) as resp:
                r = json.loads(resp.read().decode("utf-8"))
        except Exception:
            # `Exception` instead of a bare except: the scrape stays
            # best-effort, but Ctrl-C / SystemExit are no longer swallowed.
            print("Rapper is not in database: " + artist_id)
            return []
        songs = r["response"]["songs"]
        if not songs:
            # An empty page means there is nothing left to fetch.
            break
        paths.extend(song["path"] for song in songs)
    return paths

In [5]:
def get_song_text(song_path):
    """Download a Genius page and return the text of its lyrics element.

    Args:
        song_path: page path that is appended to GENIUS_LINK.

    Returns:
        The text of the first ``<lyrics class="lyrics">`` element found on the
        page, or None when the page contains no such element.
    """
    page_html = requests.get(GENIUS_LINK + song_path).text
    parsed = bs(page_html, "lxml")
    lyrics_node = parsed.find("lyrics", attrs={"class": "lyrics"})
    if lyrics_node is None:
        return None
    return lyrics_node.text

In [6]:
def get_artist_id(artist_name):
    """Look up an artist's id from their Genius artist page.

    Args:
        artist_name: name as it appears in the ``/artists/<name>`` page URL.

    Returns:
        The last "/"-separated segment of the ``content`` attribute of the
        first element whose ``name`` attribute is "newrelic-resource-path",
        or None when the page has no such element.
    """
    page_html = requests.get(GENIUS_LINK + "/artists/" + artist_name).text
    parsed = bs(page_html, "lxml")
    meta_node = parsed.find(attrs={"name": "newrelic-resource-path"})
    if meta_node is None:
        return None
    resource_path = meta_node.attrs["content"]
    return resource_path.split("/")[-1]

In [7]:
def get_all_songs(names, force=False):
    """Collect artist ids and song paths for the given names, with a JSON cache.

    The results are stored in ``songs_by_artist.json`` and ``rappers_ids.json``
    under DATASET_PATH. When both files exist and ``force`` is false they are
    loaded instead of querying Genius again.

    Args:
        names: iterable of artist names as used in Genius artist URLs.
        force: when true, ignore any cached files and scrape again.

    Returns:
        A tuple ``(all_songs, rappers_ids)``: ``all_songs`` maps each name to
        the list of song paths returned by get_songs(), ``rappers_ids`` maps
        each name to the id returned by get_artist_id() (None when no id was
        found).
    """
    all_songs_path = DATASET_PATH + "songs_by_artist.json"
    rappers_ids_path = DATASET_PATH + "rappers_ids.json"
    # Both files are read below, so both must exist for the cache to be usable.
    cache_ready = os.path.exists(all_songs_path) and os.path.exists(rappers_ids_path)
    if cache_ready and not force:
        with open(all_songs_path, "r", encoding="utf-8") as f:
            all_songs = json.load(f)
        with open(rappers_ids_path, "r", encoding="utf-8") as f:
            rappers_ids = json.load(f)
        return all_songs, rappers_ids

    all_songs = {}
    rappers_ids = {}
    for rapper_name in tqdm(names):
        rapper_id = get_artist_id(rapper_name)
        rappers_ids[rapper_name] = rapper_id
        # Without an id there is nothing to ask the API for; store an empty
        # list (never None) so later stages can always iterate over the value.
        all_songs[rapper_name] = get_songs(rapper_id) if rapper_id is not None else []

    os.makedirs(DATASET_PATH, exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
    # encoding instead of relying on the platform default.
    with open(all_songs_path, "w", encoding="utf-8") as f:
        json.dump(all_songs, f, ensure_ascii=False)
    with open(rappers_ids_path, "w", encoding="utf-8") as f:
        json.dump(rappers_ids, f, ensure_ascii=False)

    return all_songs, rappers_ids

In [8]:
def get_polish_rappers_names():
    """Scrape the Hot16 Challenge list page and return candidate rapper names.

    The page text is split into lines; only lines with 4 to 14 characters are
    kept, and spaces in them are replaced with hyphens.

    Returns:
        A list of hyphenated name strings.
    """
    list_page = "/Rap-genius-polska-lista-raperow-ktorzy-ukonczyli-hot-16-challenge-lyrics"
    lines = get_song_text(list_page).split("\n")
    return [line.replace(" ", "-") for line in lines if 3 < len(line) < 15]

In [9]:
# Scrape the candidate names and drop every "." from them.
polish_list = [name.replace(".", "") for name in get_polish_rappers_names()]

In [None]:
def clean_text(s):
    """Strip bracketed tags, blank lines and the leading header from lyrics.

    Args:
        s: raw lyrics text.

    Returns:
        The text with every ``[...]`` tag removed, runs of newlines collapsed
        to a single newline, and everything up to and including the first
        "Lyrics" marker (plus the one character after it) cut off. When the
        marker is absent the text is returned without cutting anything.
    """
    # Non-greedy: each tag is removed on its own, so text sitting between two
    # tags on the same line survives.
    s = re.sub(r'\[.*?\]', '', s)
    s = re.sub(r'[\n]+', '\n', s)
    marker_at = s.find('Lyrics')
    if marker_at == -1:
        # find() returned -1; slicing from -1 + len("Lyrics ") would silently
        # drop the first six characters of the lyrics.
        return s
    return s[marker_at + len("Lyrics "):]

In [None]:
# Scrape ids and song lists for every candidate name. force=True skips the
# cached JSON files and rewrites them.
all_songs, rappers_ids = get_all_songs(polish_list, force=True)

  7%|▋         | 27/374 [00:24<05:05,  1.14it/s]

In [None]:
from collections import defaultdict

def clean_songs(songs_dict):
    """Keep, for every rapper, only the song paths that mention the rapper.

    A song is kept when its path contains ``"/" + rapper`` (case-sensitive
    substring match).

    Args:
        songs_dict: mapping of rapper name to a list of song paths; a value of
            None is treated as an empty list.

    Returns:
        A plain dict mapping rapper name to the kept paths, in their original
        order. Rappers left with no songs are omitted.
    """
    res = {}
    for rapper, songs in songs_dict.items():
        marker = "/" + rapper
        # `songs or ()` tolerates None entries instead of raising TypeError.
        own_songs = [song for song in (songs or ()) if marker in song]
        if own_songs:
            res[rapper] = own_songs
    return res

In [None]:
# Keep only songs whose path contains the rapper's own name; rappers left
# without songs disappear from the mapping.
all_songs = clean_songs(all_songs)

## 0.2 Cleaning list of rappers
Removing non-Polish rappers, non-rappers (e.g. DJs), rappers with an empty song list, etc.

In [None]:
# Names that ended up with no songs (missing from all_songs or an empty list).
empty_rappers = [name for name in polish_list if not all_songs.get(name)]
# Hand-maintained list of names to exclude as non-Polish.
non_polish_rappers = [
    "Eminem", "Bonez", "Derk", "Kidman", "Kord", "Kordas", "Made", "Maestro",
    "Mikey-Kim", "Mona", "Oldas", "Perry", "Peti", "Shin", "Tazz", "Mikey",
]
# Every scraped name containing "DJ", plus two hand-picked entries.
non_rappers = [name for name in polish_list if "DJ" in name] + ["Dj", "Antologia"]

In [None]:
# Everything to exclude: no songs, non-Polish, or not a rapper.
not_good_rappers = set(empty_rappers) | set(non_polish_rappers) | set(non_rappers)
# Remaining candidates; "Mickiewicz" is always included, regardless of the
# exclusions above.
good_polish_rappers = (set(polish_list) - not_good_rappers) | {"Mickiewicz"}

In [None]:
# Number of rappers left after the exclusions.
len(good_polish_rappers)

## 0.3 Crawling songs
Web scraping from genius

In [None]:
def dump_songs(rapper_name, force=False):
    """Scrape and store the lyrics of every known song of one rapper.

    Each song is written as a JSON file ``{"lyrics": <cleaned text>}`` inside
    ``DATASET_PATH/all_lyrics/<rapper_name>/``, named after the first segment
    of the song path. Nothing is done when that directory already exists,
    unless ``force`` is true.

    Args:
        rapper_name: key into the global ``all_songs`` mapping.
        force: scrape again even if the rapper's directory already exists.
    """
    rapper_path = DATASET_PATH + "all_lyrics/" + rapper_name
    if os.path.exists(rapper_path) and not force:
        return
    # exist_ok: with force=True the directory may already be there, and plain
    # makedirs would raise FileExistsError.
    os.makedirs(rapper_path, exist_ok=True)

    songs = all_songs.get(rapper_name)
    if songs is None:
        # Rapper without an entry in all_songs: report it, as before.
        print("There was a problem with rapper:" + rapper_name)
        return

    for song_path in songs:
        # Failures are handled per song, so one bad page no longer discards
        # all remaining songs of the rapper. `Exception` (not a bare except)
        # lets Ctrl-C / SystemExit stop the scrape.
        try:
            lyrics = get_song_text(song_path)
            file_name = song_path.split("/")[1]
            lyrics_plus_meta = {"lyrics": clean_text(lyrics)}
            with open(rapper_path + "/" + file_name, "w") as f:
                json.dump(lyrics_plus_meta, f, ensure_ascii=False)
        except Exception:
            print("There was a problem with rapper:" + rapper_name + ", song: " + str(song_path))

In [None]:
def dump_all_lyrics(rappers_names):
    """Run dump_songs() for each given rapper name, showing a progress bar.

    Args:
        rappers_names: iterable of rapper names.
    """
    progress = tqdm(rappers_names)
    for name in progress:
        dump_songs(name)

In [None]:
# Scrape lyrics for every rapper that survived the filtering (network-heavy).
dump_all_lyrics(good_polish_rappers)

## 0.4 Cleaning data
Detecting the language of each song and moving Polish songs into their main artist's directory.

In [None]:
from langdetect import detect
import shutil

In [None]:
def optionally_create(artist_name):
    """Create ``DATASET_PATH/ok_lyrics/<artist_name>`` unless it already exists.

    Args:
        artist_name: directory name below ``ok_lyrics/``; an empty string
            targets the ``ok_lyrics/`` directory itself.
    """
    target_dir = DATASET_PATH + "ok_lyrics/" + artist_name
    if os.path.exists(target_dir):
        return
    os.mkdir(target_dir)

In [None]:
def clean_data():
    """Move songs detected as Polish from ``all_lyrics/`` into ``ok_lyrics/``.

    For every rapper directory under ``all_lyrics/`` each song file is loaded
    and the language of its lyrics is detected. Polish songs are moved to
    ``ok_lyrics/<main_artist>/``, where ``main_artist`` is the part of the
    file name before the first "-". Other songs are left where they are.

    Rappers that already had a directory in ``ok_lyrics/`` before this call
    started are skipped entirely.
    """
    all_root = DATASET_PATH + "all_lyrics/"
    ok_root = DATASET_PATH + "ok_lyrics/"
    optionally_create("")  # make sure ok_lyrics/ itself exists

    # Snapshot taken before anything is moved. Checking the directory live
    # would also skip a rapper whose folder was only just created while
    # processing another rapper (via the first "-" token of a song name),
    # making the result depend on the listing order.
    done_before = set(os.listdir(ok_root))

    for rapper in tqdm(os.listdir(all_root)):
        if rapper in done_before:  # hack for mickiewicz, comment it
            continue
        for song in os.listdir(all_root + rapper):
            song_path = all_root + rapper + "/" + song
            with open(song_path, "r") as f:
                song_text = json.load(f)["lyrics"]
            try:
                lang = detect(song_text)
            except Exception:
                # Detection failed; treat the song as not Polish.
                # `Exception` (not a bare except) keeps Ctrl-C working.
                lang = "xx"
            if lang != "pl":
                # Non-Polish songs are left in all_lyrics/ (not deleted).
                continue
            # File the song under the first "-" token of its file name,
            # creating that directory on demand.
            main_artist = song.split("-")[0]
            optionally_create(main_artist)
            os.rename(song_path, ok_root + main_artist + "/" + song)

In [None]:
# Move the Polish songs from all_lyrics/ into ok_lyrics/ (moves files on disk).
clean_data()

## 0.5 Selecting popular rappers 

In [None]:
def cut_out_popular(offset=POPULARITY_OFFSET):
    """Prune ``ok_lyrics/`` down to popular rappers and count their songs.

    For every directory in ``ok_lyrics/``:
      * if a name in ``good_polish_rappers`` starts with ``<dir>-``, the
        directory is renamed to that full name;
      * the directory is deleted when it holds fewer than ``offset`` files or
        its (possibly renamed) name is in ``not_good_rappers``. "Mickiewicz"
        is never deleted.

    Args:
        offset: minimum number of song files a rapper needs to be kept.

    Returns:
        A dict mapping each kept rapper name to its number of song files.
    """
    ok_root = DATASET_PATH + "ok_lyrics/"
    popularity = {}
    for rapper in tqdm(os.listdir(ok_root)):
        files = os.listdir(ok_root + rapper)
        # good_polish_rappers is a set, so its iteration order is arbitrary.
        # Sorting makes the choice reproducible when several full names share
        # the same first token.
        full_names = sorted(r for r in good_polish_rappers if r.startswith(rapper + "-"))
        if full_names:
            os.rename(ok_root + rapper, ok_root + full_names[0])
            rapper = full_names[0]
        rejected = len(files) < offset or rapper in not_good_rappers
        if rejected and rapper != "Mickiewicz":
            shutil.rmtree(ok_root + rapper)
        else:
            popularity[rapper] = len(files)
    return popularity

In [None]:
# Sanity check: the manually added entry must be in the set.
"Mickiewicz" in good_polish_rappers

In [None]:
# Prune ok_lyrics/ on disk and collect the song count of each kept rapper.
popularity = cut_out_popular()

## 0.6 Exploring popular rappers

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Number of rappers shown in the popularity bar chart.
TOP_N = 20

In [None]:
# One row per rapper, a single "popularity" column, highest count first.
popularity_df = pd.DataFrame.from_dict(popularity, orient="index")
popularity_df.columns = ["popularity"]
popularity_df = popularity_df.sort_values(by="popularity", ascending=False)
# Horizontal bar chart of the TOP_N rows.
popularity_df.iloc[:TOP_N].plot(kind="barh", figsize=(15, 15))

In [None]:
# Report how many rappers passed the popularity cut.
print("There were {} popular rappers".format(len(popularity)))