# 0. Introduction

## Data Source

- [Pitchfork album reviews](https://pitchfork.com/reviews/albums/?page=20) — sample: reviews published from August 8th to October 21st  

## What we would like to get 
- artist name
- album name
- genre
- link to review
- review score
- link to cover picture
- cover picture
- dominant color of the picture, HSL format

## Steps

**1. Fetching the data**  
1.1. get the html page  
1.2. use beautiful soup to extract data from Pitchfork  
1.3. preparing the data into a dataframe     
1.4. download the album covers with wget, using the picture links extracted with beautiful soup  

**2. Extracting the dominant color from album cover**  
2.1. extract album covers dominant color (`k-means`)  
2.2. format it in HSL into the current dataframe 

## Modules

In [None]:
# modules for part 1
from bs4 import BeautifulSoup
import pandas as pd
import subprocess, sys
import requests

# modules for part 2
from collections import namedtuple
import colorsys
import json
from math import sqrt
from os import listdir
from os.path import isfile, join
import random
from PIL import Image
from IPython.core.display import display, HTML
from natsort import natsorted

# 1. Fetching the data

## 1.1 Get the HTML page

In [None]:
# Path to the locally saved Pitchfork listing page.
# The html file for the scope August 8th > October 21st was manually retrieved;
# this notebook only reads it from disk.
file_html = "../../data/pitchfork_covers/soup.html"

In [None]:
# Parse the saved listing page into a BeautifulSoup tree.
# The context manager closes the file even if reading or parsing raises.
with open(file_html) as html_file:
    soup = BeautifulSoup(html_file.read(), "html.parser")

## 1.2 Use beautiful soup to extract the data listed above

In [None]:
# full review
# Per-review fields scraped from the listing page. Each kept review adds
# exactly ONE entry to every list, so the five lists always have the same
# length and can be combined column-wise into a dataframe.
p_review_link = []
p_pic_link = []
p_artist = []
p_album = []
p_genre = []


def _first_li_text(tag):
    """Return the string of the first <li> inside *tag*, or None if missing."""
    item = tag.find("li") if tag is not None else None
    return item.string if item is not None else None


for review in soup.find_all("div", class_="review"):
    artist_list = review.find("ul", class_="review__title-artist")
    if artist_list is None:
        # no artist markup at all: nothing to record for this review
        continue

    # only first artist
    artist = _first_li_text(artist_list)

    # removing reviews with Various Artists
    if artist == "Various Artists":
        continue

    link = review.find("a", class_="review__link")
    cover = review.find("img")
    if link is None or cover is None:
        # the review link and the cover picture are needed by the later
        # steps (score lookup and download), so incomplete reviews are skipped
        continue

    album = review.find("h2", class_="review__title-album")

    p_review_link.append(link["href"])
    # only first picture
    p_pic_link.append(cover["src"])
    p_artist.append(artist)
    p_album.append(album.string if album is not None else None)
    # only first genre; None when the review has no genre list
    p_genre.append(_first_li_text(review.find("ul", class_="review__genre-list")))

In [None]:
# page by page review score
# for double albums, taking only the first album rating
# Exactly one entry is appended per link (None when the page has no score)
# so that review_score stays aligned with p_review_link.

review_score = []

for link in p_review_link:
    url = "https://pitchfork.com" + link
    # a timeout keeps one stalled request from hanging the whole loop
    req = requests.get(url, timeout=30)
    soup_ = BeautifulSoup(req.text, "html.parser")
    # find() returns the first matching span only, i.e. the first rating
    score_tag = soup_.find("span", class_="score")
    review_score.append(score_tag.string if score_tag is not None else None)

## 1.3 Preparing the data into a dataframe

In [None]:
# One row per review. Naming the index before resetting it exposes the row
# number as the leading `review_id` column.
df = pd.DataFrame(
    {
        "link_review": p_review_link,
        "artist": p_artist,
        "album": p_album,
        "genre": p_genre,
        "link_pic": p_pic_link,
        "review_score": review_score,
    }
)
df = df.rename_axis("review_id").reset_index()

## 1.4 Download the album covers with wget (picture links come from beautiful soup)

In [None]:
# Folder that receives the downloaded cover images. The trailing slash
# matters: the download command builds each file path by formatting this
# string directly in front of the file name.
pic_folder = "../../data/pitchfork_covers/pics_0808_1021/"

In [None]:
# quick warning on wget: the links are pasted into a shell command line, so
# characters the shell would interpret have to be quoted first:
# - a single quote ' is wrapped in double quotes: "'"
# - &, ( and ) are wrapped in single quotes: '&', '(', ')'
# The single-quote replacement must run first, otherwise it would re-quote
# the single quotes introduced by the other three replacements.
# regex=False treats every pattern as a literal string on all pandas
# versions ("(" and ")" alone are not valid regular expressions).
for char, quoted in (("'", '"\'"'), ("&", "'&'"), ("(", "'('"), (")", "')'")):
    df.link_pic = df.link_pic.str.replace(char, quoted, regex=False)

In [None]:
# Download every cover as <row position>.jpg inside pic_folder.
# subprocess.run waits for each wget to finish and drains its stderr, so all
# files are complete when this cell returns and a chatty wget can never block
# on a full, unread pipe.
# NOTE(review): the links come from scraped HTML and are interpolated into a
# shell command; shell=True is kept because df.link_pic holds shell-quoted
# text. Passing an argument list with the raw URL would be safer.
for idx, link in enumerate(df.link_pic):
    command = "wget {0} -O {1}{2}.jpg".format(link, pic_folder, idx)
    p = subprocess.run(command, shell=True, stderr=subprocess.PIPE)
    if p.returncode != 0:
        # report the failure instead of silently leaving a broken file behind
        print(
            "wget failed for {0}: {1}".format(
                link, p.stderr.decode(errors="replace")
            ),
            file=sys.stderr,
        )

# 2. Extracting main color from album cover

## 2.1. Extract album covers main color

Here, the algorithm used is k-means clustering with a single cluster, so the result is one dominant color per cover.  
A really nice article helped me through the process, credit to [Charles Leifer](http://charlesleifer.com/blog/using-python-and-k-means-to-find-the-dominant-colors-in-images/) for his help on this!  


In [None]:
# Point: coordinates ('coords'), their dimension ('n') and a count ('ct').
# Cluster: member 'points', a 'center' and the dimension 'n'.
Point = namedtuple('Point', 'coords n ct')
Cluster = namedtuple('Cluster', 'points center n')

In [None]:
# two functions to do math computations used in later functions
def calculate_center(points, n):
    """Return the count-weighted mean of *points* as a Point.

    Each point contributes its coordinates multiplied by its ``ct``; the
    sums are divided by the total ``ct``. The result has dimension *n* and
    a count of 1.
    """
    totals = [0.0] * n
    weight = 0
    for point in points:
        weight += point.ct
        for axis in range(n):
            totals[axis] += point.coords[axis] * point.ct
    center = [total / weight for total in totals]
    return Point(center, n, 1)

def euclidean(p1, p2):
    """Return the Euclidean distance between *p1* and *p2*.

    Only the first ``p1.n`` coordinates of each point are compared.
    """
    squared = sum(
        (p1.coords[axis] - p2.coords[axis]) ** 2 for axis in range(p1.n)
    )
    return sqrt(squared)

In [None]:
# retrieve all points of the picture as a rgb array
def get_points(img):
    """Return one Point per distinct colour of *img*.

    Args:
        img: a PIL image.

    Returns:
        A list of ``Point(color, 3, count)`` where ``color`` is an
        ``(r, g, b)`` tuple and ``count`` is the number of pixels that have
        that colour.
    """
    # Greyscale, palette, CMYK or RGBA images make getcolors() return ints
    # or tuples of the wrong length; normalise so every colour is (r, g, b).
    if img.mode != "RGB":
        img = img.convert("RGB")
    w, h = img.size
    # maxcolors = w * h: an image cannot hold more distinct colours than
    # pixels, so getcolors() does not hit its limit and return None.
    return [Point(color, 3, count) for count, color in img.getcolors(w * h)]

In [None]:
def kmeans(points, k, min_diff):
    """Group *points* into *k* clusters with the k-means algorithm.

    Args:
        points: list of Point objects to cluster.
        k: number of clusters. The initial centers are k points drawn at
            random from *points*.
        min_diff: convergence threshold. Iteration stops once the largest
            distance moved by any center in one pass is below this value.

    Returns:
        A list of k Cluster objects.

    Raises:
        ValueError: if k is larger than ``len(points)`` (raised by
            ``random.sample``).
    """
    clusters = [Cluster([p], p, p.n) for p in random.sample(points, k)]

    while True:
        plists = [[] for _ in range(k)]

        # assign every point to its nearest center (first one wins on ties)
        for p in points:
            nearest = min(
                range(k), key=lambda i: euclidean(p, clusters[i].center)
            )
            plists[nearest].append(p)

        diff = 0
        for i in range(k):
            old = clusters[i]
            if plists[i]:
                center = calculate_center(plists[i], old.n)
            else:
                # no point is closest to this center: keep it where it is
                # instead of dividing by a zero total count
                center = old.center
            clusters[i] = Cluster(plists[i], center, old.n)
            diff = max(diff, euclidean(old.center, center))

        if diff < min_diff:
            break

    return clusters

In [None]:
def rtoh(rgb):
    """Format an iterable of integer channel values as a '#'-prefixed hex string."""
    channels = ['%02x' % p for p in rgb]
    return '#%s' % ''.join(channels)

In [None]:
def colorz(filename, n=1):
    """Return the *n* dominant colours of an image file as hex strings.

    The image is shrunk in place to fit within 200x200 pixels, its distinct
    colours are clustered with k-means (convergence threshold 1), and each
    cluster center is truncated to integers and formatted with ``rtoh``.

    Args:
        filename: path of the image to analyse.
        n: number of clusters, i.e. number of colours returned.

    Returns:
        A list of *n* hex colour strings such as ``'#a1b2c3'``.
    """
    # the context manager releases the file handle once the pixels are read
    with Image.open(filename) as img:
        img.thumbnail((200, 200))
        # calling the get_points function
        points = get_points(img)
    # calling the k_means function
    clusters = kmeans(points, n, 1)

    return [
        rtoh(int(channel) for channel in cluster.center.coords)
        for cluster in clusters
    ]

In [None]:
# Cover files to analyse, sorted in natural order (0.jpg, 1.jpg, ..., 10.jpg)
# so that position i in the list matches the cover downloaded for row i.
# Only regular .jpg files are kept: anything else sitting in the folder
# (hidden files, leftovers) would fail to open as an image and shift the
# row alignment.
files = natsorted(
    [
        f
        for f in listdir(pic_folder)
        if f.lower().endswith(".jpg") and isfile(join(pic_folder, f))
    ]
)

In [None]:
def recursive(path):
    """Compute the dominant colour(s) of every file named in ``files``.

    Despite its name the function does not recurse: it walks the
    module-level ``files`` list once.

    Args:
        path: folder that contains the files named in ``files``.

    Returns:
        A list with one entry per file, in the order of ``files``; each
        entry is the list of hex strings produced by ``colorz`` for it.
    """
    # os.path.join works whether or not *path* ends with a separator
    return [list(colorz(join(path, name))) for name in files]

In [None]:
# arr is a list with one entry per cover file; each entry is itself a list
# of hex codes. colorz is called with its default n=1, so every entry holds
# a single hex code: the dominant color.
arr = recursive(pic_folder)

In [None]:
# from all the previous steps, list of pictures has been kept in order
# we can safely add the column to the dataframe

# adding hex code to dataframe
# Each entry of arr is a one-element list; taking the hex string itself
# gives a plain 1-D column and needs no numpy (which is not imported here).
df["hex"] = [colors[0] for colors in arr]

## 2.2. Format array values in HSL into the current dataframe

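The data visualization will rank the album covers by hue. To rank them this way, the hex values are first converted to RGB and then to a hue-based color model, which places every cover on the hue color wheel. Note that the code below uses `colorsys.rgb_to_hsv`, so the values stored in the `hsl`, `saturation` and `light` columns are HSV components; the hue is the same in HSV and HSL.  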

In [None]:
# converting hex to rgb
# the single hex string of each entry is replaced, in place, by a tuple of
# three ints parsed from its three pairs of hex digits
for colors in arr:
    digits = colors[0].lstrip('#')
    colors[0] = tuple(int(digits[start:start + 2], 16) for start in (0, 2, 4))

In [None]:
# converting rgb to hsb/hsv
# channels are scaled to [0, 1] before the conversion; the resulting tuple
# is stored as a list so it can be modified in place afterwards
for colors in arr:
    rgb = colors[0]
    hsv = colorsys.rgb_to_hsv(rgb[0] / 255., rgb[1] / 255., rgb[2] / 255.)
    colors[0] = list(hsv)

In [None]:
# converting to the proper number format: [0,360] - [0,100] - [0,100]
# each component is multiplied by its scale and truncated to an int, in place
for colors in arr:
    components = colors[0]
    for channel, scale in enumerate((360, 100, 100)):
        components[channel] = int(components[channel] * scale)

In [None]:
# adding hsl code to dataframe
# Each entry of arr is a one-element list wrapping the three components;
# unwrap it so every cell holds the [hue, saturation, light] list itself.
df["hsl"] = [colors[0] for colors in arr]

df["hue"] = df["hsl"].apply(lambda x: x[0])
df["saturation"] = df["hsl"].apply(lambda x: x[1])
df["light"] = df["hsl"].apply(lambda x: x[2])

# ranks
# hue_rank is zero-based; ties are broken by order of appearance
df["hue_rank"] = df["hue"].rank(method="first").astype(int) - 1
# review_score holds the scraped text, so it is converted to numbers before
# ranking: as text, "10" would sort before "9.5". Scores that cannot be
# parsed become NaN and get no rank.
df["score_rank"] = pd.to_numeric(df["review_score"], errors="coerce").rank(
    method="first"
)

In [None]:
# Save the final table. to_csv is called with its defaults, so the dataframe
# index is written as a leading unnamed column next to review_id.
df.to_csv("../../data/pitchfork_covers/pics_0808_1021.csv")