# Download artist images

We want images of faces for rap, rock, and country artists.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import json
import time
import re
from collections import Counter
import pickle
import cv2
import wikipedia
from google_images_download import google_images_download

In [2]:
# Change the Matplotlib defaults
colors = np.array([(182,109,255),(218,109,0),(0,146,146)])/255
plt.rcParams.update({'font.size': 16, 'figure.figsize': (12.0, 6.0)})
plt.style.use('seaborn')
%matplotlib inline

# Collect lists of artist names from various sources

## Get artist names from Wikipedia

In [3]:
import requests
from bs4 import BeautifulSoup

In [6]:
# Wikipedia category pages to scrape, listed in the order they are visited
categories = ["Category:" + slug for slug in ("American_female_country_singers",
                                              "American_male_rappers",
                                              "American_female_rappers")]

In [7]:
# Scrape artist names from every Wikipedia category in `categories`,
# following the "next page" link until a category has no further pages.
# Results are stored in `artist_names` under the lower-cased category name.
root = "https://en.wikipedia.org"
artist_names = {}
total = 0
for category in categories:
    print("\n" + category)
    url = f"{root}/wiki/{category}"
    key = category.split(":")[-1].lower()

    # Start searching through pages
    reached_last_page = False
    while not reached_last_page:
        page = requests.get(url)
        # Fail loudly on an error response instead of parsing it as an empty last page
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Find the names on the page
        # NOTE(review): the first "mw-category-group" is skipped, presumably because it
        # lists subcategories rather than artists — confirm against the page layout.
        letter_groups = soup.find_all(class_="mw-category-group")[1:]
        # Defined up front so the progress print below works even when a page has no groups
        names_in_group = []
        for group in letter_groups:
            names_in_group = list(map(lambda x: x.text, group.find_all("li")))
            # Entries starting with "►" are dropped
            names_in_group = [name for name in names_in_group if not name.startswith("►")]
            total += len(names_in_group)
            artist_names.setdefault(key, []).extend(names_in_group)
        # Progress indicator: the last few names of the final group on this page
        print(names_in_group[-4:])

        # Find the link to the next page
        prev_next_links = soup.find_all(title=category.replace("_", " "))
        if prev_next_links and "next" in prev_next_links[-1].text:
            prev_next_links = prev_next_links[-1]
            url = root + prev_next_links.get("href") # URL for the next page
        else:
            print("Reached the last page.")
            reached_last_page = True
        time.sleep(0.5) # Pause between page requests
print(f"Found {total} total names.")

Category:American_female_country_singers
['Kye Fleming', 'Rosie Flores', 'Mary Ford', 'Connie Francis']
['Moonshine Kate', 'Abra Moore', 'Allison Moorer', 'Heather Morgan (songwriter)']
['Dar Williams', 'Holly Williams', 'Joy Williams (singer)', 'Leona Williams']
['Pia Zadora', 'Andrea Zonn']
Reached the last page.
Category:American_male_rappers
['Meechy Darko', 'Datin (rapper)', 'Deacon the Villain', 'Menace Demarco']
['Rob Sonic', 'Souleye (hip hop artist)', 'Bubba Sparxxx', 'Speak!']
['Jonny Z', 'VZilla', 'Zombie Juice']
Reached the last page.
Category:American_female_rappers
['Shawnna', 'Magnolia Shorty', 'Shunda K', 'Natalie Sims']
['Yo-Yo (rapper)', 'Yoon Mi-rae', 'Young M.A', 'Maimouna Youssef']
Reached the last page.
Found 1231 total names.


## Female rock singers

In [9]:
# Scrape the names on Wikipedia's list of female rock singers
category = "List_of_female_rock_singers"
root = "https://en.wikipedia.org"
url = f"{root}/wiki/{category}"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ul = soup.find_all("ul")
names = []
# NOTE(review): only <ul> elements 1 through 25 are read — presumably the lists that
# hold singer names; confirm the slice still matches the page layout.
for section in ul[1:26]:
    names += [entry.text for entry in section.find_all("li")]
print(f"Found {len(names)} total names.")
artist_names.setdefault("female_rock_singers", []).extend(names)

Found 610 total names.


## Male rock singers

In [10]:
# Scrape the vocalist list from digitaldreamdoor.com.
# Each table column holds one entry per line; a leading "<rank>." is removed.
url = "https://digitaldreamdoor.com/pages/best_vocalists.html"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all("table", class_="t7")[0]
columns = table.find_all("td", class_="td16a")
names = []
for column in columns:
    for line in column.text.strip().split("\n"):
        # Strip only a leading rank number: splitting on every "." would truncate
        # names that contain periods (initials, "Jr.") or leave them empty.
        name = re.sub(r"^\s*\d+\s*\.", "", line).strip()
        if name: # Skip blank lines
            names.append(name)
print(f"Found {len(names)} total names.")
artist_names.setdefault("male_rock_singers", []).extend(names)

Found 200 total names.


## Male country singers

In [11]:
# Scrape the male artist directory from countrystartpage.com
url = "https://www.countrystartpage.com/music-directory/male/"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
div = soup.find(class_="uk-margin-remove-top uk-grid-margin uk-margin-remove-top")
# Keep only the text before the first "/" of every list item
names = list(map(lambda entry: entry.text.split("/")[0].strip(), div.find_all("li")))
print(f"Found {len(names)} total names.")
artist_names.setdefault("male_country_singers", []).extend(names)

Found 904 total names.


# Combine the different name sources

In [16]:
# Re-key the scraped lists by GENRE_GENDER label.
# The values are the same list objects stored in `artist_names`, not copies.
source_keys = {
    "RAP_FEMALE": "american_female_rappers",
    "RAP_MALE": "american_male_rappers",
    "ROCK_FEMALE": "female_rock_singers",
    "ROCK_MALE": "male_rock_singers",
    "COUNTRY_FEMALE": "american_female_country_singers",
    "COUNTRY_MALE": "male_country_singers",
}
all_artist_names = {label: artist_names[source] for label, source in source_keys.items()}
total_names = sum(len(name_list) for name_list in all_artist_names.values())
print(f"Collected a total of {total_names} artist names.")

Collected a total of 3555 artist names.


# Download images while checking for faces

## google_images_download

https://google-images-download.readthedocs.io/en/latest/

In [50]:
from skimage import io
from facemorpher import locator

In [19]:
# Create the google_images_download client; the download loop calls its download() method
response = google_images_download.googleimagesdownload()

In [57]:
# Display the GENRE_GENDER labels that the download loop iterates over
all_artist_names.keys()

dict_keys(['RAP_FEMALE', 'RAP_MALE', 'ROCK_FEMALE', 'ROCK_MALE', 'COUNTRY_FEMALE', 'COUNTRY_MALE'])

In [76]:
# For every artist: search Google Images for "<name> <genre>", then save the first
# result in which a face is detected as <output_dir><genre>_<gender>_<name>.<ext>.
# Uses `all_artist_names`, `response`, `io`, `locator` and `cv2` from earlier cells.
t0 = time.time()
output_dir = "./face_image_downloads/"
# The loop variable is `names` so the `artist_names` dict from earlier cells is not overwritten
for genre_gender, names in all_artist_names.items():
    print(genre_gender)
    genre, gender = genre_gender.split("_")
    for n, name in enumerate(names):
        try:
            query = name + " " + genre
            if n % 50 == 0:
                print(f"({n+1}/{len(names)}) || Time elapsed: {(time.time() - t0) / 60:.2f} minutes.")

            # Google Image Search: only collect URLs ("no_download"), so no filename
            # prefix is passed; files are written further down with cv2.
            args = {"keywords": query, "limit": 3, "silent_mode": True,
                    "output_directory": output_dir, "no_directory": True, "delay": 0.1,
                    "save_source": "face_image_urls", "print_urls": False,
                    "no_download": True, "size": "medium"}
            image_path = response.download(args)

            # Check the image URLs for a face
            for url in image_path[0][query]:
                try:
                    # Read the image URL and reverse the last axis (RGB -> BGR for OpenCV)
                    image = io.imread(url)[...,::-1]

                    # Look for a face
                    face_array = locator.face_points(image)
                    if len(face_array): # Save the image if a face was found
                        # Format the filename
                        ext = url.rsplit(".")[-1]
                        fp = output_dir + genre + "_" + gender + "_" + name + "." + ext

                        # Save the image; stop only if it was actually written,
                        # otherwise fall through to the next candidate URL
                        if cv2.imwrite(fp, image):
                            break
                except Exception:
                    # Best effort: an unreadable or unwritable image is skipped.
                    # KeyboardInterrupt is not caught, so the loop can still be interrupted.
                    continue
            time.sleep(3) # Wait three seconds between artists
        except Exception as e:
            print(f'big wait: {e}')
            time.sleep(10) # Back off before moving on to the next artist
print(f"Total time elapsed: {(time.time() - t0) / 60:.2f} minutes.")

RAP_FEMALE
(1/218) || Time elapsed: 0.00 minutes.
Downloading images for: Aesja RAP ...
Downloading images for: Ak'Sent RAP ...
Downloading images for: Lexii Alijai RAP ...
Downloading images for: Amil (rapper) RAP ...
Downloading images for: Anquette RAP ...
Downloading images for: Antoinette (rapper) RAP ...
Downloading images for: Apani B RAP ...
Downloading images for: Asian Da Brat RAP ...
Downloading images for: Audra the Rapper RAP ...
Downloading images for: Whitney Avalon RAP ...
Downloading images for: Awkwafina RAP ...
Downloading images for: Cardi B RAP ...
Downloading images for: Bahamadia RAP ...
Downloading images for: Charli Baltimore RAP ...
Downloading images for: Azealia Banks RAP ...
Downloading images for: Dee Barnes RAP ...
Downloading images for: BbyMutha RAP ...
Downloading images for: Porcelain Black RAP ...
Downloading images for: Toni Blackman RAP ...
Downloading images for: Amanda Blank RAP ...
Downloading images for: Mary J. Blige RAP ...
Downloading images

In [33]:
# NOTE(review): looks like a leftover inspection cell — it needs `image_path` from a
# search whose keyword was exactly "Jay-Z"; confirm whether it is still needed.
url = image_path[0]["Jay-Z"][0]

In [None]:
# Download every URL in `image_urls` to ./data/images/<key>.<ext>.
# NOTE(review): `image_urls` is not defined in any visible cell; it is used here as a
# mapping of GENRE_ARTIST-style keys to image URLs — confirm where it is built.
t0 = time.time()
prefix = "./data/images/"
error_count = 0
for n, (genre_artist, url) in enumerate(image_urls.items()):
    # Format the filename
    ext = url.rsplit(".")[-1]
    fp = prefix + genre_artist + "." + ext
    if n % 10 == 0:
        print(f"({n + 1:3.0f}/{len(image_urls)}): {fp}")

    # Download the image
    r = requests.get(url)
    if r.status_code == 200:
        # r.content is the decoded response body; the context manager closes the file
        image = r.content
        with open(fp, "wb") as image_file:
            image_file.write(image)
    else:
        error_count += 1
        print(f"(n={n})\nBad status code: {r.status_code}")
        print(f"Time elapsed: {(time.time() - t0) / 60:.2f} minutes.")
        print(f"Error count: {error_count}")
        if error_count >= 3:
            # Stop the loop (the original `continue` kept going), and skip the long wait
            print("Too many errors. Quitting.")
            break
        time.sleep(3 * 60) # Wait three minutes
    time.sleep(2) # Wait two seconds
print(f"Total time elapsed: {(time.time() - t0) / 60:.2f} minutes.")

# Clean up the downloaded file names

In [94]:
from glob import glob
import os

In [108]:
# Truncate each downloaded file's path at the first "?" and then at the first "%".
# The directory is listed again for the second character so it sees the renamed files.
for char in ["?", "%"]:
    image_paths = glob("./face_image_downloads/*")
    new_names = [path.split(char)[0] for path in image_paths]
    for index, old in enumerate(image_paths):
        fp = new_names[index]
        os.rename(old, fp)