# Download artist images

We want images of faces for rap, rock, and country artists.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import json
import time
import re
from collections import Counter
import pickle
import cv2
import wikipedia
from google_images_download import google_images_download

In [2]:
# Change the Matplotlib defaults
colors = np.array([(182,109,255),(218,109,0),(0,146,146)])/255
plt.rcParams.update({'font.size': 16, 'figure.figsize': (12.0, 6.0)})
plt.style.use('seaborn')
%matplotlib inline

# Collect lists of artist names from various sources

## Get artist names from Wikipedia

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
categories = ["Category:American_female_country_singers",
              "Category:American_male_rappers",
              "Category:American_female_rappers"]

In [5]:
# Provide the content category and starting URL
root = "https://en.wikipedia.org"
artist_names = {}
total = 0
for category in categories:
    print("\n" + category)
    url = f"{root}/wiki/{category}"
    key = category.split(":")[-1].lower()

    # Start searching through pages
    reached_last_page = False
    while not reached_last_page:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Find the names on the page
        letter_groups = soup.find_all(class_="mw-category-group")[1:]
        for group in letter_groups:
            names_in_group = list(map(lambda x: x.text, group.find_all("li")))
            names_in_group = [name for name in names_in_group if not name.startswith("►")]
            total += len(names_in_group)
            artist_names.setdefault(key, []).extend(names_in_group)
        print(names_in_group[-4:])

        # Find the link to the next page
        prev_next_links = soup.find_all(title=category.replace("_", " "))
        if prev_next_links and "next" in prev_next_links[-1].text:
            prev_next_links = prev_next_links[-1]
            url = root + prev_next_links.get("href") # URL for the next page
        else:
            print("Reached the last page.")
            reached_last_page = True
        time.sleep(0.5)
print(f"Found {total} total names.")


Category:American_female_country_singers
['Connie Francis', 'Paula Frazer', 'Adrianna Freeman', 'Dori Freeman']
['Moonshine Kate', 'Abra Moore', 'Allison Moorer', 'Heather Morgan (songwriter)']
['Chickie Williams', 'Dar Williams', 'Holly Williams', 'Joy Williams (singer)']
['Pia Zadora', 'Andrea Zonn']
Reached the last page.

Category:American_male_rappers
['Meechy Darko', 'Datin (rapper)', 'Deacon the Villain', 'Menace Demarco']
['Rob Sonic', 'Souleye (hip hop artist)', 'Bubba Sparxxx', 'Speak!']
['Jonny Z', 'VZilla', 'Zombie Juice']
Reached the last page.

Category:American_female_rappers
['Shawnna', 'Magnolia Shorty', 'Shunda K', 'Natalie Sims']
['Yo-Yo (rapper)', 'Yoon Mi-rae', 'Young M.A', 'Maimouna Youssef']
Reached the last page.
Found 1234 total names.


## Female rock singers

In [6]:
category = "List_of_female_rock_singers"
root = "https://en.wikipedia.org"
url = f"{root}/wiki/{category}"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ul = soup.find_all("ul")
names = []
for section in ul[1:26]:
    names.extend(map(lambda x: x.text, section.find_all("li")))
print(f"Found {len(names)} total names.")
artist_names.setdefault("female_rock_singers", []).extend(names)

Found 610 total names.


## Male rock singers

In [7]:
url = "https://digitaldreamdoor.com/pages/best_vocalists.html"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all("table", class_="t7")[0]
columns = table.find_all("td", class_="td16a")
names = []
for column in columns:
    names.extend(list(map(lambda x: x.split(".")[-1].strip(), column.text.strip().split("\n"))))
print(f"Found {len(names)} total names.")
artist_names.setdefault("male_rock_singers", []).extend(names)    

Found 200 total names.


## Male country singers

In [8]:
url = "https://www.countrystartpage.com/music-directory/male/"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
div = soup.find(class_="uk-margin-remove-top uk-grid-margin uk-margin-remove-top")
names = [item.text.split("/")[0].strip() for item in div.find_all("li")]
print(f"Found {len(names)} total names.")
artist_names.setdefault("male_country_singers", []).extend(names)

Found 904 total names.


# Combine the different name sources

In [9]:
all_artist_names = {}
all_artist_names["RAP_FEMALE"] = artist_names["american_female_rappers"]
all_artist_names["RAP_MALE"] = artist_names["american_male_rappers"]
all_artist_names["ROCK_FEMALE"] = artist_names["female_rock_singers"]
all_artist_names["ROCK_MALE"] = artist_names["male_rock_singers"]
all_artist_names["COUNTRY_FEMALE"] = artist_names["american_female_country_singers"]
all_artist_names["COUNTRY_MALE"] = artist_names["male_country_singers"]
print(f"Collected a total of {sum(map(len, all_artist_names.values()))} artist names.")

Collected a total of 2948 artist names.


In [12]:
with open("artist_names.pickle", "wb") as outfile:
    pickle.dump(all_artist_names, outfile)
print("Saved the artist names dictionary to a pickle.")

Saved the artist names dictionary to a pickle.


# Download images while checking for faces

## google_images_download

https://google-images-download.readthedocs.io/en/latest/

In [13]:
from skimage import io
import dlib

In [14]:
response = google_images_download.googleimagesdownload()

In [30]:
# Used for finding faces in the image
detector = dlib.get_frontal_face_detector()

In [70]:
# Get the image URLs for the artist
t0 = time.time()
output_dir = "./face_image_downloads_v2/"
# for genre_gender, artist_names in all_artist_names.items():
d = {key:all_artist_names[key] for key in ["ROCK_MALE", "COUNTRY_MALE", "COUNTRY_FEMALE"]}
for genre_gender, artist_names in d.items():
    print(genre_gender)
    genre, gender = genre_gender.split("_")
    for n, name in enumerate(artist_names):
        name = name.replace(",", "")
        try:
            query = name + " " + genre
            if n % 50 == 0:
                print(f"({n+1}/{len(artist_names)}) || Time elapsed: {(time.time() - t0) / 60:.2f} minutes.")

            # Google Image Search
            prefix = genre_gender
            args = {"keywords": query, "limit": 3, "prefix": prefix, "silent_mode": True,
                    "output_directory": output_dir, "no_directory": True, "delay": 0.1,
                    "save_source": "face_image_urls", "print_urls": False,
                    "no_download": True, "size": "medium", "color_type": "full-color"}
            image_path = response.download(args)
            prefix = output_dir + prefix

            # Check the image URLs for a fOace
            for url in image_path[0][query]:
                print(url)
                try:
                    # Read the image URL
                    image = io.imread(url)[...,::-1]
                   
                    # Try to detect a face in the image
                    found_array = detector(image, 1)
                    if len(face_array) == 1: # Only download images with a single face
                        # Format the filename
                        ext = url.rsplit(".")[-1]
                        fp = output_dir + genre + "_" + gender + "_" + name + "." + ext
                        fp = fp.split("?", 1)[0]

                        # Save the image
                        print("hi")
                        cv2.imwrite(fp, image)
                        break
                except Exception as e:
                    print("ERROR")
                    print(e)
                    break
            time.sleep(2.5) # Wait a little bit
            break
        except Exception as e:
            print(f'big wait: {e}')
            time.sleep(1)
            break
    break
print(f"Total time elapsed: {(time.time() - t0) / 60:.2f} minutes.")

ROCK_MALE
(1/200) || Time elapsed: 0.00 minutes.
Downloading images for: Freddie Mercury (Queen Solo) ROCK ...
https://www.billboard.com/files/styles/article_main_image/public/media/freddie-mercury-queen-1982-r-billboard-1548.jpg
ERROR
HTTP Error 403: Forbidden
Total time elapsed: 0.05 minutes.


# Clean up the downloaded file names

In [94]:
from glob import glob
import os

In [108]:
for char in ["?", "%"]:
    image_paths = glob("./face_image_downloads/*")
    new_names = list(map(lambda x: x.split(char)[0], image_paths))
    for old, fp in zip(image_paths, new_names):
        os.rename(old, fp)