## Scraping 


#### Getting a list of subpages for each instrument

Let's get a list of all the available instrument names and webpages for sample banks from the 2012 collection on the University of Iowa Music Department website

In [2]:
import requests 
from bs4 import BeautifulSoup 

# Root of the University of Iowa site that hosts the sample banks; MIS.html is
# the index page whose menu is parsed in the following cells.
base_url = 'http://theremin.music.uiowa.edu/'
# A timeout keeps the cell from hanging forever if the server never answers.
req = requests.get(base_url+'MIS.html', timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
req.raise_for_status()

soup = BeautifulSoup(req.content, 'html5lib')
print(soup.prettify())

<html>
 <head>
  <title>
   University of Iowa Electronic Music Studios
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="file:///Sites/Unnamed Site 1/CSS/Level1_Arial.css" rel="stylesheet" type="text/css"/>
  <style type="text/css">
   <!--
.style1 {
	color: #0000ff;
	font-family: Verdana, Geneva, sans-serif;
}
.style2 {
	font-size: 10.5pt;
	font-family: Verdana, Geneva, sans-serif;
}
.style1 a {
	color: #0000FF;
}
.style1 a {
	font-family: Verdana, Geneva, sans-serif;
	font-size: 10.5pt;
	text-align: center;
}
.ems {
	font-family: Verdana, Geneva, sans-serif;
}
.ems {
	font-family: Verdana, Geneva, sans-serif;
}
.style1 b {
	font-family: Verdana, Geneva, sans-serif;
}
.style3 {
	font-family: Verdana, Geneva, sans-serif;
}
.style1 td .style2 u {
	font-size: 10.5pt;
	text-align: center;
}
.style1 td p .style2 {
	font-size: 10.5px;
}
.hell {
	font-size: 10.5pt;
}
.style4 {
	font-size: 10.5px;
}
.w {
	text-align: left;
}
.bcccc {
	text-align:

In [3]:
# Walk down from the element with id 'MenuBar1' by child position.
# NOTE(review): the positional indices depend on the exact markup of MIS.html;
# presumably this lands on the instruments menu entry - confirm if the page changes.
instruments = soup.find(id='MenuBar1').contents[3].contents[2]
instruments_unfiltered = instruments.contents[3].select('a')
# Keep only anchors that point into the 2012 pitched-sample pages.
# .get() tolerates anchors that carry no href attribute at all.
instrument_links = [
    anchor for anchor in instruments_unfiltered
    if anchor.get('href', '').startswith('MIS-Pitches-2012')
]

In [4]:
# Build (display name, relative URL) pairs from the filtered menu links.
# strip('()012 ') trims any of the characters ( ) 0 1 2 and space from both
# ends of the link text - presumably to drop a trailing "(2012)"; verify.
# The final [:-4] discards the last four entries of the menu.
instruments = [
    (str(link.contents[0]).strip('()012 '), link.attrs['href'])
    for link in instrument_links
][:-4]
instruments

[('Flute', 'MIS-Pitches-2012/MISFlute2012.html'),
 ('Alto Flute', 'MIS-Pitches-2012/MISaltoflute2012.html'),
 ('Bass Flute', 'MIS-Pitches-2012/MISBassFlute2012.html'),
 ('Oboe', 'MIS-Pitches-2012/MISOboe2012.html'),
 ('Eb Clarinet', 'MIS-Pitches-2012/MISEbClarinet2012.html'),
 ('Bb Clarinet', 'MIS-Pitches-2012/MISBbClarinet2012.html'),
 ('Bass Clarinet', 'MIS-Pitches-2012/MISBbBassClarinet2012.html'),
 ('Bassoon', 'MIS-Pitches-2012/MISBassoon2012.html'),
 ('Bb Soprano Saxophone', 'MIS-Pitches-2012/MISBbSopranoSaxophone2012.html'),
 ('Eb Alto Saxophone', 'MIS-Pitches-2012/MISEbAltoSaxophone2012.html'),
 ('Horn', 'MIS-Pitches-2012/MISHorn2012.html'),
 ('Bb Trumpet', 'MIS-Pitches-2012/MISBbTrumpet2012.html'),
 ('Tenor Trombone', 'MIS-Pitches-2012/MISTenorTrombone2012.html'),
 ('Bass Trombone', 'MIS-Pitches-2012/MISBassTrombone2012.html'),
 ('Tuba', 'MIS-Pitches-2012/MISTuba2012.html'),
 ('Violin', 'MIS-Pitches-2012/MISViolin2012.html'),
 ('Viola', 'MIS-Pitches-2012/MISViola2012.html'),
 (

We've filtered out unpitched percussion instruments and cleaned up the instrument names so that they are easier to match against other sample banks later. There are individual samples for each pitch in the 2012 data set, but it is easier to download the zip files. For many instruments there are multiple zip files for piano, mezzo forte, and forte samples. Some instruments have additional expressions, like tremolo or pizzicato. We'll have to decide whether to treat these as "the same" instrument or as "different" instruments for our analysis. For now, we will download them all and build in the categorization as an additional feature in our database.

In [5]:
# Maps instrument display name -> list of absolute URLs of its .zip archives.
instrument_zips = {}

for instr, rel_url in instruments:
    # NOTE: this rebinds the notebook-level `req` and `soup` names on every
    # iteration, exactly as the original cell did.
    req = requests.get(base_url + rel_url, timeout=30)
    # Fail loudly on a 4xx/5xx answer; otherwise an error page would simply
    # yield an empty archive list for this instrument.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html5lib')
    zip_links_unfiltered = soup.find_all('a')
    # .get() tolerates anchors that have no href attribute.
    zip_links = [
        anchor for anchor in zip_links_unfiltered
        if anchor.get('href', '').endswith('.zip')
    ]
    # The first three characters of each href are dropped before prefixing the
    # site root - presumably a leading '../'; verify against the page markup.
    zip_urls = [base_url + anchor.attrs['href'][3:] for anchor in zip_links]
    instrument_zips[instr] = zip_urls

In [6]:
def download_file(url, target_path, chunk_size=64 * 1024):
    """Stream the resource at ``url`` into the local file ``target_path``.

    Progress is reported on stdout: a "Downloading ..." line up front, then
    either "Done." or an "Error downloading:" line naming the exception class.

    :param url: Address to fetch with an HTTP GET.
    :param target_path: Path of the file to (over)write with the response body.
    :param chunk_size: Number of bytes requested per streamed chunk.
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status.
    :raises Exception: Any other failure is reported and then re-raised.
    """
    print(f'Downloading {url} to {target_path} ...' , end=' ')
    try:
        # Both context managers guarantee that the connection and the file are
        # released even when the transfer fails half-way.
        with requests.get(url, stream=True) as response:
            # Without this check an HTML error page would be saved as the archive.
            response.raise_for_status()
            with open(target_path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        handle.write(chunk)
    except Exception as err:
        # type(err) is what sys.exc_info()[0] would report, without needing
        # the `sys` module (which this notebook never imports).
        print("Error downloading:", type(err))
        raise
    print('Done.')

In [7]:
from zipfile import ZipFile
from os import remove

def unzip_and_delete_archive(directory, zip_filename, subfolder):
    """Extract an archive into a sub-folder and delete the archive afterwards.

    Paths are built by plain string concatenation, so ``directory`` must end
    with a path separator.

    :param directory: Folder that contains the archive.
    :param zip_filename: File name of the archive inside ``directory``.
    :param subfolder: Name of the extraction target, relative to ``directory``.
    :raises Exception: Any failure while opening or extracting the archive is
        reported on stdout and re-raised; the archive is then left in place.
    """
    archive_path = directory + zip_filename
    print(f'Unzipping {zip_filename} ...', end=' ')
    try:
        # Opening the archive is inside the try so a corrupt file is reported too.
        with ZipFile(archive_path, 'r') as zipObj:
            zipObj.extractall(directory + subfolder)
    except Exception as err:
        # type(err) is what sys.exc_info()[0] would report, without needing
        # the `sys` module (which this notebook never imports).
        print("Error unzipping:", type(err))
        raise
    print('Done.')
    # Only reached after a successful extraction.
    remove(archive_path)

### Create Database for Samples

In [15]:
import config
import mysql
import mysql.connector
from mysql.connector import errorcode

# DB FUNCTIONS

def connect(db_name=None):
    """Open a MySQL connection with the credentials stored in ``config``.

    :param db_name: Optional database to select on connect; when it is falsy
        the connection is opened without a default database.
    :return: A ``(cursor, connection)`` tuple.
    """
    settings = {
        'host': config.rds_host,
        'user': config.rds_user,
        'passwd': config.rds_password,
    }
    # Only pass `database` when the caller actually asked for one.
    if db_name:
        settings['database'] = db_name
    cnx = mysql.connector.connect(**settings)
    return cnx.cursor(), cnx

def create_database_helper(cursor, db_name):
    """Issue ``CREATE DATABASE`` for ``db_name`` with a utf8 default charset.

    :param cursor: Cursor on which the statement is executed.
    :param db_name: Name of the database; it is interpolated directly into the
        statement text, so it must be a trusted identifier.

    A ``mysql.connector.Error`` is reported on stdout and followed by
    ``exit(1)``; success is reported on stdout as well.
    """
    ddl = f"CREATE DATABASE {db_name} DEFAULT CHARACTER SET 'utf8'"
    try:
        cursor.execute(ddl)
    except mysql.connector.Error as err:
        print("Failed creating database: {}".format(err))
        exit(1)
    else:
        print(f"Database {db_name} created successfully.")

def create_database(db_name):
    """Make sure the database ``db_name`` exists, creating it if necessary.

    A ``USE`` statement is attempted first. If the server answers with
    ``ER_BAD_DB_ERROR`` the database is created through
    ``create_database_helper``; any other MySQL error is printed and followed
    by ``exit(1)``.

    :param db_name: Name of the database; it is interpolated directly into the
        statement text, so it must be a trusted identifier.
    """
    cursor, cnx = connect()
    try:
        cursor.execute("USE {}".format(db_name))
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_BAD_DB_ERROR:
            # Only claim the database is missing when the server said so.
            print("Database {} does not exist.".format(db_name))
            create_database_helper(cursor, db_name)
            cnx.database = db_name
        else:
            print(err)
            exit(1)
    finally:
        # Release the connection on every path, including the error exits.
        cnx.close()
    


def insert_to_dir(list_of_four, cursor):
    """Insert one ``(id, name, url, sign)`` row into ``top_directors``.

    :param list_of_four: The four values, in column order, bound to the
        statement's placeholders.
    :param cursor: Cursor on which the statement is executed. Nothing is
        committed here.

    NOTE(review): no visible cell creates a ``top_directors`` table and nothing
    visible calls this function - it looks like a leftover; confirm.
    """
    query = (
        "\n"
        "    INSERT INTO top_directors \n"
        "    (id, name, url, sign)\n"
        "    VALUES (%s, %s, %s, %s)\n"
        "    "
    )
    cursor.execute(query, list_of_four)

In [18]:
# Ensure the 'instruments' database exists; it is created when the server
# reports that it is missing.
create_database('instruments')

In [19]:
# One row per uploaded audio sample. sample_id is assigned by the database and
# is also used as the object name of the file in S3 (see upload_sample).
# IF NOT EXISTS makes this cell safe to re-run after the table was created.
create_sample_table_query = """CREATE TABLE IF NOT EXISTS samples (
    sample_id int NOT NULL AUTO_INCREMENT,
    instrument_name varchar(255) NOT NULL,
    note varchar(255) NOT NULL,
    expression varchar(255),
    source varchar(255),
    file_extension varchar(255),
    PRIMARY KEY (sample_id)
    )"""

cur, cnx = connect('instruments')
try:
    cur.execute(create_sample_table_query)
finally:
    # Close the connection even when the statement fails.
    cnx.close()

### Using AWS S3 for storing audio samples

In [8]:
import logging
import boto3
from botocore.exceptions import ClientError


def upload_file(file_name, bucket='instrument-samples-1337', object_name=None):
    """Send a local file to an S3 bucket.

    :param file_name: Path of the local file to upload
    :param bucket: Name of the destination bucket
    :param object_name: Key to store the file under; falls back to file_name
        when it is None
    :return: False if a ClientError was raised (it is logged), True otherwise
    """
    # Default the object key to the local file name.
    key = file_name if object_name is None else object_name

    # Credentials are read from the project's config module.
    client = boto3.client('s3',
                          aws_access_key_id=config.aws_access_key_id,
                          aws_secret_access_key=config.aws_secret_access_key)
    try:
        client.upload_file(file_name, bucket, key)
    except ClientError as e:
        logging.error(e)
        return False
    else:
        return True

In [9]:


def upload_sample(file_name, instrument_name, note, expression, source):
    """Register a sample in ``instruments.samples`` and upload its file to S3.

    The row is inserted first so that its auto-generated id can be used as the
    S3 object name (``<sample_id>.<extension>``). The insert is committed only
    if the upload succeeded; otherwise it is rolled back so the table never
    references a file that is not in the bucket.

    :param file_name: Path of the local audio file; the text after its last
        '.' is stored as the file extension.
    :param instrument_name: Value for the ``instrument_name`` column.
    :param note: Value for the ``note`` column.
    :param expression: Value for the ``expression`` column.
    :param source: Value for the ``source`` column.
    :return: True if the file was uploaded and the row committed, False if
        ``upload_file`` reported a failure.
    :raises Exception: Errors from the database or the upload are re-raised.
    """
    file_extension = file_name.split('.')[-1]
    cur, cnx = connect()
    try:
        # Bound parameters instead of string formatting: values containing
        # quotes can neither break nor inject into the statement.
        cur.execute(
            """INSERT INTO instruments.samples
                   (instrument_name, note, expression, source, file_extension)
               VALUES (%s, %s, %s, %s, %s)""",
            (instrument_name, note, expression, source, file_extension))
        sample_id = cur.lastrowid
        print(f'Attemping to upload sample {file_name}, id={sample_id} ...', end=' ')
        try:
            uploaded = upload_file(file_name, object_name=f"{sample_id}.{file_extension}")
            if uploaded:
                cnx.commit()
        except Exception:
            print("Upload failed :'(")
            raise
        if not uploaded:
            # upload_file signals a logged ClientError by returning False.
            cnx.rollback()
            print("Upload failed :'(")
            return False
        print('Success!')
        return True
    finally:
        # Single exit point for releasing the connection on every path.
        cnx.close()

    

In [10]:
from os import rmdir, listdir
from shutil import rmtree

def extract_Iowa2012_metadata(file_name):
    """Parse a sample file name into the metadata fields stored per sample.

    Names containing ``'.ff.'`` are split around that marker: the first
    dot-separated field before it is the instrument, the second one (if any)
    is the expression, and the first field after it is the note. All other
    names are read positionally: field 0 is the instrument, field 1 the
    expression and field 2 the note.

    NOTE(review): in the positional branch a name whose field 2 is something
    other than a pitch would have that field stored as the note - confirm this
    is intended for every file the caller passes in.

    :param file_name: Bare file name (no directory part).
    :return: Dict with the keys ``instrument_name``, ``note``, ``expression``,
        ``source`` (always ``'Iowa2012'``) and ``file_extension``.
    :raises IndexError: In the positional branch, if the name has fewer than
        three dot-separated fields.
    """
    if '.ff.' in file_name:
        before, after = file_name.split('.ff.')[:2]
        before_fields = before.split('.')
        after_fields = after.split('.')
        return {
            'instrument_name': before_fields[0],
            'note': after_fields[0],
            'expression': before_fields[1] if len(before_fields) > 1 else '',
            'source': 'Iowa2012',
            'file_extension': after_fields[-1],
        }

    fields = file_name.split('.')
    return {
        'instrument_name': fields[0],
        'note': fields[2],
        'expression': fields[1],
        'source': 'Iowa2012',
        'file_extension': fields[-1],
    }

def upload_Iowa2012_samples(directory):
    """Upload every sample file found in ``directory``, then delete the folder.

    Only entries whose name has more than four dot-separated fields are
    treated as samples. Metadata for all of them is parsed before the first
    upload starts. The directory tree is removed only after every upload call
    has returned.

    :param directory: Folder holding the extracted samples; it is joined to
        file names by plain concatenation, so it must end with a separator.
    """
    # "more than four fields" is the same as "at least four dots".
    sample_names = [entry for entry in listdir(directory) if entry.count('.') >= 4]
    parsed = [extract_Iowa2012_metadata(entry) for entry in sample_names]
    for sample_name, fields in zip(sample_names, parsed):
        upload_sample(directory + sample_name,
                      fields['instrument_name'],
                      fields['note'],
                      fields['expression'],
                      fields['source'])
    rmtree(directory)

Manually create directory ./Iowa2012 before running this next cell.

In [16]:
from os import makedirs

# All archives are downloaded into, and unpacked below, this working folder.
directory = './Iowa2012/'
# Create it up front so the cell also runs on a fresh checkout.
makedirs(directory, exist_ok=True)

for inst, zip_urls in instrument_zips.items():
    # One sub-folder per instrument: the instrument name with spaces removed.
    subdirectory_name = inst.replace(' ', '') + '/'
    for url in zip_urls:
        # The archive keeps the last path segment of its URL as file name.
        zip_filename = url.split('/')[-1]
        download_file(url, directory + zip_filename)
        unzip_and_delete_archive(directory, zip_filename, subdirectory_name)
        # Uploads every extracted sample and then deletes the sub-folder again.
        upload_Iowa2012_samples(directory + subdirectory_name)

Downloading http://theremin.music.uiowa.edu/sound files/MIS Pitches - 2014/Percussion/Bells/bells.plastic.ff.stereo.zip to ./Iowa2012/bells.plastic.ff.stereo.zip ... Done.
Unzipping bells.plastic.ff.stereo.zip ... Done.
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.A5.stereo.aif, id=2231 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.G7.stereo.aif, id=2232 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.F6.stereo.aif, id=2233 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.Eb7.stereo.aif, id=2234 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.Bb7.stereo.aif, id=2235 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.C6.stereo.aif, id=2236 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.D8.stereo.aif, id=2237 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.plastic.ff.Db7.stereo.aif, id=2238 ... Success!
A

Attemping to upload sample ./Iowa2012/Bells/bells.brass.ff.Gb7.stereo.aif, id=2311 ... Success!
Attemping to upload sample ./Iowa2012/Bells/bells.brass.ff.F7.stereo.aif, id=2312 ... Success!
Downloading http://theremin.music.uiowa.edu/sound files/MIS Pitches - 2014/Percussion/Crotales/Crotale.ff.stereo.zip to ./Iowa2012/Crotale.ff.stereo.zip ... Done.
Unzipping Crotale.ff.stereo.zip ... Done.
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.C7.stereo.aif, id=2313 ... Success!
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.Db6.stereo.aif, id=2314 ... Success!
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.B6.stereo.aif, id=2315 ... Success!
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.G6.stereo.aif, id=2316 ... Success!
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.Eb6.stereo.aif, id=2317 ... Success!
Attemping to upload sample ./Iowa2012/Crotales/Crotale.ff.Bb6.stereo.aif, id=2318 ... Success!
Attemping to upload sample ./Iowa2012