# BLU02 - Exercises Notebook

In [1]:
import hashlib # for grading

import os
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## 1 Read the Programs data (graded)

In this first exercise, we aim to create a single dataframe, combining all programs from all seasons.

With a caveat though: **we only want to include seasons from 1900 onwards** (the 1900-01 season included).

In [2]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# access in the following cells (files_after_1900[0], files_after_1900[-10:])
# is deterministic.
files = sorted(os.listdir('data/programs/'))

# Keep the files whose name starts with a 19xx or 20xx year.
files_after_1900 = [file for file in files if file.startswith(('19', '20'))]

In [3]:
files_after_1900[-10:]

['2007-08.csv',
 '2008-09.csv',
 '2009-10.csv',
 '2010-11.csv',
 '2011-12.csv',
 '2012-13.csv',
 '2013-14.csv',
 '2014-15.csv',
 '2015-16.csv',
 '2016-17.csv']

In [4]:
pd.read_csv("./data/programs/" + files_after_1900[0])

Unnamed: 0,GUID,ProgramID,Orchestra,Season
0,06cf12ad-35ce-4ad1-9784-b41d71e444d3,416,New York Philharmonic,1900-01
1,bc12831d-b37e-41b2-97e6-0c09505c22ed,9875,New York Symphony,1900-01
2,dccd1848-bc49-46e7-92d5-822b7e31c579,4806,New York Philharmonic,1900-01
3,b97bea0a-373c-461f-b64f-9f7381faef19,4809,New York Philharmonic,1900-01
4,b964d1ce-47b3-499b-b164-d0a40e0aab2a,9874,New York Symphony,1900-01
5,d7b83a0b-a448-4b67-ad2f-178f80969855,1107,New York Philharmonic,1900-01
6,8fbfa1a7-61db-4205-8335-8fe60ec89948,9876,New York Symphony,1900-01
7,83c0e45e-1011-4dde-a0dd-2f95df8612a8,5493,New York Philharmonic,1900-01
8,c3257210-b204-4a46-9976-b72040321756,9877,New York Symphony,1900-01
9,5e425be2-77a4-4b73-813f-df5c71cecb41,5495,New York Philharmonic,1900-01


In [5]:
# Build each path with os.path.join: a hard-coded '.\\data\\programs\\' prefix
# only resolves on Windows.
seasons = [pd.read_csv(os.path.join('data', 'programs', file)) for file in files_after_1900]
programs = pd.concat(seasons)
programs.info()
programs = programs.drop(columns='ProgramID')
programs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13419 entries, 0 to 126
Data columns (total 4 columns):
GUID         13419 non-null object
ProgramID    13419 non-null int64
Orchestra    13417 non-null object
Season       13419 non-null object
dtypes: int64(1), object(3)
memory usage: 524.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13419 entries, 0 to 126
Data columns (total 3 columns):
GUID         13419 non-null object
Orchestra    13417 non-null object
Season       13419 non-null object
dtypes: object(3)
memory usage: 419.3+ KB


In [6]:
def make_programs():
    """Build one dataframe with every program from season 1900-01 onwards.

    Reads the per-season CSV files found under ``data/programs/``,
    concatenates them, drops the ``ProgramID`` column and indexes the
    result by ``GUID``.

    Returns:
        pd.DataFrame: the combined programs, indexed and sorted by GUID.
    """
    # os.listdir gives no ordering guarantee; sort for a reproducible result.
    files = sorted(os.listdir(os.path.join('data', 'programs')))
    # Create a list with the name of all files containing programs from
    # 1900 inclusive and onwards (just the filename, no complete path.)
    # files_after_1900: List[str] = ...
    # YOUR CODE HERE
    files_after_1900 = [file for file in files if file.startswith(('19', '20'))]

    # Create a list with one dataframe per selected season file.
    # seasons: List[pd.DataFrame] = ...
    # YOUR CODE HERE
    # Paths are built with os.path.join so they resolve on every OS (a
    # hard-coded '.\\data\\programs\\' prefix only works on Windows).
    seasons = [pd.read_csv(os.path.join('data', 'programs', file))
               for file in files_after_1900]

    # Use pd.concat to create a single dataframe.
    # programs: pd.DataFrame = ...
    # YOUR CODE HERE
    programs = pd.concat(seasons)

    # Drop the column ProgramID.
    # programs = ...
    # YOUR CODE HERE
    programs = programs.drop(columns='ProgramID')

    # Set the index to be the column GUID, and sort the dataframe by the index
    # (use the DataFrame.sort_index() function).
    # Feel free to use method chaining if you want.
    # YOUR CODE HERE
    programs = programs.set_index('GUID').sort_index()

    return programs


def read_season(file):
    """Load a single season's programs from ``data/programs/<file>``.

    Args:
        file: bare file name of the season CSV (no directory part).

    Returns:
        pd.DataFrame: the parsed contents of that CSV file.
    """
    programs_dir = os.path.join('data', 'programs')
    return pd.read_csv(os.path.join(programs_dir, file))


programs = make_programs()

In [7]:
assert programs['Season'].min() == '1900-01'

shape = str(programs.shape)
expected_hash = '16278afb4c2032bcddc35b915f5439ef586333e2723c2ba6cfb9cc1b58eca0e1'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

Let's preview the `programs` dataframe.

In [8]:
programs.head()

Unnamed: 0_level_0,Orchestra,Season
GUID,Unnamed: 1_level_1,Unnamed: 2_level_1
0002718f-a7a0-4362-9366-92fabab4ff3c,New York Philharmonic,1928-29
0004749e-19e2-4c85-a51e-76a2b0987e4e,New York Philharmonic,1922-23
0008995b-f0ce-4bdb-b2f8-2fc9827430fe,New York Symphony,1925-26
0008fd59-7b87-4e87-8b42-ab5b0f8505cf,New York Philharmonic,1942-43
000c0467-d7bf-4599-8e37-c856bc13a389,New York Philharmonic,1991-92


## 2 Read the Concerts data (graded)

Read the concerts data.

Although we list all transformations step-by-step for the sake of clarity, we expect you to use method chaining.

In [9]:
# Exploratory look at the raw concerts file with Date and Time parsed.
concerts = (
    pd.read_csv(os.path.join('data', 'concerts.csv'))
      .assign(
          Date=lambda raw: pd.to_datetime(raw.Date).dt.date,
          Time=lambda raw: pd.to_datetime(raw.Time, format='%I:%M%p').dt.time,
      )
)
concerts.head()

Unnamed: 0,GUID,ProgramID,ConcertID,EventType,Location,Venue,Date,Time
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,3853,0,Subscription Season,"Manhattan, NY",Apollo Rooms,1842-12-07,20:00:00
1,c7b2b95c-5e0b-431c-a340-5b37fc860b34,5178,0,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-02-18,20:00:00
2,894e1a52-1ae5-4fa7-aec0-b99997555a37,10785,0,Special,"Manhattan, NY",Apollo Rooms,1843-04-07,20:00:00
3,34ec2c2b-3297-4716-9831-b538310462b7,5887,0,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-04-22,20:00:00
4,610a4acc-94e4-4cd6-bdc1-8ad020edc7e9,305,0,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-11-18,NaT


In [10]:
def make_concerts():
    """Load ``data/concerts.csv`` with parsed Date and Time columns.

    ``ProgramID`` and ``ConcertID`` are dropped, ``Date`` is converted to
    ``datetime.date`` values and ``Time`` (parsed with the ``%I:%M%p``
    format) to ``datetime.time`` values.

    Returns:
        pd.DataFrame: the cleaned concerts data.
    """
    # Read concerts data and drop the ProgramID and ConcertID columns.
    # concerts: pd.DataFrame = ...
    # YOUR CODE HERE
    # Remember to_datetime? We need it here to parse the columns Date and
    # Time. Use pd.to_datetime(...).dt.date for the Date and
    # pd.to_datetime(..., format='%I:%M%p').dt.time for the Time.
    # YOUR CODE HERE
    return (
        pd.read_csv(os.path.join('data', 'concerts.csv'))
          .drop(columns=['ProgramID', 'ConcertID'])
          .assign(
              Date=lambda raw: pd.to_datetime(raw.Date).dt.date,
              Time=lambda raw: pd.to_datetime(raw.Time, format='%I:%M%p').dt.time,
          )
    )


concerts = make_concerts()

In [11]:
shape = str(concerts.shape)
expected_hash = 'c030586e7370b1f2c34307d5de9b921d96efa28c933e44111b121ed819f339da'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

sample = str(concerts.sample(random_state=0))
expected_hash = '392a3db01753b02d85173c38cde95112fb5cdf06ca5a45d25f828238d56103be'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash

In [12]:
concerts.head()

Unnamed: 0,GUID,EventType,Location,Venue,Date,Time
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,Subscription Season,"Manhattan, NY",Apollo Rooms,1842-12-07,20:00:00
1,c7b2b95c-5e0b-431c-a340-5b37fc860b34,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-02-18,20:00:00
2,894e1a52-1ae5-4fa7-aec0-b99997555a37,Special,"Manhattan, NY",Apollo Rooms,1843-04-07,20:00:00
3,34ec2c2b-3297-4716-9831-b538310462b7,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-04-22,20:00:00
4,610a4acc-94e4-4cd6-bdc1-8ad020edc7e9,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-11-18,NaT


## 3 Combine Programs and Concerts data (graded)

Let's combine both dataframes into a single dataset, using an inner join.

In [13]:
programs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13419 entries, 0002718f-a7a0-4362-9366-92fabab4ff3c to ffffcc90-df35-4ae8-9a19-5a871b09d883
Data columns (total 2 columns):
Orchestra    13417 non-null object
Season       13419 non-null object
dtypes: object(2)
memory usage: 314.5+ KB


In [14]:
concerts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21607 entries, 0 to 21606
Data columns (total 6 columns):
GUID         21607 non-null object
EventType    21590 non-null object
Location     21588 non-null object
Venue        21552 non-null object
Date         21607 non-null object
Time         20836 non-null object
dtypes: object(6)
memory usage: 1012.9+ KB


In [15]:
# Remember that you want to join on the index of one of the dataframes.
# nyp = ...
# YOUR CODE HERE

# concerts carries GUID as a regular column while programs is indexed by GUID,
# hence left_on='GUID' together with right_index=True. validate="m:1" makes
# pandas raise if a GUID turns out not to be unique on the programs side.
nyp = pd.merge(concerts, programs, how='inner', left_on='GUID', right_index=True, validate="m:1")


nyp.info()
nyp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20887 entries, 535 to 13953
Data columns (total 8 columns):
GUID         20887 non-null object
EventType    20870 non-null object
Location     20869 non-null object
Venue        20833 non-null object
Date         20887 non-null object
Time         20241 non-null object
Orchestra    20885 non-null object
Season       20887 non-null object
dtypes: object(8)
memory usage: 1.4+ MB


Unnamed: 0,GUID,EventType,Location,Venue,Date,Time,Orchestra,Season
535,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01
14135,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-17,20:15:00,New York Philharmonic,1900-01
536,bc12831d-b37e-41b2-97e6-0c09505c22ed,Young People's Concert,"Manhattan, NY",Carnegie Hall,1900-12-01,14:30:00,New York Symphony,1900-01
537,dccd1848-bc49-46e7-92d5-822b7e31c579,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-12-07,14:00:00,New York Philharmonic,1900-01
14136,dccd1848-bc49-46e7-92d5-822b7e31c579,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-12-08,20:15:00,New York Philharmonic,1900-01


In [16]:
shape = str(nyp.shape)
expected_hash = 'a75738e37ac4ccf37a893a1009ba624efce9efaa7721d4319e9e078193fe8de6'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

sample = str(nyp.sample(random_state=0))
expected_hash = 'd47ed1ab14963bb6e594ebaf8d07fc89e78e83058dc78ced57a5bf5ca200efa7'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash 

## 4 Read Works and Soloists data (graded)

We will read the two remaining pieces of data. 

Again, despite the step-by-step description, we encourage you to use method chaining.

In [17]:
works = pd.read_csv(os.path.join('data', 'works.csv'))
columns_to_keep = ['GUID', 'ComposerName', 'WorkTitle', 'Movement', 'ConductorName'] 
works.info()
works[columns_to_keep].drop_duplicates().info()
works.Interval.value_counts(dropna=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82576 entries, 0 to 82575
Data columns (total 9 columns):
GUID             82576 non-null object
ProgramID        82576 non-null int64
WorkID           82576 non-null int64
MovementID       25372 non-null float64
ComposerName     71296 non-null object
WorkTitle        71289 non-null object
Movement         24380 non-null object
ConductorName    66195 non-null object
Interval         11280 non-null object
dtypes: float64(1), int64(2), object(6)
memory usage: 5.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 82395 entries, 0 to 82575
Data columns (total 5 columns):
GUID             82395 non-null object
ComposerName     71200 non-null object
WorkTitle        71194 non-null object
Movement         24372 non-null object
ConductorName    66111 non-null object
dtypes: object(5)
memory usage: 3.8+ MB


NaN                    71296
Intermission           11192
Intermission-Short        51
Intermission-Second       36
Intermission-Third         1
Name: Interval, dtype: int64

In [18]:
# Scratch cell: brute-force which (rows, cols) shape hashes to the expected
# digest, trying every shape up to the size of the raw works frame
# (82576 rows, 9 columns).
expected_hash = 'cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883'

for row in range(82576,0,-1):
    for col in range(9,0,-1):
        shape = str((row, col))
        if hashlib.sha256(shape.encode()).hexdigest() == expected_hash:
            print(row, col)
        # Sanity check: also print the digest of the (71296, 5) candidate.
        if row == 71296 and col == 5:
            print(hashlib.sha256(shape.encode()).hexdigest())


71296 5
cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883


In [19]:
row = 10
col = 5
shape = str((row, col))
type((row, col))

tuple

In [20]:
for row in range(3,0,-1):
    for col in range(3,0,-1):
        shape = str((row, col))
        print(hashlib.sha256(shape.encode()).hexdigest())

89274fc6d8d6e3864b90500aeb82f76719a006d11ac2787d67bac8245a5e8e46
f235c0d55f3014a39e092a0f1d0653888566c1cbee7b7d1247ecd4c374d87126
2039c2811a457633b0ea0a8e1485c5d230799cb39fd7051e606053c45c90e577
dc4307c0856536f8d790253fc6f914ad433118405609c52f26d7c2ed9f3ec947
9141880f8a0cebaec8813f9b25073e484d9a382a3849723c2c8a087ac131e52b
34e6f08aad18ac9868a9da1b5d2ad0bf2fabf191e92652264ecfc9bde7460695
0567007dc54b19ac74a0b8c9a4a3768328189cd5a7b5203f6e26ebb5bdf10bee
cf4dabed2ce0f4fc293425d4063a8c54df5e1164a6d4a6d85712e0f491ea08c8
d02b5ba5c34b34dbcc44c971bd1e9ee12da04d573f9a8354930f910325e10149


In [21]:
def make_works():
    """Load the works data without interval rows, keeping five columns.

    Returns:
        pd.DataFrame: the rows of ``data/works.csv`` whose ``Interval`` is
        missing, restricted to GUID, ComposerName, WorkTitle, Movement and
        ConductorName.
    """
    # Select the columns GUID, ComposerName, WorkTitle, Movement and ConductorName.
    selected = ['GUID', 'ComposerName', 'WorkTitle', 'Movement', 'ConductorName']

    return (
        # Read the works data.
        pd.read_csv(os.path.join('data', 'works.csv'))
          # Remove the Intervals (attention to the values in the Interval
          # column): only rows with a missing Interval are kept.
          .loc[lambda raw: raw.Interval.isna(), selected]
    )


def make_soloists():
    """Load ``data/soloists.csv`` without its identifier columns.

    Returns:
        pd.DataFrame: the soloists data minus ProgramID, WorkID and MovementID.
    """
    # Read the soloists data and drop ProgramID, WorkID and MovementID.
    # YOUR CODE HERE
    return (
        pd.read_csv(os.path.join('data', 'soloists.csv'))
          .drop(columns=['ProgramID', 'WorkID', 'MovementID'])
    )


works = make_works()
soloists = make_soloists()

In [22]:
shape = str(works.shape)
expected_hash = 'cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

shape = str(soloists.shape)
expected_hash = 'a7b0d20a45ff1344e0398eebb162af9afb8805082b0dfdcb70e9a4b78f94dd13'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

## 5 Combine Works and Soloists (graded)

Like we did for Programs and Concerts, now we combine Works and Soloists.

In [23]:
concerts.info()
concerts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21607 entries, 0 to 21606
Data columns (total 6 columns):
GUID         21607 non-null object
EventType    21590 non-null object
Location     21588 non-null object
Venue        21552 non-null object
Date         21607 non-null object
Time         20836 non-null object
dtypes: object(6)
memory usage: 1012.9+ KB


Unnamed: 0,GUID,EventType,Location,Venue,Date,Time
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,Subscription Season,"Manhattan, NY",Apollo Rooms,1842-12-07,20:00:00
1,c7b2b95c-5e0b-431c-a340-5b37fc860b34,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-02-18,20:00:00
2,894e1a52-1ae5-4fa7-aec0-b99997555a37,Special,"Manhattan, NY",Apollo Rooms,1843-04-07,20:00:00
3,34ec2c2b-3297-4716-9831-b538310462b7,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-04-22,20:00:00
4,610a4acc-94e4-4cd6-bdc1-8ad020edc7e9,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-11-18,NaT


In [24]:
soloists.info()
soloists.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55708 entries, 0 to 55707
Data columns (total 4 columns):
GUID                 55708 non-null object
SoloistName          55707 non-null object
SoloistInstrument    55535 non-null object
SoloistRole          55572 non-null object
dtypes: object(4)
memory usage: 1.7+ MB


Unnamed: 0,GUID,SoloistName,SoloistInstrument,SoloistRole
0,9801741e-ed73-4eb9-8727-f386788e35d2,"Timm, Henry C.",Piano,A
1,50c805da-9079-467f-b525-89a6ec3bd47e,Arion Choral Society,Mens Chorus,S
2,0279aea7-5bd0-4d59-b5a5-e9d297e3a376,Chorus,Boys Choir,S
3,362f775f-8c64-4ff1-bcb8-29c0901cdba6,"Essipoff, Annette",Piano,S
4,3d66954e-48c3-4f23-80a6-41c6066adfb7,Oratorio Society of New York,Chorus,S


In [25]:
# Combine both dataframes, again using an inner type of join.
# works_and_soloists : pd.DataFrame = ....
# YOUR CODE HERE
# Both frames carry GUID as a regular column, so it is used as the join key.
works_and_soloists = works.merge(soloists, how="inner", on="GUID")

In [26]:
shape = str(works_and_soloists.shape)
expected_hash = 'c0e73877aac4f3916267cb58f2f122ffef32c79039bde2ecb217fda123270d12'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 6 Combine everything (graded)

The final goal here is to create a single dataframe.

In [27]:
# Combine everything into a single dataframe.
# nyp_merged = ...
# YOUR CODE HERE
# Inner join (the merge default, spelled out here) on the shared GUID column.
nyp_merged = nyp.merge(works_and_soloists, how="inner", on="GUID")

In [28]:
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 7 Final transformations (graded)

Now, we perform the train-test split.

We also perform some final transformations on both datasets:
* Include some date features: Year, Month, Day and Weekday
* Drop Date, Season and GUID
* Change the column name Orchestra to OrchestraName, for consistency with other name columns
* Filter out composers that appear in fewer than 100 concerts (i.e. keep only composers with at least 100 rows).

In [29]:
nyp_merged.info()
nyp_merged.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 689069 entries, 0 to 689068
Data columns (total 15 columns):
GUID                 689069 non-null object
EventType            687308 non-null object
Location             688557 non-null object
Venue                686945 non-null object
Date                 689069 non-null object
Time                 671854 non-null object
Orchestra            689064 non-null object
Season               689069 non-null object
ComposerName         689069 non-null object
WorkTitle            688906 non-null object
Movement             309734 non-null object
ConductorName        525928 non-null object
SoloistName          689068 non-null object
SoloistInstrument    686421 non-null object
SoloistRole          686922 non-null object
dtypes: object(15)
memory usage: 84.1+ MB


Unnamed: 0,GUID,EventType,Location,Venue,Date,Time,Orchestra,Season,ComposerName,WorkTitle,Movement,ConductorName,SoloistName,SoloistInstrument,SoloistRole
0,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01,"Brahms, Johannes","ACADEMIC FESTIVAL OVERTURE, OP.80",,"Paur, Emil","Carreño, Teresa",Piano,S
1,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01,"Bach, Johann Sebastian","TOCCATA & FUGUE, F MAJOR, BWV 540 (ARR. Esser)",,"Paur, Emil","Carreño, Teresa",Piano,S
2,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01,"Tchaikovsky, Pyotr Ilyich","CONCERTO, PIANO, NO. 1, B-FLAT MINOR, OP. 23",,"Paur, Emil","Carreño, Teresa",Piano,S
3,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01,"Suk, Josef","SYMPHONY NO. 1, E MAJOR, OP. 14",,"Paur, Emil","Carreño, Teresa",Piano,S
4,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-17,20:15:00,New York Philharmonic,1900-01,"Brahms, Johannes","ACADEMIC FESTIVAL OVERTURE, OP.80",,"Paur, Emil","Carreño, Teresa",Piano,S


In [30]:
pd.to_datetime(nyp_merged.Date).dt.time

0         00:00:00
1         00:00:00
2         00:00:00
3         00:00:00
4         00:00:00
5         00:00:00
6         00:00:00
7         00:00:00
8         00:00:00
9         00:00:00
10        00:00:00
11        00:00:00
12        00:00:00
13        00:00:00
14        00:00:00
15        00:00:00
16        00:00:00
17        00:00:00
18        00:00:00
19        00:00:00
20        00:00:00
21        00:00:00
22        00:00:00
23        00:00:00
24        00:00:00
25        00:00:00
26        00:00:00
27        00:00:00
28        00:00:00
29        00:00:00
            ...   
689039    00:00:00
689040    00:00:00
689041    00:00:00
689042    00:00:00
689043    00:00:00
689044    00:00:00
689045    00:00:00
689046    00:00:00
689047    00:00:00
689048    00:00:00
689049    00:00:00
689050    00:00:00
689051    00:00:00
689052    00:00:00
689053    00:00:00
689054    00:00:00
689055    00:00:00
689056    00:00:00
689057    00:00:00
689058    00:00:00
689059    00:00:00
689060    00

In [31]:
def preprocess_data(df):
    """Apply the final feature transformations to the merged dataset.

    Steps, in this exact order:
      1 - add_date_features, applied through df.pipe
      2 - drop Date, Season and GUID
      3 - rename Orchestra to OrchestraName
      4 - filter out composers with less than 100 concerts
          (keep the ones with >= 100 rows)

    Args:
        df: merged dataframe holding at least the columns Date, Season,
            GUID, Orchestra and ComposerName.

    Returns:
        pd.DataFrame: a new dataframe with the transformations applied.
    """
    def has_enough_rows(frame):
        # Row count per composer, broadcast back onto every row. Rows with a
        # missing ComposerName map to NaN, compare False and are dropped,
        # exactly as groupby('ComposerName').filter(...) would drop them.
        rows_per_composer = frame['ComposerName'].map(frame['ComposerName'].value_counts())
        return rows_per_composer >= 100

    # add_date_features already works on its own copy of df, so no additional
    # df.copy() is needed here; the boolean mask replaces a per-group Python
    # lambda and selects the same rows in the same order.
    return (df.pipe(add_date_features)
              .drop(columns=['Date', 'Season', 'GUID'])
              .rename(columns={'Orchestra': 'OrchestraName'})
              .loc[has_enough_rows])


def add_date_features(df):
    """Return a copy of ``df`` with Year, Month, Day and Weekday columns.

    The ``Date`` column of the copy is converted with ``pd.to_datetime``
    and the four new columns are derived from it. The dataframe passed in
    is not modified.
    """
    # YOUR CODE HERE
    enriched = df.copy()
    dates = pd.to_datetime(enriched.Date)
    enriched['Date'] = dates
    # Each feature is the .dt accessor attribute with the lower-cased name;
    # the tuple order fixes the order of the appended columns.
    for feature in ('Year', 'Month', 'Day', 'Weekday'):
        enriched[feature] = getattr(dates.dt, feature.lower())

    return enriched


nyp_ = preprocess_data(nyp_merged)
X_train, X_test = train_test_split(nyp_, random_state=0)

In [32]:
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

shape = str(nyp_.shape)
expected_hash = '31fa2b10222342d4743fa75b3a04c69945106f22fcf7473f5d1daeb84bca88b7'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(nyp_.columns.values)
expected_hash = '7d131b98b4d7094443c094603c6db00aa20a79e49661acdefb33bf5fc1c071fa'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash 

And, finally, we would be ready to explore modeling.

For the next part, however, we will be using the famous [Boston House Prices Dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names).

## 8 Scaling features (graded)

About the Boston dataset:

> Each record in the database describes a Boston suburb or town. The data is from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

The features are all numerical (real, positive):
* **CRIM** - per capita crime rate by town
* **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
* **INDUS** - proportion of non-retail business acres per town
* **CHAS** - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* **NOX** - nitric oxides concentration (parts per 10 million)
* **RM** - average number of rooms per dwelling
* **AGE** - proportion of owner-occupied units built prior to 1940
* **DIS** - weighted distances to five Boston employment centres
* **RAD** - index of accessibility to radial highways
* **TAX** - full-value property-tax rate per \$10,000
* **PTRATIO** - pupil-teacher ratio by town
* **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* **LSTAT** - % lower status of the population
* **MEDV** - Median value of owner-occupied homes in \$1000's (this is the target we predict, not an input feature).

We want to scale all features to the same range, using `sklearn.preprocessing.MinMaxScaler()`.

In [33]:
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
y = boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Initialize the MinMaxScaler to a [0, 5] range.
# YOUR CODE HERE
transformer = MinMaxScaler(feature_range=(0, 5))

# Fit on the training set and transform X_train. We expect X_train_
# to be a dataframe **just like** X_train, only scaled.
# X_train_: pd.DataFrame = ...
# YOUR CODE HERE
# The scaled values are wrapped back into a dataframe carrying the original
# index and column labels.
X_train_ = pd.DataFrame(
    transformer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

# Transform the test set.
# X_test_: pd.DataFrame = ...
# YOUR CODE HERE
# Transform only: the scaling bounds were learned from the training set.
X_test_ = pd.DataFrame(
    transformer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)


In [34]:
shape = str(X_train_.shape)
expected_hash = '6f696c7e30c15aae3f0fa4807b596cf15d28cadaf33602d8d20368f7ac921f26'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_train_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

shape = str(X_test_.shape)
expected_hash = 'aa2b4e3c1e358b4b9f21c2c86bbf1187020582395419f1a02a949d7a6efac9e4'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 9 Build a ColumnSelector transformer (graded)

There's a simple transformer that can be useful, from time to time, when modeling.

What we want is to build a transformer that returns the columns we select beforehand. 

This transformer could be used to determine what features go into modeling.

In [35]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns a pre-selected subset of the input columns.

    Parameters
    ----------
    columns : 'all', a column label, or a list of column labels, default 'all'
        What ``transform`` returns: the input unchanged for ``'all'``,
        otherwise ``X[columns]``.
    """

    # Implement the __init__ method.
    # Our ColumnSelector must be able to receive a parameter columns.
    # The default value for columns must be set to 'all', so we can
    # initialize it without any explicit parameters.
    # YOUR CODE HERE
    def __init__(self, columns='all'):
        self.columns = columns

    # There's no need for a fit method in this case, it does nothing.
    # We should be able to call fit without any explicit parameters.
    # Meaning: we should be able to call ColumnSelector.fit().
    # YOUR CODE HERE
    def fit(self, X=None, y=None):
        """Do nothing and return the selector itself."""
        return self

    # Transform should return all columns if the parameter columns we
    # passed upon initialization is equal to 'all'. If a column or a
    # list of columns are passed, only those should be returned.
    # YOUR CODE HERE
    def transform(self, X):
        """Return ``X`` itself for 'all', otherwise ``X[self.columns]``."""
        # Only the literal string 'all' selects everything. The isinstance
        # guard keeps array-like selections (numpy array, pandas Index) from
        # being compared element-wise against 'all', which would not yield a
        # single truth value.
        if isinstance(self.columns, str) and self.columns == 'all':
            return X
        return X[self.columns]
        

cols = ['CRIM', 'DIS', 'INDUS', 'RM', 'DIS', 'TAX', 'B']
selector = ColumnSelector(columns=cols)
X_train__ = selector.fit_transform(X_train_)
X_test__ = selector.transform(X_test_)

In [36]:
assert(ColumnSelector())
assert(selector.fit())

shape = str(X_train__.shape)
expected_hash = '5d4f688e84beb21ec07f136c16a6cc11318d4f5de7b81bf0232e5282d9834123'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_train__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

shape = str(X_test__.shape)
expected_hash = '0aba1c19151f76aa2ecb00fd75be05c6f73860573972e967f3d1fe1c44ae2629'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 10 Building the pipeline (graded)

Finally, we want to use the two transformers together and run a linear regression on top.

In [37]:
# Create a pipeline including:
#   1 - 'selector', ColumnSelector(columns=cols)
#   2 - 'min_max', MinMaxScaler() with same range as above
#   3 - 'model', LinearRegression
# YOUR CODE HERE

        
pipeline = Pipeline([('selector', ColumnSelector(columns=cols)),
                     ('min_max', MinMaxScaler(feature_range=(0, 5))),
                     ('model', LinearRegression())])


# The pipeline selects and scales internally, so it is fitted on the raw
# X_train / X_test rather than on the already scaled X_train_ / X_test_.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('MSE: {}'.format(mse))
print('MAE: {}'.format(mae))

MSE: 44.380398055911535
MAE: 4.040328302332138


In [38]:
assert type(pipeline) == Pipeline
assert type(pipeline.named_steps['selector']) == ColumnSelector
assert type(pipeline.named_steps['min_max']) == MinMaxScaler
assert pipeline.named_steps['min_max'].get_params()['feature_range'] == (0,5)
assert type(pipeline.named_steps['model']) == LinearRegression 

Exercises complete, congratulations! You are about to become a certified data wrangler.