In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from sklearn import preprocessing
from sklearn import model_selection

In [3]:
# pip install ipynb

In [4]:
def remove_uninformative_attrs(df):
    """Return only the audio-feature columns (plus ``category``) of ``df``.

    Args:
        df: DataFrame that contains at least the columns listed in
            ``relevant_cols`` below; any other columns are discarded.

    Returns:
        A new DataFrame holding the selected columns in the listed order.
        It is an explicit copy, so callers can add or overwrite columns on
        it without touching ``df`` and without pandas' chained-assignment
        (``SettingWithCopyWarning``) complaints.

    Raises:
        KeyError: If any of the expected columns is missing from ``df``.
    """
    relevant_cols = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature', 'category',
    ]
    # .copy() decouples the result from the source frame; the loader mutates it.
    return df[relevant_cols].copy()

In [5]:
def load_dataset(path='../data', split=True, scale=True):
    """Load every ``p_*`` file under ``path`` into one labelled feature table.

    Each matching file is read as CSV, labelled with a ``category`` taken
    from its file name (the leading ``p_`` and the last four characters,
    e.g. ``.csv``, are stripped) and reduced to the feature columns via
    ``remove_uninformative_attrs``. On the combined table ``mode`` is
    encoded as 0 for ``'minor'`` and 1 for anything else, ``key`` is
    replaced by one indicator column per distinct key value, and rows that
    still contain missing values are dropped.

    Args:
        path: Directory scanned for files whose name starts with ``p_``.
        split: If True, return the output of ``split_data``; otherwise
            return the combined DataFrame.
        scale: If True (only consulted when ``split`` is True), pass the
            feature splits through ``scale_data``.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` when ``split`` is True,
        otherwise the combined DataFrame.

    Raises:
        ValueError: If ``path`` contains no ``p_*`` files.
    """
    # Sorted so row/column order does not depend on the arbitrary order in
    # which the operating system lists the directory.
    file_names = sorted(name for name in os.listdir(path) if name.startswith('p_'))
    if not file_names:
        raise ValueError("no files starting with 'p_' found in {!r}".format(path))

    frames = []
    for name in file_names:
        frame = pd.read_csv(os.path.join(path, name))
        # "p_<category>.csv" -> "<category>" (assumes a 4-character extension).
        frame['category'] = name[2:-4]
        frames.append(remove_uninformative_attrs(frame))

    df = pd.concat(frames)

    # Anything that is not exactly 'minor' (including a missing value) maps
    # to 1, matching the original element-wise rule.
    df = df.assign(mode=(df['mode'] != 'minor').astype(int))

    # One-hot encode `key` once over the combined table. Encoding per file
    # gave each file only the key columns it happened to contain; after
    # concatenation the absent columns were NaN and dropna() then discarded
    # every row of such a file.
    key_dummies = pd.get_dummies(df['key'])
    df = pd.concat([df.drop('key', axis=1), key_dummies], axis=1)

    df = df.dropna()

    if not split:
        return df

    x_train, x_test, y_train, y_test = split_data(df)
    if scale:
        x_train, x_test = scale_data(x_train, x_test)
    return (x_train, x_test, y_train, y_test)

In [6]:
def scale_data(x_train, x_test):
    """Scale both feature sets with a ``StandardScaler`` fitted on ``x_train``.

    The scaler never sees ``x_test`` while fitting, so the test set is
    transformed with the statistics learned from the training set only.

    Args:
        x_train: Training features; used to fit the scaler.
        x_test: Test features; only transformed.

    Returns:
        A 2-tuple ``(x_train_transformed, x_test_transformed)``.
    """
    fitted_scaler = preprocessing.StandardScaler().fit(x_train)
    return tuple(fitted_scaler.transform(part) for part in (x_train, x_test))

In [7]:
def split_data(df, random_state=None):
    """Split ``df`` into stratified 70/30 train and test sets.

    The ``category`` column is used as the target (and as the
    stratification key); every other column is treated as a feature.

    Args:
        df: DataFrame containing a ``category`` column.
        random_state: Forwarded to ``train_test_split``. Pass an int to make
            the split reproducible; the default ``None`` keeps the previous
            behaviour of an unseeded shuffle.

    Returns:
        ``[x_train, x_test, y_train, y_test]`` as produced by
        ``model_selection.train_test_split``.
    """
    features = df.drop(columns='category')
    labels = df['category']
    return model_selection.train_test_split(
        features,
        labels,
        train_size=0.7,
        test_size=0.3,
        stratify=labels,
        random_state=random_state,
    )