In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import time

### Guideline
To use this notebook, set the `dataset` variable below to the dataset file for the year you would like to preprocess.

Make sure to save this notebook in the same directory as the datasets

In [None]:
# File name of the dataset to preprocess; change this to process another year.
dataset = "dataset-7k-2010.txt"
# Path is built relative to the current directory, so the dataset must sit next to the notebook.
dataset_dir = os.path.join("./", dataset)
dataset_dir

In [None]:
# The dataset is tab-separated and has no header row, so column names are assigned manually.
data = pd.read_csv(dataset_dir, sep="\t", header=None)
# One row per track: album/track identifiers followed by the per-track audio features.
data.columns= ["album_id", "album_name", "track_id", "track_name", "acousticness", "danceability", "duration (ms)", \
               "energy", "instrumentalness", "key", "liveness", "loudness", "mode", "speechiness", "tempo", \
               "time_signature", "valence"]

In [None]:
# Preview the first 15 tracks to sanity-check the column assignment.
data.head(15)

In [None]:
# Distinct album names present in the dataset.
unique_albums = list(data['album_name'].unique())
unique_albums

In [None]:
# Number of distinct album names.
print(len(unique_albums))

In [None]:
# Same count computed directly by pandas. This counts distinct album *names*:
# albums that share a name but differ in album_id are counted once.
num_albums = data['album_name'].nunique()
num_albums

In [None]:
# Group tracks by album name. A scalar key (rather than a one-element list) is used
# so that the scalar lookups below, e.g. data_grouped.get_group('Recovery'), keep
# working on recent pandas versions, which expect a tuple for list-style groupers.
data_grouped = data.groupby('album_name')

### Initial Data Exploration 
Please comment out the next two cells when processing any dataset other than 2010.

Alternatively, you can test out other albums in the corresponding year

In [None]:
# For testing (2010)
# Pull every track of one known 2010 album; get_group raises KeyError if the
# album name is not present in the loaded dataset.
bruno_mars_album = data_grouped.get_group('Doo-Wops & Hooligans')
bruno_mars_album

In [None]:
# Extract a single feature column of the sample album as a NumPy array;
# it is used further down as example input for construct_feature_vector.
acousticness = bruno_mars_album['acousticness']
feature_values = np.array(acousticness)
feature_values

Columns to keep: the feature vectors plus `album_name` and `album_id`. `track_id` and `track_name` are not needed, because interpolated rows would not have appropriate values for them.

In [None]:
# Shared 0..23 index: used as the index of every 24-slot feature vector
# and as the x-axis of the interpolation plots.
indexes = pd.Series(range(0, 24))

In [None]:
def construct_feature_vector(feature_values, fv_size=24):
    """
    Spread an album's per-track feature values over a fixed-length vector,
    leaving NaN in the slots that are to be filled by interpolation later.

    Layout rules (for the default fv_size of 24):
      * empty album           -> all slots are NaN
      * size <= fv_size // 2  -> values are placed every (fv_size // size) slots,
                                 all other slots (including any tail) are NaN
      * size <  fv_size       -> the first (fv_size - size) values are each
                                 followed by one NaN gap, the rest are packed
                                 back-to-back so the vector is exactly full
      * size >= fv_size       -> the first fv_size values are used as-is

    @params: feature_values: sequence (list / ndarray) with one value per track
    @params: fv_size: length of the resulting vector (default 24)
    @returns: feat_vector: pd.Series of length fv_size indexed 0..fv_size-1
    @raises: ValueError if fv_size is not a positive integer
    """
    if fv_size < 1:
        raise ValueError("fv_size must be a positive integer")

    album_size = len(feature_values)
    # Build the index locally so the function does not depend on notebook globals.
    slot_index = pd.RangeIndex(fv_size)

    if album_size == 0:
        # Previously this case crashed with ZeroDivisionError when computing the step.
        return pd.Series(np.nan, index=slot_index, dtype=float)

    if album_size >= fv_size:
        # Enough tracks to fill every slot: truncate to the vector length.
        feat_vector = list(feature_values[:fv_size])
    elif album_size <= fv_size // 2:
        # Sparse album: place one value every `step_size` slots, NaN elsewhere.
        step_size = fv_size // album_size
        feat_vector = [np.nan] * fv_size
        for position, value in zip(range(0, fv_size, step_size), feature_values):
            feat_vector[position] = value
    else:
        # Dense album: only (fv_size - album_size) gaps are available, so the
        # leading values get one NaN gap each and the remainder is packed tightly.
        spaced_feats = fv_size - album_size
        feat_vector = []
        for value in feature_values[:spaced_feats]:
            feat_vector.append(value)
            feat_vector.append(np.nan)
        feat_vector.extend(feature_values[spaced_feats:])

    return pd.Series(feat_vector, index=slot_index)
        

In [None]:
# testcases: one input per size regime handled by construct_feature_vector
smallerthan12 = list(range(8))
test_smallerthan12 = construct_feature_vector(smallerthan12)
largerthan12 = list(range(15))
test_largerthan12 = construct_feature_vector(largerthan12)
exactly24 = list(range(24))
test_exactly24 = construct_feature_vector(exactly24)
largerlist = list(range(30))
test_greater24 = construct_feature_vector(largerlist)

# Every result must be a 24-slot vector ...
for result in (test_smallerthan12, test_largerthan12, test_exactly24, test_greater24):
    assert len(result) == 24
# ... that keeps every input value (up to 24) and pads only with NaN.
assert test_smallerthan12.notna().sum() == 8
assert test_largerthan12.notna().sum() == 15
assert test_exactly24.tolist() == list(range(24))
assert test_greater24.tolist() == list(range(24))

print(test_smallerthan12, test_largerthan12, test_exactly24, test_greater24)

In [None]:
# Try the function on real data: the acousticness values of the sample 2010 album.
# Requires `feature_values` from the 2010-only exploration cell.
testvector = construct_feature_vector(feature_values)
testvector

In [None]:
# Group by (album_name, album_id) so that albums sharing a name can be told apart by id.
new_group = data.groupby(['album_name', 'album_id'])
new_group

### Testing the data
Comment out the following five code cells for other datasets, or alternatively test with other albums from the corresponding year.

In [None]:
# All tracks whose album is named 'Recovery', regardless of album_id (2010 dataset only).
recovery_albums = data_grouped.get_group('Recovery')
recovery_albums

In [None]:
# Select one specific 'Recovery' album by its (album_name, album_id) pair.
eminem_album = new_group.get_group(('Recovery', '2bn6QpvG94QJkQTETzjTCs'))
eminem_album

In [None]:
# For testing (2010)
# Expand the album's acousticness values into a 24-slot vector with NaN gaps.
eminem_acc = list(eminem_album['acousticness'])
eminem_acc_vector = construct_feature_vector(eminem_acc)
eminem_acc_vector

In [None]:
# Fill the NaN gaps of the same vector with four different interpolation methods
# so they can be compared visually in the next cell.
# NOTE(review): per the pandas docs, the non-linear methods are delegated to SciPy,
# so SciPy presumably needs to be installed for this cell - confirm.
quad_interpolated = eminem_acc_vector.interpolate('quadratic')
linear_interpolated = eminem_acc_vector.interpolate('linear')
fd_interpolated = eminem_acc_vector.interpolate('from_derivatives')
poly_interpolated = eminem_acc_vector.interpolate('polynomial', order=5)

In [None]:
# One figure per interpolation method (figure numbers 1-4, as before). Each figure
# is now titled and labelled so the methods can be told apart when skimming.
interpolation_results = [
    ("quadratic", quad_interpolated),
    ("linear", linear_interpolated),
    ("from_derivatives", fd_interpolated),
    ("polynomial (order=5)", poly_interpolated),
]
for figure_number, (method_label, interpolated) in enumerate(interpolation_results, start=1):
    plt.figure(figure_number)
    plt.plot(indexes, interpolated)
    plt.title("Acousticness - {} interpolation".format(method_label))
    plt.xlabel("slot index")
    plt.ylabel("acousticness")
plt.show()

### Preparing for creating the dataframe

In [None]:
# Start from every column of the raw data; track-level columns are removed in the next cell.
col_list = list(data.columns)
col_list

In [None]:
# Drop the track-level identifiers. The list is rebuilt from `data.columns` instead
# of being mutated with list.remove(), so re-running this cell is safe
# (remove() raised ValueError on a second run because the names were already gone).
col_list = [column for column in data.columns if column not in ('track_name', 'track_id')]

In [None]:
# Show the remaining columns: album identifiers followed by the audio features.
col_list

In [None]:
# Keep only the columns listed in col_list (album identifiers + audio features).
filtered_data = data[col_list]
filtered_data

In [None]:
# One group per (album_id, album_name) pair; `groups` holds those key tuples.
filtered_grouped_data = filtered_data.groupby(['album_id', 'album_name'])
groups = list(filtered_grouped_data.groups.keys())
# Skips the first two entries of col_list, which are album_id and album_name
# (column order set when the data was loaded), leaving only the feature columns.
feature_cols = col_list[2:]

In [None]:
def expand_to_vector(category):
    """Return a list holding `category` repeated 24 times (one entry per feature-vector slot)."""
    return [category] * 24

In [None]:
def interpolate_values(feat_vector):
    """Return a copy of `feat_vector` whose NaN gaps are filled by pandas' default interpolation."""
    return feat_vector.interpolate()

### Creating the resultant dataframe

In [None]:
# Build one 24-row frame per album (identifiers repeated, features interpolated)
# and stack them into `resultdf`.
resultdf = pd.DataFrame(columns=col_list)
frames = []
# time.clock() was removed in Python 3.8; perf_counter() is the supported timer.
tic = time.perf_counter()
for i, group in enumerate(groups):
    album = filtered_grouped_data.get_group(group)
    # Group keys are (album_id, album_name) tuples.
    a_id, a_name = group
    temp_dict = {
        'album_id': expand_to_vector(a_id),
        'album_name': expand_to_vector(a_name),
    }
    for feature in feature_cols:
        album_features = list(album[feature])
        album_feat_vector = construct_feature_vector(album_features)
        temp_dict[feature] = interpolate_values(album_feat_vector)
    frames.append(pd.DataFrame.from_dict(temp_dict))
    if (i % 50 == 0):
        print("Finished {} albums".format(i))
# Concatenate once after the loop: re-concatenating all frames on every iteration
# was quadratic in the number of albums. With no albums the empty frame is kept.
if frames:
    resultdf = pd.concat(frames, ignore_index=True)
toc = time.perf_counter()
exec_time = toc - tic
print("Execution time is {}".format(exec_time))

### Result
Please make sure to change the result filename to the appropriate year

In [None]:
# Output file for the converted dataset; rename it to match the year being processed.
result_filename = "2010-converted-dataset.csv"

In [None]:
# Note: the file is written tab-separated despite the .csv extension, and the
# DataFrame index is included as the first column (pandas default).
resultdf.to_csv(result_filename, sep='\t')