In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import numpy as np
import pandas as pd
import math

In [2]:
# Number of feature values read from every track's feature list by interpolateRow.
FEAT_NUM = 13
# Number of track slots every album is stretched to by interpolateRow; the
# pipeline filters out albums that have more tracks than this.
FIXED_TRACK_NUM = 24

In [3]:
# Schema of one record produced by loadFileAsList: four text columns followed
# by the list of numeric feature values of a single track.
# Line continuations are not needed inside the brackets.
albumSchema = StructType([
    # Grouping key of the pipeline (rows sharing an AI are merged into one album).
    StructField("AI", StringType()),
    # NOTE(review): presumably album name, track id and track name -- confirm
    # against the dataset description; this notebook never reads them.
    StructField("AN", StringType()),
    StructField("TI", StringType()),
    StructField("TN", StringType()),
    # Feature values of the track (every column after the fourth one).
    StructField("FTS", ArrayType(DoubleType())),
])

In [4]:
def loadFileAsList(path):
  """Parse a tab-separated track file into a list of records.

  Every non-blank line is stripped of trailing whitespace and split on tabs.
  The first four columns are kept as strings and every remaining column is
  converted to float and collected into one list.

  Args:
    path: Path of the text file to read.

  Returns:
    A list with one entry per non-blank line, each shaped
    [col0, col1, col2, col3, [float, float, ...]].

  Raises:
    IndexError: If a non-blank line has fewer than four columns.
    ValueError: If a column after the fourth cannot be parsed as a float.
  """
  albums = []
  # The context manager closes the file even when a line fails to parse.
  with open(path, 'r') as albumFile:
    for line in albumFile:
      columns = line.rstrip().split('\t')
      if columns == ['']:
        # Blank or whitespace-only line (e.g. a trailing newline at the end
        # of the file): nothing to parse, so skip it instead of crashing.
        continue
      features = [float(value) for value in columns[4:]]
      albums.append([columns[0], columns[1], columns[2], columns[3], features])
  return albums

In [5]:
def interpolateRow(row, featNum=None, fixedTrackNum=None):
  """Stretch an album's tracks over a fixed number of slots by interpolation.

  The tracks in row.FTS are spread, in the order given, over `fixedTrackNum`
  slots: the first track lands in slot 0, the last track in the last slot,
  and the empty slots in between are distributed as evenly as possible (the
  remainder is handed out one slot at a time to the earliest gaps). The empty
  slots of each feature are then filled with pandas' default (linear)
  interpolation. An album with a single track ends up with that track's
  values repeated in every slot.

  Args:
    row: Object with attributes `AI` (passed through unchanged) and `FTS`
      (a sequence of per-track feature sequences, each holding at least
      `featNum` values).
    featNum: Number of features to read per track. Defaults to the
      module-level FEAT_NUM.
    fixedTrackNum: Number of output slots. Defaults to the module-level
      FIXED_TRACK_NUM.

  Returns:
    [row.AI, slots] where `slots` is a list of `fixedTrackNum` lists, each
    holding `featNum` floats.

  Raises:
    ValueError: If the album has no tracks or more tracks than slots.
    IndexError: If a track holds fewer than `featNum` feature values.
  """
  if featNum is None:
    featNum = FEAT_NUM
  if fixedTrackNum is None:
    fixedTrackNum = FIXED_TRACK_NUM

  tracks = row.FTS
  trackCount = len(tracks)
  # Fail loudly: zero tracks leaves nothing to interpolate, and more tracks
  # than slots cannot be placed without silently misaligning the output.
  if trackCount == 0 or trackCount > fixedTrackNum:
    raise ValueError(
        'interpolateRow expects between 1 and %d tracks, got %d'
        % (fixedTrackNum, trackCount))

  # Slot index of every track. It depends only on the track count, so it is
  # computed once instead of once per feature.
  if trackCount == 1:
    # Separate case: the general formula would divide by zero.
    positions = [0]
  else:
    interSpace, additionalSpace = divmod(fixedTrackNum - trackCount,
                                         trackCount - 1)
    # Each track is followed by `interSpace` empty slots; the first
    # `additionalSpace` tracks are followed by one extra empty slot.
    positions = [k * (interSpace + 1) + min(k, additionalSpace)
                 for k in range(trackCount)]

  columns = []
  for i in range(featNum):
    # Sparse column: known values at the track positions, NaN elsewhere.
    column = [np.nan] * fixedTrackNum
    for position, track in zip(positions, tracks):
      column[position] = track[i]
    # Default interpolation fills interior and trailing gaps; a NaN in the
    # first slot (and any NaN run starting there) is left as NaN.
    columns.append(pd.Series(column).interpolate().tolist())

  # Transpose from feature-major columns to slot-major rows.
  slots = [[column[j] for column in columns] for j in range(fixedTrackNum)]
  return [row.AI, slots]

In [6]:
def stringify(data):
  """Render one interpolated album as a single tab-separated line.

  Args:
    data: Two-item sequence: data[0] is the leading text cell and data[1]
      is a sequence of sequences of numbers.

  Returns:
    data[0] followed by every number of data[1], in nested iteration order,
    each preceded by a tab and formatted with '%f'.
  """
  cells = ['\t%f' % value for trackSlot in data[1] for value in trackSlot]
  line = data[0]
  if cells:
    line += ''.join(cells)
  return line

In [7]:
%fs rm -r dbfs:/FileStore/tables/all.tsv

In [8]:
def buildInterpolatedAlbums(path):
  """Load one dataset file and return an RDD of interpolated albums.

  The file is parsed on the driver with loadFileAsList, its track rows are
  grouped by album id ('AI'), albums with more than FIXED_TRACK_NUM tracks are
  dropped, and every remaining album is passed through interpolateRow.

  NOTE(review): collect_list does not guarantee the order of the collected
  elements after the groupBy shuffle, so tracks may not reach interpolateRow
  in album order -- confirm whether an explicit sort is needed.
  """
  return (sqlContext
          .createDataFrame(loadFileAsList(path), albumSchema)
          .groupBy('AI')
          .agg(collect_list('FTS').alias('FTS'))
          .filter(size('FTS') <= FIXED_TRACK_NUM)
          .rdd.map(interpolateRow))


# First file of the series (the stray double slash after /dbfs was removed so
# the path matches the form used for the other files).
firstFilename = '/dbfs/FileStore/tables/dataset_7k_2010-f3f46.txt'

filenames = ['/dbfs/FileStore/tables/dataset_7k_2011-8b270.txt',
             '/dbfs/FileStore/tables/dataset_7k_2012-0a60d.txt',
             '/dbfs/FileStore/tables/dataset_7k_2013-83f9e.txt',
             '/dbfs/FileStore/tables/dataset_7k_2014-5df67.txt',
             '/dbfs/FileStore/tables/dataset_7k_2015-02ad0.txt',
             '/dbfs/FileStore/tables/dataset_7k_2016-adde7.txt',
             '/dbfs/FileStore/tables/dataset_7k_2017-e85f0.txt',
             '/dbfs/FileStore/tables/dataset_7k_2018-5803f.txt']

# Every file goes through the same pipeline; the results are unioned into one
# RDD. No intermediate RDD is cached because the union is consumed by a single
# action (the save below).
finalAlbums = buildInterpolatedAlbums(firstFilename)
for fn in filenames:
  finalAlbums = finalAlbums.union(buildInterpolatedAlbums(fn))

# saveAsTextFile fails if the target directory already exists, so the output
# location must be removed beforehand.
finalAlbums.map(stringify).saveAsTextFile('dbfs:/FileStore/tables/all.tsv')

In [9]:
%fs ls /FileStore/tables

path,name,size
dbfs:/FileStore/tables/2010.tsv/,2010.tsv/,0
dbfs:/FileStore/tables/all.tsv/,all.tsv/,0
dbfs:/FileStore/tables/apache.log,apache.log,4752760
dbfs:/FileStore/tables/dataset_7k_2010-f3f46.txt,dataset_7k_2010-f3f46.txt,22279458
dbfs:/FileStore/tables/dataset_7k_2011-8b270.txt,dataset_7k_2011-8b270.txt,22168915
dbfs:/FileStore/tables/dataset_7k_2012-0a60d.txt,dataset_7k_2012-0a60d.txt,21216838
dbfs:/FileStore/tables/dataset_7k_2013-83f9e.txt,dataset_7k_2013-83f9e.txt,20252714
dbfs:/FileStore/tables/dataset_7k_2014-5df67.txt,dataset_7k_2014-5df67.txt,18133298
dbfs:/FileStore/tables/dataset_7k_2015-02ad0.txt,dataset_7k_2015-02ad0.txt,17900276
dbfs:/FileStore/tables/dataset_7k_2016-adde7.txt,dataset_7k_2016-adde7.txt,14504502
