In [3]:
import pandas as pd
import numpy as np
import os
import tqdm
import shutil

In [4]:
# --- Run configuration -------------------------------------------------------
# Directory holding the raw input files, and the name of the binarized
# item-feature table that is read from that directory further down.
data_dir = '../data/m4a/'
feature_file = 'id_gems_binarized.tsv'

# When True, later cells cap the number of interaction rows they read and
# write into the "*_sample" output directory instead of the full one.
sample = False

In [5]:
# Load the user/track interaction log, attach integer ids for users and tracks,
# and export it as a header-less TSV (user, item, rating, timestamp).

# The sample run writes to its own directory so it never overwrites the full export.
output_dir = '../data/cat_emma_sample/' if sample else '../data/cat_emma/'

# Read the log from the configured data directory. For a sample run only the
# first million rows are read (integer row count); nrows=None reads everything.
df_interactions = pd.read_csv(
    os.path.join(data_dir, 'userid_trackid_timestamp_emma.tsv'),
    sep='\t',
    nrows=1_000_000 if sample else None,
)

# Every interaction receives the same constant rating of 5.
# NOTE(review): presumably the consumer of interactions.tsv requires a rating
# column even for implicit feedback -- confirm.
df_interactions['rating'] = 5

# Distinct original (string) track ids present in the loaded interactions.
sample_items = df_interactions['track_id'].unique()

# Map users and items to integer codes (categorical codes of the original ids).
df_interactions['user_id_int'] = df_interactions['user_id'].astype('category').cat.codes
df_interactions['track_id_int'] = df_interactions['track_id'].astype('category').cat.codes

# to_csv does not create missing directories, so make sure the target exists
# before writing; exist_ok keeps re-runs of this cell harmless.
os.makedirs(output_dir, exist_ok=True)
df_interactions[['user_id_int', 'track_id_int', 'rating', 'timestamp']].to_csv(
    os.path.join(output_dir, 'interactions.tsv'),
    index=False,
    sep='\t',
    header=False,
)

print(df_interactions.shape)
df_interactions.head()

(1486894, 6)


Unnamed: 0,user_id,track_id,timestamp,rating,user_id_int,track_id_int
0,52740,rFJl0J6qKPImZWOQ,2013-03-03 19:41:52,5,33056,204
1,52740,rFJl0J6qKPImZWOQ,2013-02-03 15:55:45,5,33056,204
2,52740,rFJl0J6qKPImZWOQ,2013-02-01 21:45:34,5,33056,204
3,52740,rFJl0J6qKPImZWOQ,2013-01-31 21:59:41,5,33056,204
4,52740,rFJl0J6qKPImZWOQ,2013-01-31 19:15:30,5,33056,204


In [6]:
# Lookup table from the original track_id to its integer code:
# one row per distinct (track_id, track_id_int) pair, indexed by track_id.
item_id_map = (
    df_interactions
    .loc[:, ['track_id', 'track_id_int']]
    .drop_duplicates()
    .set_index('track_id')
)
item_id_map.head()

Unnamed: 0_level_0,track_id_int
track_id,Unnamed: 1_level_1
rFJl0J6qKPImZWOQ,204
kduALxkKk2rXxDnp,177
aelKmXCNJPMQjJh8,143
OUb23fwEUhwC3Uec,92
9CHzAeSHv9tWaIXW,31


In [7]:
# Drop the notebook-level reference to the interactions frame so it can be
# garbage-collected. item_id_map and sample_items were derived from it in the
# cells above and are separate objects, so they remain usable.
# NOTE: once this has run, the cells above that read df_interactions cannot be
# re-run without first reloading the interactions.
del df_interactions

In [23]:
# Read the binarized item-feature table (columns labelled 0..8), keep only the
# tracks that have an entry in item_id_map (inner join on the track id index),
# and re-index the result by the integer track code in ascending order.
df = (
    pd.read_csv(
        os.path.join(data_dir, feature_file),
        index_col=0,
        sep='\t',
        names=list(range(9)),
    )
    .merge(item_id_map, left_index=True, right_index=True)
    .sort_values('track_id_int')
    .set_index('track_id_int')
)
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
track_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,0,1,1,0,1
1,1,1,1,1,1,0,0,1,0
2,0,0,1,1,1,0,0,1,0
3,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
243,1,1,1,1,1,1,0,0,0
244,0,0,1,1,1,0,0,1,0
245,1,1,1,1,1,0,0,1,0
246,1,0,1,1,1,0,0,1,0


In [31]:
# Elliot expects each categorical feature as its own number (0, 1, 2, 3, ...),
# so every item's one-hot row is turned into the list of column labels whose
# value is 1.
#
# The loop variable is called `item_id` rather than `id`: at notebook scope a
# variable named `id` would replace the built-in `id()` for the rest of the
# kernel session.
transformed_data = [
    {"id": item_id, "features": row.index[row == 1].tolist()}
    for item_id, row in df.iterrows()
]

# Naming the columns explicitly keeps set_index('id') working even when `df`
# has no rows (an empty record list would otherwise yield a column-less frame).
df_transformed = pd.DataFrame(transformed_data, columns=["id", "features"]).set_index('id')
df_transformed

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
0,"[5, 6, 8]"
1,"[0, 1, 2, 3, 4, 7]"
2,"[2, 3, 4, 7]"
3,"[6, 7, 8]"
4,"[6, 8]"
...,...
243,"[0, 1, 2, 3, 4, 5]"
244,"[2, 3, 4, 7]"
245,"[0, 1, 2, 3, 4, 7]"
246,"[0, 2, 3, 4, 7]"


In [49]:
# Write the per-item feature lists to a tab-separated file, one item per line:
#   <item id>\t<feature>\t<feature>...
#
# The sample run writes to its own directory. The earlier
# `df = df.reindex(sample_items).dropna()` step was dropped: `df` is indexed by
# the integer track code at this point while `sample_items` holds the original
# string track ids, so that reindex matched nothing and only emptied `df`;
# its result was also never used for the export below, which reads
# `df_transformed`.
output_dir = '../data/cat_emma_sample/' if sample else '../data/cat_emma/'

# open(..., 'w') does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

# Access the lists through the 'features' column label (not by position), and
# avoid naming a variable `id`, which would shadow the built-in.
feature_lines = [
    str(item_id) + '\t' + '\t'.join(map(str, feature_ids))
    for item_id, feature_ids in df_transformed['features'].items()
]

with open(os.path.join(output_dir, 'emma_cat_elliot_features.tsv'), 'w') as f:
    # Joining with '\n' keeps the file free of a trailing newline after the last item.
    f.write('\n'.join(feature_lines))