# Clean Dataset and create subset for KG creation

In [1]:
import pandas as pd
import numpy as np
import csv
import time
import sys
import json
import joblib
import os
import datetime

from tqdm import tqdm

In [2]:
users = pd.read_csv('lfm/users.tsv', sep='\t')
tracks = pd.read_csv('lfm/tracks.tsv', sep='\t')
listening_events = pd.read_csv('lfm/listening_events.tsv', sep='\t')
albums = pd.read_csv('lfm/albums.tsv', sep='\t')

In [3]:
len(tracks)

4079421

In [4]:
len(listening_events)

30357786

## Clean Dataset

<b> Remove users which gender is not defined and whose age is not element of [10;90] </b>

In [5]:
l = len(users)
users = users[users.gender.isin(['m', 'f']) & (users.age >= 10) & (users.age <= 95)]
print('Removed %s users' % (len(users) - l))

Removed -5163 users


<b> Remove now unassigned listening events </b>

In [6]:
l = len(listening_events)
listening_events = listening_events[listening_events.user_id.isin(users.user_id)]
print('Removed %s listening events' % (len(listening_events) - l))

Removed -9725559 listening events


<b> remove duplicate listening events </b>

In [7]:
listening_events = listening_events.sort_values('timestamp')
old_length_le = len(listening_events)
listening_events = listening_events.drop_duplicates(subset=['user_id', 'track_id'])
print(len(listening_events)-old_length_le)

-10636671


<b> 10-core filtering </b>

In [8]:
n_core = 10
while True:
    start_number = len(listening_events)

    # Item pass
    item_counts = listening_events.track_id.value_counts()
    item_above = set(item_counts[item_counts >= n_core].index)
    listening_events = listening_events[listening_events.track_id.isin(item_above)]
    print('Records after item pass: ', len(listening_events))

    # User pass
    user_counts = listening_events.user_id.value_counts()
    user_above = set(user_counts[user_counts >= n_core].index)
    listening_events = listening_events[listening_events.user_id.isin(user_above)]
    print('Records after user pass: ', len(listening_events))

    if len(listening_events) == start_number:
        print('Exiting...')
        break

Records after item pass:  4937270
Records after user pass:  4935462
Records after item pass:  4934458
Records after user pass:  4934458
Records after item pass:  4934458
Records after user pass:  4934458
Exiting...


In [9]:
users = users[users.user_id.isin(set(listening_events.user_id))]
tracks = tracks[tracks.track_id.isin(set(listening_events.track_id))]

In [10]:
users.sample(1)

Unnamed: 0,user_id,country,age,gender,creation_time
1935,5162,UK,38,m,2006-05-10 00:28:13


In [11]:
len(listening_events)

4934458

In [12]:
listening_events = listening_events[listening_events.user_id.isin(users.user_id)]
listening_events = listening_events[listening_events.track_id.isin(tracks.track_id)]

In [13]:
len(listening_events)

4931626

In [14]:
listening_events.head()

Unnamed: 0,user_id,track_id,album_id,timestamp
2,16026,40012596,18558010,2020-01-01 00:00:01
3,42410,14250568,8916172,2020-01-01 00:00:01
5,39972,33291229,6630145,2020-01-01 00:00:01
7,7744,18394900,19722005,2020-01-01 00:00:02
14,14807,21889387,16391976,2020-01-01 00:00:03


In [15]:
listening_events.timestamp = listening_events.timestamp.apply(lambda x: int(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").timestamp()))

In [16]:
listening_events.head()

Unnamed: 0,user_id,track_id,album_id,timestamp
2,16026,40012596,18558010,1577833201
3,42410,14250568,8916172,1577833201
5,39972,33291229,6630145,1577833201
7,7744,18394900,19722005,1577833202
14,14807,21889387,16391976,1577833203


In [17]:
if not os.path.exists('lfm/cleaned'):
    os.makedirs('lfm/cleaned')

In [18]:
tracks.to_csv('lfm/cleaned/tracks.tsv', sep='\t', index=False)
listening_events.to_csv('lfm/cleaned/listening_events.tsv', sep='\t', index=False)
users.to_csv('lfm/cleaned/users.tsv', sep='\t', index=False)
albums.to_csv('lfm/cleaned/albums.tsv', sep='\t', index=False)

In [19]:
print(len(tracks))
print(len(users))
print(len(listening_events))

173788
9455
4931626


# Create KG files

In [20]:
#csv.field_size_limit(sys.maxsize)

## Entities

- user
- artist
- album
- track
- genre

**Additional information:**
- gender
- country

#### Users

In [21]:
if not os.path.exists('lfm/intermediate_kg/entities'):
    os.makedirs('lfm/intermediate_kg/entities')

In [22]:
path = 'lfm'
path_prefix = 'lfm/intermediate_kg'
read_path_prefix = 'lfm/cleaned'
write_path_prefix = 'lfm/intermediate_kg'

In [23]:
# user_id, country, age, gender, creation_time
with open("%s/users.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s/users.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            f.writelines("u" + entry[0] + "\t" + entry[1] + "\t" + entry[2] + "\t" + entry[3] + "\n")
            
time.sleep(0.5)            

print(header)
print("done.")

9456it [00:00, 814050.18it/s]


['user_id', 'country', 'age', 'gender', 'creation_time']
done.


#### Artists

In [24]:
# artist_id, artist_name
#with open("data/artists.tsv", "r", encoding='utf-8') as data, open("data/kg/artists.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("a" + entry[0] + "\t" + str(entry[1].encode("utf-8")) + "\n")
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")

#### Albums

In [25]:
# album_id, album_name, artist_name
#with open("data/albums.tsv", "r", encoding='utf-8') as data, open("data/kg/albums.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("b" + entry[0] + "\t" + str(entry[1].encode("utf-8")) +  "\n")
#            
#time.sleep(0.5)
#
#print(header)
#print("done.")

#### Tracks

In [26]:
# track_id, artist_name, track_name
track_complete_dict = {}
counter = 0
with open("%s/tracks.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s/tracks.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            try:
                f.writelines("t" + entry[0] + "\t" + str(entry[2].encode("utf-8")) + "\n")
                track_id = "t" + entry[0]
                if track_id not in track_complete_dict:
                    track_complete_dict[track_id] = 0
            except:
                f.writelines("t" + entry[0] + "\t" + "???" + "\n")
                counter += 1
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

173789it [00:00, 526973.45it/s]


['track_id', 'artist', 'track']
done.
401


In [27]:
print(len(list(track_complete_dict.keys())))

173387


#### Genres

In [28]:
#counter = 0
#with open("data/micro_genre_tags.json", "r", encoding='utf-8') as data, open("data/kg/genres.txt", "w") as f:
#    for line in tqdm(data):
#        try:
#            entry = json.loads(line)
#            track_name = entry['_id']['track']
#            artist_name = entry['_id']['artist']
#            main_genre = list(entry['tags'].keys())[0]
#            genres = ",".join(list(entry['tags'].keys()))
#
#            f.writelines(track_name + "\t" + artist_name + "\t" + main_genre + "\t" + genres + "\n")
#        except:
#            counter += 1
#            continue
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")
#print(counter)

#### Gender & Country

In [29]:
gdict = {}
cdict = {}

gcounter = 0
ccounter = 0

with open("%s/users.txt" % write_path_prefix, "r") as data,\
open("%s/entities/gender.txt"  % (path_prefix), "w") as f,\
open("%s/entities/country.txt" % (path_prefix), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "" or entry[3] in gdict:
            gcounter += 1
        else:
            gdict[entry[3]] = 0
            f.writelines(entry[3] + "\n")
            
        if entry[1] == "" or entry[1] in cdict:
            ccounter += 1
        else:
            cdict[entry[1]] = 0
            f2.writelines(entry[1] + "\n")
            
time.sleep(0.5)
            
print("done.")
print(gcounter)
print(ccounter)

9455it [00:00, 946719.77it/s]


done.
9453
9331


#### Country

In [30]:
# DONE

## Relations

- **listened_to:** _user_ listened_to _track_
- **in_album:** _track_ in_album _album_
- **created_by:** _track_ created_by _artist_
- **has_genre**: _track_ has_genre _genre_

**Additional information:**
- **has_micro_genre**: _track_ has_micro_genre _genre_ (**or** instead of has_genre)
- **has_gender**: _user_ has_gender _gender_
- **lives_in**: _user_ lives_in _country_

In [31]:
if not os.path.exists('lfm/intermediate_kg/relations'):
    os.makedirs('lfm/intermediate_kg/relations')

In [32]:
# check genres.txt --> clean track and artist names, we filter it with that list?
# create new dict mit track names, artist names and look up for all the relations

# track_name, artist_name, main_genre, micro_genre

#track_dict = {}

#with open("data/kg/genres.txt", "r") as data: #, open("data/kg/tracks.txt", "w") as f:
#    datareader = csv.reader(data)
    
#    for i, row in tqdm(enumerate(datareader)):
#        try:
#            helper = row[0].split("\t")
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = row[1:]
#        except:
#            row = ", ".join(row)
#            helper = row.split("\t")
            
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = helper[3].split(",")[1:]
#            continue
        
#        key = track_name # + " || " + artist_name
        
#        if key not in track_dict:
#            track_dict[key] = {}
#            track_dict[key]["artist"] = artist_name
#            track_dict[key]["main_genre"] = main_genre
#            track_dict[key]["micro_genre"] = micro_genre
        
#print("saving...")

#joblib.dump(track_dict, "data/kg/track_dict.pkl")

#print("done.")

#### listened_to

In [33]:
# user_id, track_id, album_id, timestamp
counter = 0
counts_dict = {}
track_dict = {}
with open("%s/listening_events.tsv" % (read_path_prefix), "r") as data, open("%s/relations/listened_to.txt" % (path_prefix), "w") as f:
    datareader = csv.reader(data)

    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")

            track_id = "t" + entry[1]
            if track_id not in track_dict:
                track_dict[track_id] = 0

            key = "u" + entry[0] + ", " + "listened_to" + ", " + "t" + entry[1]
            if key not in counts_dict:
                counts_dict[key] = 0
                f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            else:
                counts_dict[key] += 1

            #try:
            #    f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            #except:
            #    counter += 1
            #    continue

time.sleep(0.5)

print(header)
print("done.")
# print(counter)

4931627it [00:10, 449507.31it/s]


['user_id', 'track_id', 'album_id', 'timestamp']
done.


In [34]:
os.remove('lfm/intermediate_kg/relations/listened_to.txt')

In [35]:
#print(len(list(track_dict.keys())))

#### created_by & has_genre & has_micro_genre & in_album

In [36]:
counter = 0

artist_dict = {}
album_dict = {}
type_dict = {}
genre_dict = {}
check_dict = {}

with open("%s/micro_genre_tags_new.json" % path, "r", encoding='utf-8') as data,\
open("%s/relations/has_genre.txt" % (path_prefix), "w", encoding='utf-8') as f1,\
open("%s/relations/has_micro_genre.txt" % (path_prefix), "w", encoding='utf-8') as f2,\
open("%s/relations/in_album.txt" % (path_prefix), "w", encoding='utf-8') as f3,\
open("%s/relations/created_by.txt" % (path_prefix), "w", encoding='utf-8') as f4,\
open("%s/entities/artist.txt" % (path_prefix), "w", encoding='utf-8') as e1,\
open("%s/entities/album.txt" % (path_prefix), "w", encoding='utf-8') as e2,\
open("%s/entities/artist_type.txt" % (path_prefix), "w", encoding='utf-8') as e3,\
open("%s/entities/genre.txt" % (path_prefix), "w", encoding='utf-8') as e4,\
open("%s/entities/track.txt" % (path_prefix), "w", encoding='utf-8') as t: #?????????
    for line in tqdm(data):
        entry = json.loads(line)
        #print(entry)
        track_id = str(entry['_id'])
        track_name = str(entry['track']['track']) #?????????
        main_genre = str(list(entry['tags'].keys())[0])
        micro_genre = list(entry['tags'].keys())[1:]
        
        artist_id = str(entry['artist']['artist_id'])
        artist_name = str(entry['artist']['artist'])
        artist_gender = str(entry['artist']['gender'])
        artist_type = str(entry['artist']['type'])
        
        album_id = str(entry['album']['album_id'])
        album_name = str(entry['album']['album'])
        
        check = "t" + track_id
        if check in track_dict:
            if check not in check_dict: #?????????
                check_dict[check] = 0
                track_dict[check] = track_name
                t.writelines(check + "\t" + track_name + "\n")
                
            if artist_id not in artist_dict:
                artist_dict[artist_id] = 0
                e1.writelines("a" + artist_id + "\t" + artist_name + "\t" + artist_gender + "\t" + artist_type + "\n")

            if album_id not in album_dict:
                album_dict[album_id] = 0
                e2.writelines("b" + album_id + "\t" + album_name + "\n")

            if artist_type not in type_dict:
                type_dict[artist_type] = 0
                e3.writelines(artist_type + "\n")

            if main_genre not in genre_dict:
                genre_dict[main_genre] = 0
                e4.writelines(main_genre + "\n")

            for micro in micro_genre:
                m = str(micro)
                if m not in genre_dict:
                    genre_dict[m] = 0
                    e4.writelines(m + "\n")

            created_by = "t" + track_id + "\t" + "created_by" + "\t" + "a" + artist_id + "\n"
            f4.writelines(created_by)
            #print(created_by)
            has_genre = "t" + track_id + "\t" + "has_genre" + "\t" + main_genre + "\n"
            f1.writelines(has_genre)
            #print(has_genre)
            for micro in micro_genre:
                has_micro_genre = "t" + track_id + "\t" + "has_micro_genre" + "\t" + micro + "\n"
                f2.writelines(has_micro_genre)
            #    print(has_micro_genre)
            in_album = "t" + track_id + "\t" + "in_album" + "\t" + "b" + album_id + "\n"
            f3.writelines(in_album)
            #print(in_album)
        else:
            counter += 1
            
time.sleep(0.5)            

print("done.")
print(counter)

3602111it [00:14, 246801.09it/s]


done.
3490533


In [37]:
print(len(list(check_dict.keys())))

111578


In [38]:
with open("%s/entities/track_new.txt" % (path_prefix), "w", encoding='utf-8') as t:
    for key, value in tqdm(track_dict.items()):
        track_id = key
        track_name = value
        
        if track_name == 0:
            track_name = "???"
            
        t.writelines(track_id + "\t" + track_name + "\n")
        
time.sleep(0.5)
        
print("done.")

100%|██████████| 173788/173788 [00:00<00:00, 980652.12it/s]


done.


#### in_album

In [39]:
# DONE

#### has_genre

In [40]:
# DONE

#### has_micro_genre

In [41]:
# DONE

#### has_gender & lives_in

In [42]:
# user_id, country, age, gender, creation_time
gcounter = 0
ccounter = 0
with open("%s/users.txt" % write_path_prefix, "r") as data,\
open("%s/relations/has_gender.txt" % (path_prefix), "w") as f,\
open("%s/relations/lives_in.txt" % (path_prefix), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "":
            gcounter += 1
        else:
            f.writelines(entry[0] + "\t" + "has_gender" + "\t" + entry[3] + "\n")
            
        if entry[1] == "":
            ccounter += 1
        else:
            f2.writelines(entry[0] + "\t" + "lives_in" + "\t" + entry[1] + "\n")
            
time.sleep(0.5)
print("done.")
print(gcounter)
print(ccounter)

9455it [00:00, 488749.62it/s]


done.
0
353


#### lives_in

In [43]:
# DONE

## Create KG File

In [44]:
directory = '%s/relations' % path_prefix
 
with open('%s/kg.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

lives_in.txt: 9102it [00:00, 889233.09it/s]
in_album.txt: 111578it [00:00, 734956.00it/s]
has_micro_genre.txt: 667024it [00:01, 632174.05it/s]
created_by.txt: 111578it [00:00, 737339.06it/s]
has_gender.txt: 9455it [00:00, 1127616.49it/s]
has_genre.txt: 111578it [00:00, 728063.68it/s]


In [45]:
directory = '%s/relations' % path_prefix
 
with open('%s/kg_no_gender.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        if filename == 'has_gender.txt':
            continue
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

lives_in.txt: 9102it [00:00, 1114741.58it/s]
in_album.txt: 111578it [00:00, 794228.60it/s]
has_micro_genre.txt: 667024it [00:01, 634161.14it/s]
created_by.txt: 111578it [00:00, 737804.04it/s]
has_genre.txt: 111578it [00:00, 735411.04it/s]


## Map knowledge Graph Files for further processing

### i2kg

In [46]:
if not os.path.exists('lfm/kg'):
    os.makedirs('lfm/kg')

In [47]:
tracks = pd.read_csv('lfm/intermediate_kg/tracks.txt', sep='\t', names=['pid', 'track_name'])
tracks.insert(0, 'id', tracks['pid'].str.replace('t', ''))
tracks = tracks.drop(['track_name'], axis=1)

In [48]:
tracks.head()

Unnamed: 0,id,pid
0,812,t812
1,1003,t1003
2,4774,t4774
3,5100,t5100
4,9933,t9933


In [49]:
tracks.to_csv('lfm/kg/i2kg_map.tsv', sep='\t', index=False)

### r_map

In [50]:
kg = pd.read_csv('lfm/intermediate_kg/kg.txt', sep='\t', names=['head', 'relation', 'tail'])

In [51]:
kg.head()

Unnamed: 0,head,relation,tail
0,u2,lives_in,UK
1,u14,lives_in,UK
2,u15,lives_in,US
3,u36,lives_in,UK
4,u42,lives_in,RU


In [52]:
relations = kg.relation.unique()

In [53]:
range(10)

range(0, 10)

In [54]:
relation_map_df = pd.DataFrame({'remap_id': range(len(relations)), 'org_id': relations})

In [55]:
relation_map_df

Unnamed: 0,remap_id,org_id
0,0,lives_in
1,1,in_album
2,2,has_micro_genre
3,3,created_by
4,4,has_gender
5,5,has_genre


In [56]:
relation_map_df.to_csv('lfm/kg/r_map.tsv', sep='\t', index=False)

### e_map

In [57]:
entities = np.unique(np.concatenate([kg['head'].unique(), kg['tail'].unique()], axis=0))

In [58]:
len(entities)

184059

In [59]:
trackids = tracks.pid.unique()

In [60]:
len(tracks)

173788

In [61]:
#count = 0
#for entity in entities:
#    if entity in trackids:
#        count += 1
#count

In [62]:
entities_map_df = pd.DataFrame({'new_id': range(len(entities)), 'raw_dataset_id': entities})

In [63]:
entities_map_df

Unnamed: 0,new_id,raw_dataset_id
0,0,21st century classical
1,1,8-bit
2,2,8d
3,3,AD
4,4,AE
...,...,...
184054,184054,zeuhl
184055,184055,zither
184056,184056,zolo
184057,184057,zouk


In [64]:
entities_map_df.to_csv('lfm/kg/e_map.tsv', sep='\t', index=False)

### kg

In [94]:
relation_map_df

Unnamed: 0,remap_id,org_id
0,0,lives_in
1,1,in_album
2,2,has_micro_genre
3,3,created_by
4,4,has_gender
5,5,has_genre


In [66]:
kg.relation = kg.relation.map({'lives_in': 0, 'in_album': 1, 'has_micro_genre': 2, 'created_by': 3, 'has_gender': 4, 'has_genre': 5})
kg.relation = kg.relation.astype('int32')

In [67]:
kg = kg.rename(columns={'head': 'entity_head', 'tail': 'entity_tail'})

In [68]:
kg

Unnamed: 0,entity_head,relation,entity_tail
0,u2,0,UK
1,u14,0,UK
2,u15,0,US
3,u36,0,UK
4,u42,0,RU
...,...,...,...
1020310,t44269170,5,hardstyle
1020311,t47732928,5,pop
1020312,t10197045,5,flamenco
1020313,t18331145,5,alternative rock


In [69]:
kg.to_csv('lfm/kg/kg.tsv', index=False, sep='\t')

## Now convert kg and files to the standard format presented in the paper

In [70]:
if not os.path.exists('lfm/preprocessed'):
    os.makedirs('lfm/preprocessed')

#### Users

In [71]:
users = pd.read_csv('lfm/cleaned/users.tsv', sep='\t')
users.head()

Unnamed: 0,user_id,country,age,gender,creation_time
0,2,UK,35,m,2002-10-29 01:00:00
1,14,UK,48,m,2003-02-18 21:44:13
2,15,US,28,m,2003-02-24 03:30:33
3,36,UK,29,m,2003-03-27 12:17:12
4,42,RU,30,m,2003-04-10 01:07:48


In [72]:
users = users.drop(['country', 'age', 'gender','creation_time'], axis=1)
users.insert(0, 'new_id', range(users.shape[0]))
users.to_csv('lfm/preprocessed/users.txt', header=["new_id", "raw_dataset_id"], index=False, sep='\t', mode='w+')

In [73]:
user_id2new_id = dict(zip(users["user_id"], users["new_id"]))

#### Tracks

In [74]:
tracks = pd.read_csv('lfm/cleaned/tracks.tsv', sep='\t')
tracks.head()

Unnamed: 0,track_id,artist,track
0,812,Billie Eilish,!!!!!!!
1,1003,Dillinger Four,!!Noble Stabbings!!
2,4774,The Gaslight Anthem,45
3,5100,Barns Courtney,99
4,9933,Theo Katzman,Best


In [75]:
tracks = tracks.drop(['artist', 'track'], axis=1)
tracks.insert(0, 'new_id', range(tracks.shape[0])) #Create a new incremental ID

In [76]:
tracks.to_csv('lfm/preprocessed/products.txt', header=["new_id", "raw_dataset_id"], index=False, sep='\t', mode='w+')

In [77]:
song_id2new_id = dict(zip(tracks["track_id"], tracks['new_id']))

#### Ratings

In [78]:
interactions = pd.read_csv('lfm/cleaned/listening_events.tsv', sep='\t')
interactions = interactions.drop(['album_id'], axis=1)
interactions.head()

Unnamed: 0,user_id,track_id,timestamp
0,16026,40012596,1577833201
1,42410,14250568,1577833201
2,39972,33291229,1577833201
3,7744,18394900,1577833202
4,14807,21889387,1577833203


In [79]:
interactions.insert(2, 'rating', np.ones(len(interactions)))
interactions.rating = interactions.rating.astype('int32')

In [80]:
interactions["user_id"] = interactions['user_id'].map(user_id2new_id)
interactions["track_id"] = interactions['track_id'].map(song_id2new_id)

In [81]:
interactions.head()

Unnamed: 0,user_id,track_id,rating,timestamp
0,3629,143241,1,1577833201
1,6820,44969,1,1577833201
2,6678,116353,1,1577833201
3,2027,62386,1,1577833202
4,3419,75699,1,1577833203


In [82]:
interactions.to_csv('lfm/preprocessed/ratings.txt', header=["uid", "pid", "rating", "timestamp"], index=False, sep='\t', mode='w+')

### KG

Set Tracks to be the relation heads and remove entity2entity and song2song relations

In [83]:
songs_to_kg_df = pd.read_csv('lfm/kg/i2kg_map.tsv', sep='\t')
kg_df = pd.read_csv('lfm/kg/kg.tsv', sep='\t')

In [84]:
songs_to_kg_df.head()

Unnamed: 0,id,pid
0,812,t812
1,1003,t1003
2,4774,t4774
3,5100,t5100
4,9933,t9933


In [85]:
mask = kg_df['entity_tail'].isin(songs_to_kg_df.pid) \
        & ~kg_df['entity_head'].isin(songs_to_kg_df.pid)
kg_df.loc[mask, ['entity_head', 'entity_tail']] = \
    (kg_df.loc[mask, ['entity_tail', 'entity_head']].values)

In [86]:
n_of_triplets = kg_df.shape[0]
kg_df = kg_df[(kg_df['entity_head'].isin(songs_to_kg_df.pid) & ~kg_df['entity_tail'].isin(songs_to_kg_df.pid))]
display(kg_df.head(5))
print(f"Number of triplets before: {n_of_triplets}")
print(f"Number of triplets after: {kg_df.shape[0]}")

Unnamed: 0,entity_head,relation,entity_tail
9102,t31469731,1,b4202003
9103,t11520441,1,b14810089
9104,t36346257,1,b13466481
9105,t13847831,1,b20080443
9106,t24342900,1,b17789042


Number of triplets before: 1020315
Number of triplets after: 1001758


In [87]:
len(kg_df['relation'].unique())

4

In [88]:
v = kg_df[['relation']]
n_of_triplets = kg_df.shape[0]
kg_df = kg_df[v.replace(v.apply(pd.Series.value_counts)).gt(100).all(1)]
display(kg_df.head(5))
print(f"Number of triplets before: {n_of_triplets}")
print(f"Number of triplets after: {kg_df.shape[0]}")

Unnamed: 0,entity_head,relation,entity_tail
9102,t31469731,1,b4202003
9103,t11520441,1,b14810089
9104,t36346257,1,b13466481
9105,t13847831,1,b20080443
9106,t24342900,1,b17789042


Number of triplets before: 1001758
Number of triplets after: 1001758


Store all other kg files in a new format

In [89]:
relations_map = pd.read_csv('lfm/kg/r_map.tsv', sep='\t')
relations_map.to_csv('lfm/preprocessed/r_map.txt', header=['relation_id', 'relation_url'], index=False, sep='\t', mode='w+')

In [90]:
entities_map = pd.read_csv('lfm/kg/e_map.tsv', sep='\t')
entities_map.to_csv('lfm/preprocessed/e_map.txt', header=['entity_id', 'entity_url'], index=False, sep='\t', mode='w+')

In [91]:
songs_to_kg_df.to_csv('lfm/preprocessed/i2kg_map.txt', header=["dataset_id", 'entity_id'], index=False, sep='\t', mode='w+')

In [92]:
cols = ["entity_head", "entity_tail", "relation"]
kg_df = kg_df[cols]
display(kg_df.head(5))
kg_df.to_csv('lfm/preprocessed/kg_final.txt', header=["entity_head", "entity_tail", 'relation'],
             index=False, sep='\t', mode='w+')

Unnamed: 0,entity_head,entity_tail,relation
9102,t31469731,b4202003,1
9103,t11520441,b14810089,1
9104,t36346257,b13466481,1
9105,t13847831,b20080443,1
9106,t24342900,b17789042,1


# Create the dataset files for RecBole

### Dataset structure
Recbole knowledge aware datasets require three files: .inter, .g, .link files. 

| File | Description |
|------|-------------|
|.inter|User-Item interaction|
|.kg| head, relation, tail|
|.link|item_id to entity_id|

## Create atomic files

 Add type info to kg generated by David

In [93]:
dataset = name
path = 'data/lfm'
kg_path = '%s/intermediate_kg/%s' % (path, dataset)
prefixed_path = '%s/%s' % (path,dataset)

NameError: name 'name' is not defined

In [None]:
kg = pd.read_csv('%s_kg.txt' % kg_path, sep='\t', names=['head_id:token', 'relation_id:token', 'tail_id:token'])

In [None]:
kg.head()

In [None]:
kg['relation_id:token'].unique()

In [None]:
kg_no_listen_events = kg[kg['relation_id:token'] != 'listened_to']
kg_no_listen_events.to_csv('data/rb_lfm/rb_lfm.kg', sep='\t', index=False)

In [None]:
users = pd.read_csv('%s_users.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'country:token', 'age:token', 'gender:token', 'creation_time:token'])
users.to_csv('data/rb_lfm/rb_lfm.user', sep='\t', index=False)

In [None]:
len(users)

In [None]:
items = pd.read_csv('%s_tracks.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['item_id:token', 'artist:token', 'track:token'])
items.to_csv('data/rb_lfm/rb_lfm.item', sep='\t', index=False)
                    
track_ids = pd.DataFrame(items['item_id:token'])
track_ids['entity_id:token'] = 't' + track_ids['item_id:token'].astype(str)
track_ids.to_csv('data/rb_lfm/rb_lfm.link', sep='\t', index=False)

In [None]:
listening_events = pd.read_csv('%s_listening_events.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'item_id:token', 'album_id:token', 'timestamp:token'])
listening_events.to_csv('data/rb_lfm/rb_lfm.inter', sep='\t', index=False)