# Clean Dataset and create subset for KG creation

In [1]:
import pandas as pd
import numpy as np
import csv
import time
import sys
import json
import joblib
import os

from tqdm import tqdm

In [2]:
users = pd.read_csv('lfm/users.tsv', sep='\t')
tracks = pd.read_csv('lfm/tracks.tsv', sep='\t')
listening_events = pd.read_csv('lfm/listening_events.tsv', sep='\t')
albums = pd.read_csv('lfm/albums.tsv', sep='\t')

In [3]:
len(tracks)

4079421

In [4]:
len(listening_events)

30357786

## Clean Dataset

<b> Remove users which gender is not defined and whose age is not element of [10;90] </b>

In [5]:
l = len(users)
users = users[users.gender.isin(['m', 'f']) & (users.age >= 10) & (users.age <= 95)]
print('Removed %s users' % (len(users) - l))

Removed -5163 users


<b> Remove now unassigned listening events </b>

In [6]:
l = len(listening_events)
listening_events = listening_events[listening_events.user_id.isin(users.user_id)]
print('Removed %s listening events' % (len(listening_events) - l))

Removed -9725559 listening events


<b> remove duplicate listening events </b>

In [7]:
listening_events = listening_events.sort_values('timestamp')
old_length_le = len(listening_events)
listening_events = listening_events.drop_duplicates(subset=['user_id', 'track_id'])
print(len(listening_events)-old_length_le)

-10636671


<b> 10-core filtering </b>

In [9]:
n_core = 10
while True:
    start_number = len(listening_events)

    # Item pass
    item_counts = listening_events.track_id.value_counts()
    item_above = set(item_counts[item_counts >= n_core].index)
    listening_events = listening_events[listening_events.track_id.isin(item_above)]
    print('Records after item pass: ', len(listening_events))

    # User pass
    user_counts = listening_events.user_id.value_counts()
    user_above = set(user_counts[user_counts >= n_core].index)
    listening_events = listening_events[listening_events.user_id.isin(user_above)]
    print('Records after user pass: ', len(listening_events))

    if len(listening_events) == start_number:
        print('Exiting...')
        break

Records after item pass:  4936768
Records after user pass:  4935309
Records after item pass:  4934458
Records after user pass:  4934458
Records after item pass:  4934458
Records after user pass:  4934458
Exiting...


In [10]:
users = users[users.user_id.isin(set(listening_events.user_id))]
tracks = tracks[tracks.track_id.isin(set(listening_events.track_id))]

In [11]:
users.sample(1)

Unnamed: 0,user_id,country,age,gender,creation_time
5239,16856,FI,22,m,2007-11-28 13:36:30


In [14]:
if not os.path.exists('lfm/cleaned'):
    os.makedirs('lfm/cleaned')

In [15]:
tracks.to_csv('lfm/cleaned/tracks.tsv', sep='\t', index=False)
listening_events.to_csv('lfm/cleaned/listening_events.tsv', sep='\t', index=False)
users.to_csv('lfm/cleaned/users.tsv', sep='\t', index=False)
albums.to_csv('lfm/cleaned/albums.tsv', sep='\t', index=False)

In [17]:
print(len(tracks))
print(len(users))
print(len(listening_events))

173788
9455
4934458


# Create KG files

In [78]:
#csv.field_size_limit(sys.maxsize)

## Entities

- user
- artist
- album
- track
- genre

**Additional information:**
- gender
- country

#### Users

In [33]:
if not os.path.exists('lfm/intermediate_kg/entities'):
    os.makedirs('lfm/intermediate_kg/entities')

In [22]:
path = 'lfm'
path_prefix = 'lfm/intermediate_kg'
read_path_prefix = 'lfm/cleaned'
write_path_prefix = 'lfm/intermediate_kg'

In [28]:
# user_id, country, age, gender, creation_time
with open("%s/users.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s/users.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            f.writelines("u" + entry[0] + "\t" + entry[1] + "\t" + entry[2] + "\t" + entry[3] + "\n")
            
time.sleep(0.5)            

print(header)
print("done.")

9456it [00:00, 491691.84it/s]


['user_id', 'country', 'age', 'gender', 'creation_time']
done.


#### Artists

In [81]:
# artist_id, artist_name
#with open("data/artists.tsv", "r", encoding='utf-8') as data, open("data/kg/artists.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("a" + entry[0] + "\t" + str(entry[1].encode("utf-8")) + "\n")
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")

#### Albums

In [82]:
# album_id, album_name, artist_name
#with open("data/albums.tsv", "r", encoding='utf-8') as data, open("data/kg/albums.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("b" + entry[0] + "\t" + str(entry[1].encode("utf-8")) +  "\n")
#            
#time.sleep(0.5)
#
#print(header)
#print("done.")

#### Tracks

In [29]:
# track_id, artist_name, track_name
track_complete_dict = {}
counter = 0
with open("%s/tracks.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s/tracks.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            try:
                f.writelines("t" + entry[0] + "\t" + str(entry[2].encode("utf-8")) + "\n")
                track_id = "t" + entry[0]
                if track_id not in track_complete_dict:
                    track_complete_dict[track_id] = 0
            except:
                f.writelines("t" + entry[0] + "\t" + "???" + "\n")
                counter += 1
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

173789it [00:00, 343705.08it/s]


['track_id', 'artist', 'track']
done.
401


In [84]:
print(len(list(track_complete_dict.keys())))

277863


#### Genres

In [85]:
#counter = 0
#with open("data/micro_genre_tags.json", "r", encoding='utf-8') as data, open("data/kg/genres.txt", "w") as f:
#    for line in tqdm(data):
#        try:
#            entry = json.loads(line)
#            track_name = entry['_id']['track']
#            artist_name = entry['_id']['artist']
#            main_genre = list(entry['tags'].keys())[0]
#            genres = ",".join(list(entry['tags'].keys()))
#
#            f.writelines(track_name + "\t" + artist_name + "\t" + main_genre + "\t" + genres + "\n")
#        except:
#            counter += 1
#            continue
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")
#print(counter)

#### Gender & Country

In [34]:
gdict = {}
cdict = {}

gcounter = 0
ccounter = 0

with open("%s/users.txt" % write_path_prefix, "r") as data,\
open("%s/entities/gender.txt"  % (path_prefix), "w") as f,\
open("%s/entities/country.txt" % (path_prefix), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "" or entry[3] in gdict:
            gcounter += 1
        else:
            gdict[entry[3]] = 0
            f.writelines(entry[3] + "\n")
            
        if entry[1] == "" or entry[1] in cdict:
            ccounter += 1
        else:
            cdict[entry[1]] = 0
            f2.writelines(entry[1] + "\n")
            
time.sleep(0.5)
            
print("done.")
print(gcounter)
print(ccounter)

9455it [00:00, 1146557.89it/s]


done.
9453
9331


#### Country

In [87]:
# DONE

## Relations

- **listened_to:** _user_ listened_to _track_
- **in_album:** _track_ in_album _album_
- **created_by:** _track_ created_by _artist_
- **has_genre**: _track_ has_genre _genre_

**Additional information:**
- **has_micro_genre**: _track_ has_micro_genre _genre_ (**or** instead of has_genre)
- **has_gender**: _user_ has_gender _gender_
- **lives_in**: _user_ lives_in _country_

In [37]:
if not os.path.exists('lfm/intermediate_kg/relations'):
    os.makedirs('lfm/intermediate_kg/relations')

In [88]:
# check genres.txt --> clean track and artist names, we filter it with that list?
# create new dict mit track names, artist names and look up for all the relations

# track_name, artist_name, main_genre, micro_genre

#track_dict = {}

#with open("data/kg/genres.txt", "r") as data: #, open("data/kg/tracks.txt", "w") as f:
#    datareader = csv.reader(data)
    
#    for i, row in tqdm(enumerate(datareader)):
#        try:
#            helper = row[0].split("\t")
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = row[1:]
#        except:
#            row = ", ".join(row)
#            helper = row.split("\t")
            
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = helper[3].split(",")[1:]
#            continue
        
#        key = track_name # + " || " + artist_name
        
#        if key not in track_dict:
#            track_dict[key] = {}
#            track_dict[key]["artist"] = artist_name
#            track_dict[key]["main_genre"] = main_genre
#            track_dict[key]["micro_genre"] = micro_genre
        
#print("saving...")

#joblib.dump(track_dict, "data/kg/track_dict.pkl")

#print("done.")

#### listened_to

In [38]:
# user_id, track_id, album_id, timestamp
counter = 0
counts_dict = {}
track_dict = {}
with open("%s/listening_events.tsv" % (read_path_prefix), "r") as data, open("%s/relations/listened_to.txt" % (path_prefix), "w") as f:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            
            track_id = "t" + entry[1]
            if track_id not in track_dict:
                track_dict[track_id] = 0
            
            key = "u" + entry[0] + ", " + "listened_to" + ", " + "t" + entry[1]
            if key not in counts_dict:
                counts_dict[key] = 0
                f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            else:
                counts_dict[key] += 1
                
            #try:
            #    f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            #except:
            #    counter += 1
            #    continue
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

4934459it [00:15, 315009.26it/s]


['user_id', 'track_id', 'album_id', 'timestamp']
done.
0


In [39]:
print(len(list(track_dict.keys())))

173899


#### created_by & has_genre & has_micro_genre & in_album

In [40]:
counter = 0

artist_dict = {}
album_dict = {}
type_dict = {}
genre_dict = {}
check_dict = {}

with open("%s/micro_genre_tags_new.json" % path, "r", encoding='utf-8') as data,\
open("%s/relations/has_genre.txt" % (path_prefix), "w", encoding='utf-8') as f1,\
open("%s/relations/has_micro_genre.txt" % (path_prefix), "w", encoding='utf-8') as f2,\
open("%s/relations/in_album.txt" % (path_prefix), "w", encoding='utf-8') as f3,\
open("%s/relations/created_by.txt" % (path_prefix), "w", encoding='utf-8') as f4,\
open("%s/entities/artist.txt" % (path_prefix), "w", encoding='utf-8') as e1,\
open("%s/entities/album.txt" % (path_prefix), "w", encoding='utf-8') as e2,\
open("%s/entities/artist_type.txt" % (path_prefix), "w", encoding='utf-8') as e3,\
open("%s/entities/genre.txt" % (path_prefix), "w", encoding='utf-8') as e4,\
open("%s/entities/track.txt" % (path_prefix), "w", encoding='utf-8') as t: #?????????
    for line in tqdm(data):
        entry = json.loads(line)
        #print(entry)
        track_id = str(entry['_id'])
        track_name = str(entry['track']['track']) #?????????
        main_genre = str(list(entry['tags'].keys())[0])
        micro_genre = list(entry['tags'].keys())[1:]
        
        artist_id = str(entry['artist']['artist_id'])
        artist_name = str(entry['artist']['artist'])
        artist_gender = str(entry['artist']['gender'])
        artist_type = str(entry['artist']['type'])
        
        album_id = str(entry['album']['album_id'])
        album_name = str(entry['album']['album'])
        
        check = "t" + track_id
        if check in track_dict:
            if check not in check_dict: #?????????
                check_dict[check] = 0
                track_dict[check] = track_name
                t.writelines(check + "\t" + track_name + "\n")
                
            if artist_id not in artist_dict:
                artist_dict[artist_id] = 0
                e1.writelines("a" + artist_id + "\t" + artist_name + "\t" + artist_gender + "\t" + artist_type + "\n")

            if album_id not in album_dict:
                album_dict[album_id] = 0
                e2.writelines("b" + album_id + "\t" + album_name + "\n")

            if artist_type not in type_dict:
                type_dict[artist_type] = 0
                e3.writelines(artist_type + "\n")

            if main_genre not in genre_dict:
                genre_dict[main_genre] = 0
                e4.writelines(main_genre + "\n")

            for micro in micro_genre:
                m = str(micro)
                if m not in genre_dict:
                    genre_dict[m] = 0
                    e4.writelines(m + "\n")

            created_by = "t" + track_id + "\t" + "created_by" + "\t" + "a" + artist_id + "\n"
            f4.writelines(created_by)
            #print(created_by)
            has_genre = "t" + track_id + "\t" + "has_genre" + "\t" + main_genre + "\n"
            f1.writelines(has_genre)
            #print(has_genre)
            for micro in micro_genre:
                has_micro_genre = "t" + track_id + "\t" + "has_micro_genre" + "\t" + micro + "\n"
                f2.writelines(has_micro_genre)
            #    print(has_micro_genre)
            in_album = "t" + track_id + "\t" + "in_album" + "\t" + "b" + album_id + "\n"
            f3.writelines(in_album)
            #print(in_album)
        else:
            counter += 1
            
time.sleep(0.5)            

print("done.")
print(counter)

3602111it [00:23, 150546.56it/s]


done.
3490463


In [92]:
print(len(list(check_dict.keys())))

158640


In [41]:
with open("%s/entities/track_new.txt" % (path_prefix), "w", encoding='utf-8') as t:
    for key, value in tqdm(track_dict.items()):
        track_id = key
        track_name = value
        
        if track_name == 0:
            track_name = "???"
            
        t.writelines(track_id + "\t" + track_name + "\n")
        
time.sleep(0.5)
        
print("done.")

100%|███████████████████████████████| 173899/173899 [00:00<00:00, 652053.66it/s]


done.


#### in_album

In [94]:
# DONE

#### has_genre

In [95]:
# DONE

#### has_micro_genre

In [96]:
# DONE

#### has_gender & lives_in

In [43]:
# user_id, country, age, gender, creation_time
gcounter = 0
ccounter = 0
with open("%s/users.txt" % write_path_prefix, "r") as data,\
open("%s/relations/has_gender.txt" % (path_prefix), "w") as f,\
open("%s/relations/lives_in.txt" % (path_prefix), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "":
            gcounter += 1
        else:
            f.writelines(entry[0] + "\t" + "has_gender" + "\t" + entry[3] + "\n")
            
        if entry[1] == "":
            ccounter += 1
        else:
            f2.writelines(entry[0] + "\t" + "lives_in" + "\t" + entry[1] + "\n")
            
time.sleep(0.5)
print("done.")
print(gcounter)
print(ccounter)

9455it [00:00, 336490.98it/s]


done.
0
353


#### lives_in

In [98]:
# DONE

## Create KG File

In [44]:
directory = '%s/relations' % path_prefix
 
with open('%s/kg.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

has_genre.txt: 111648it [00:00, 520346.30it/s]
has_gender.txt: 9455it [00:00, 688253.11it/s]
in_album.txt: 111648it [00:00, 534890.18it/s]
lives_in.txt: 9102it [00:00, 714716.00it/s]
listened_to.txt: 4934458it [00:08, 550610.24it/s]
has_micro_genre.txt: 667392it [00:01, 449547.98it/s]
created_by.txt: 111648it [00:00, 540225.29it/s]


In [45]:
directory = '%s/relations' % path_prefix
 
with open('%s/kg_no_gender.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        if filename == 'has_gender.txt':
            continue
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

has_genre.txt: 111648it [00:00, 529485.73it/s]
in_album.txt: 111648it [00:00, 561227.78it/s]
lives_in.txt: 9102it [00:00, 509679.92it/s]
listened_to.txt: 4934458it [00:09, 546895.21it/s]
has_micro_genre.txt: 667392it [00:01, 443452.33it/s]
created_by.txt: 111648it [00:00, 529549.80it/s]


# Create the dataset files for RecBole

### Dataset structure
Recbole knowledge aware datasets require three files: .inter, .g, .link files. 

| File | Description |
|------|-------------|
|.inter|User-Item interaction|
|.kg| head, relation, tail|
|.link|item_id to entity_id|

## Create atomic files

 Add type info to kg generated by David

In [101]:
dataset = name
path = 'data/lfm'
kg_path = '%s/intermediate_kg/%s' % (path, dataset)
prefixed_path = '%s/%s' % (path,dataset)

In [102]:
kg = pd.read_csv('%s_kg.txt' % kg_path, sep='\t', names=['head_id:token', 'relation_id:token', 'tail_id:token'])

In [103]:
kg.head()

Unnamed: 0,head_id:token,relation_id:token,tail_id:token
0,t36346257,has_micro_genre,indie pop
1,t36346257,has_micro_genre,rock
2,t36346257,has_micro_genre,singer-songwriter
3,t36346257,has_micro_genre,indie rock
4,t36346257,has_micro_genre,art pop


In [104]:
kg['relation_id:token'].unique()

array(['has_micro_genre', 'has_gender', 'listened_to', 'lives_in',
       'created_by', 'in_album', 'has_genre'], dtype=object)

In [105]:
kg_no_listen_events = kg[kg['relation_id:token'] != 'listened_to']
kg_no_listen_events.to_csv('data/rb_lfm/rb_lfm.kg', sep='\t', index=False)

In [106]:
users = pd.read_csv('%s_users.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'country:token', 'age:token', 'gender:token', 'creation_time:token'])
users.to_csv('data/rb_lfm/rb_lfm.user', sep='\t', index=False)

In [107]:
len(users)

9724

In [108]:
items = pd.read_csv('%s_tracks.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['item_id:token', 'artist:token', 'track:token'])
items.to_csv('data/rb_lfm/rb_lfm.item', sep='\t', index=False)
                    
track_ids = pd.DataFrame(items['item_id:token'])
track_ids['entity_id:token'] = 't' + track_ids['item_id:token'].astype(str)
track_ids.to_csv('data/rb_lfm/rb_lfm.link', sep='\t', index=False)

In [109]:
listening_events = pd.read_csv('%s_listening_events.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'item_id:token', 'album_id:token', 'timestamp:token'])
listening_events.to_csv('data/rb_lfm/rb_lfm.inter', sep='\t', index=False)