# Following the guide to import the data as per [this link](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems)

## Importing Relevant Packages

In [3]:
import pandas as pd
from urllib.request import urlretrieve
import zipfile
import os

## Extracting the files from the Zip Folder

In [4]:
# Download the HetRec 2011 Last.fm archive into the working directory.
urlretrieve("http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip", "lastfm.zip")

# Extract inside a context manager so the archive handle is always closed,
# even if extraction raises. `zip_ref` remains bound (to the closed archive)
# afterwards, so a later `zip_ref.close()` is still valid and does nothing.
with zipfile.ZipFile('lastfm.zip', "r") as zip_ref:
    zip_ref.extractall()

## Removing the zipped folder and the unneeded timestamps file

In [7]:
# Release the archive handle before deleting the file it points to.
zip_ref.close()

zip_file = 'lastfm.zip'
unnecessary_timestamp_file = 'user_taggedartists-timestamps.dat'

# Delete the downloaded archive and the timestamps file, in that order.
for leftover_path in (zip_file, unnecessary_timestamp_file):
    os.remove(leftover_path)

## Cleaning Some of the Files

In [14]:
# Names of the DataFrame variables to create, one per extracted data file.
dataframe_names = ['user_friends', 'user_taggedartists', 'artists', 'tags', 'user_artists']

# Each DataFrame is loaded from the file called "<name>.dat", in the same order.
file_names = [name + '.dat' for name in dataframe_names]

In [15]:
# Read each .dat file and bind the resulting DataFrame to a variable whose
# name is the matching entry of `dataframe_names`.
for frame_name, dat_path in zip(dataframe_names, file_names):
    # Only the tags file is decoded as latin-1; every other file is read with
    # pandas' default encoding.
    extra_options = {"encoding": "latin-1"} if frame_name == 'tags' else {}
    vars()[frame_name] = pd.read_table(dat_path, sep="\t", **extra_options)

## Re-indexing the User and Artist IDs

In [16]:
# Map every original user ID to a consecutive index starting at 0, numbered in
# order of first appearance in user_artists.userID.
user_id_mapping = {
    original_id: position
    for position, original_id in enumerate(user_artists.userID.unique())
}

# Same scheme for artists, numbered in order of first appearance in artists.id.
artist_id_mapping = {
    original_id: position
    for position, original_id in enumerate(artists.id.unique())
}

In [21]:
def _remap_ids(column, id_mapping):
    """Return *column* with every value translated through *id_mapping*.

    Values that have no entry in *id_mapping* are kept unchanged and the
    result has the same dtype as *column*, matching what
    ``DataFrame.replace`` produced for these columns.
    """
    remapped = column.map(id_mapping)
    # map() yields NaN (and a float column) for IDs missing from the mapping;
    # put the original value back in those positions and restore the dtype.
    return remapped.fillna(column).astype(column.dtype)


# Series.map does one dictionary lookup per row, which avoids handing
# DataFrame.replace a dictionary with thousands of entries.
# NOTE(review): any ID absent from its mapping is left as the *old* ID and
# could then coincide with a freshly assigned one -- confirm that every
# artistID in user_artists / user_taggedartists also appears in artists.id.
user_artists = user_artists.assign(
    userID=_remap_ids(user_artists["userID"], user_id_mapping),
    artistID=_remap_ids(user_artists["artistID"], artist_id_mapping),
)
artists = artists.assign(id=_remap_ids(artists["id"], artist_id_mapping))
user_friends = user_friends.assign(
    userID=_remap_ids(user_friends["userID"], user_id_mapping),
    friendID=_remap_ids(user_friends["friendID"], user_id_mapping),
)
user_taggedartists = user_taggedartists.assign(
    userID=_remap_ids(user_taggedartists["userID"], user_id_mapping),
    artistID=_remap_ids(user_taggedartists["artistID"], artist_id_mapping),
)

## Deleting the Old Datasets

In [None]:
# Paths of the original .dat files that are deleted below.
user_artists_file = 'user_artists.dat'
artists_file = 'artists.dat'
user_friends_file = 'user_friends.dat'
user_taggedartists_file = 'user_taggedartists.dat'

# Remove them one by one, in the same order as they are listed above.
for stale_path in (
    user_artists_file,
    artists_file,
    user_friends_file,
    user_taggedartists_file,
):
    os.remove(stale_path)

## Exporting the Cleaned Datasets

In [27]:
# Write each re-indexed DataFrame to a CSV file, omitting the row index.
for csv_path, cleaned_frame in (
    ('user_artists.csv', user_artists),
    ('artists.csv', artists),
    ('user_friends.csv', user_friends),
    ('user_taggedartists.csv', user_taggedartists),
):
    cleaned_frame.to_csv(csv_path, index=False)