In [1]:
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np

In [2]:
def column_renaming(data, origin, modified):
    new_name = dict(zip(origin, modified))
    data = data.rename(columns=new_name)
    return data

In [3]:
song_att_raw  = pd.DataFrame.from_csv("./rawdata/song-attributes.txt", sep="\t", header=None)
genre_raw  = pd.DataFrame.from_csv("./rawdata/genre-hierarchy.txt", sep="\t", header=None)

## Modify Song_ID

Due to the requirement of SVD, we must sort the songs in the order of genre.
In this case, we are only using the highest level of genre.

In [4]:
origin = [0,1,2,3]
modified1 = ["song_id","album_id", "artist_id", "genre_id"]
modified2 = ["genre_id","parent_genre_id","level", "genre_name"]
song_att = song_att_raw.reset_index()
song_att = column_renaming(song_att, origin, modified1)
genre = column_renaming(genre_raw, origin, modified2)

In [5]:
song_att.head()

Unnamed: 0,song_id,album_id,artist_id,genre_id
0,0,12070,8490,0
1,1,19512,7975,134
2,2,18953,3492,0
3,3,695,2653,0
4,4,243,2282,0


In [6]:
genre.head()

Unnamed: 0_level_0,parent_genre_id,level,genre_name
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,Unknown
1,1,1,Electronic/Dance
2,1,2,Ambient
3,2,3,Ambient Dub
4,2,3,Ambient Tech


In [7]:
genre["highestlvl"]=genre.index
genre.head()

Unnamed: 0_level_0,parent_genre_id,level,genre_name,highestlvl
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,1,Unknown,0
1,1,1,Electronic/Dance,1
2,1,2,Ambient,2
3,2,3,Ambient Dub,3
4,2,3,Ambient Tech,4


In [8]:
genre.iloc[0]#["genre_id"]

parent_genre_id          0
level                    1
genre_name         Unknown
highestlvl               0
Name: 0, dtype: object

In [9]:
#turn should start from 0
def find_parent(dfrow, turn, genre):
    if (dfrow["level"]-turn) < 2:
        return dfrow["highestlvl"]
    else:
        return genre.iloc[dfrow["highestlvl"]]["parent_genre_id"]

In [10]:
for t in range(0,genre["level"].max()):
    genre["highestlvl"] = list(map(lambda x: find_parent(genre.iloc[x], t, genre), genre.index))
    #print(genre.head())

In [11]:
len(genre["highestlvl"].unique())

20

In [12]:
result = song_att.join(genre, on="genre_id", how="right")
result.sort_index().head()

Unnamed: 0,song_id,album_id,artist_id,genre_id,parent_genre_id,level,genre_name,highestlvl
0,0.0,12070.0,8490.0,0,0,1,Unknown,0
1,1.0,19512.0,7975.0,134,134,1,Rock,134
2,2.0,18953.0,3492.0,0,0,1,Unknown,0
3,3.0,695.0,2653.0,0,0,1,Unknown,0
4,4.0,243.0,2282.0,0,0,1,Unknown,0


In [13]:
song_ind = result[["highestlvl"]].sort_index()

In [14]:
song_ind = song_ind.sort_values("highestlvl")
song_ind = song_ind.reset_index()
song_ind = song_ind.rename(columns={"index":"song_id"})
song_ind.to_csv("./tables/song_back.csv")

In [15]:
song_ind.head()

Unnamed: 0,song_id,highestlvl
0,0,0
1,87782,0
2,87779,0
3,87778,0
4,87777,0


In [16]:
song_ind = song_ind.reset_index()
song_ind.set_index('song_id', inplace=True)
song_ind = song_ind.sort_index()
song_ind = song_ind.rename(columns={"index":"lvl_id"})
song_ind.to_csv("./tables/song_forth.csv")

In [17]:
song_ind.sort_index().head()

Unnamed: 0_level_0,lvl_id,highestlvl
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,129016,134
2,84325,0
3,84324,0
4,84323,0


## Build user and level ID table

In [4]:
#forth_ind = pd.DataFrame.from_csv("./tables/song_forth.csv", sep=",", index_col="song_id")
rawtrain = pd.DataFrame.from_csv("./rawdata/train_0.txt", sep="\t", header=None)
#forth_ind.head()

In [5]:
origin = [0,1,2]
modified = ["user_id", "song_id", "rating"]
train = rawtrain.reset_index()
train = column_renaming(train, origin, modified)
train.head()

Unnamed: 0,user_id,song_id,rating
0,0,166,5
1,0,2245,4
2,0,3637,4
3,0,5580,4
4,0,5859,4


In [6]:
origin = [0,1,2,3]
modified1 = ["song_id","album_id", "artist_id", "genre_id"]
#modified2 = ["genre_id","parent_genre_id","level", "genre_name"]
song_att = column_renaming(song_att_raw, origin, modified1) 
result = train.join(song_att, on="song_id", how="right")
result.sort_index().head()

Unnamed: 0,user_id,song_id,rating,album_id,artist_id,genre_id
0,0,166,5,5303,7231,0
1,0,2245,4,14290,3265,0
2,0,3637,4,15761,629,0
3,0,5580,4,8324,4761,0
4,0,5859,4,19671,4716,0


In [7]:
modified2 = ["genre_id","parent_genre_id","level", "genre_name"]
genre = column_renaming(genre_raw, origin, modified2)
result = result.join(genre, on="genre_id", how="right")
result.sort_index().head()

Unnamed: 0,user_id,song_id,rating,album_id,artist_id,genre_id,parent_genre_id,level,genre_name
0,0.0,166.0,5.0,5303.0,7231.0,0,0,1,Unknown
1,0.0,2245.0,4.0,14290.0,3265.0,0,0,1,Unknown
2,0.0,3637.0,4.0,15761.0,629.0,0,0,1,Unknown
3,0.0,5580.0,4.0,8324.0,4761.0,0,0,1,Unknown
4,0.0,5859.0,4.0,19671.0,4716.0,0,0,1,Unknown


In [8]:
result = result[["user_id","genre_id","rating"]]

In [9]:
result["user_id"].unique()

array([      0.,    2941.,    3476., ...,  107679.,  127442.,      nan])

In [10]:
np.where(result["user_id"].isnull())

(array([76344627, 76344628, 76344629, 76344630, 76344631, 76344632,
        76344633, 76344634, 76344635, 76344636, 76344637, 76344638,
        76344639, 76344640, 76344641, 76344642, 76344643, 76344644,
        76344645, 76344646, 76344647, 76344648, 76344649, 76344650,
        76344651, 76344652, 76344653, 76344654, 76344655, 76344656,
        76344657, 76344658, 76344659, 76344660, 76344661, 76344662,
        76344663, 76344664, 76344665, 76344666, 76344667, 76344668,
        76344669, 76344670, 76344671, 76344672, 76344673, 76344674,
        76344675, 76344676, 76344677, 76344678, 76344679, 76344680,
        76344681, 76344682, 76344683, 76344684, 76344685, 76344686,
        76344687, 76344688, 76344689, 76344690, 76344691, 76344692,
        76344693, 76344694, 76344695, 76344696, 76344697, 76344698,
        76344699, 76344700, 76344701, 76344702, 76344703, 76344704,
        76344705, 76344706, 76344707, 76344708, 76344709, 76344710,
        76344711, 76344712, 76344713, 76344714, 

In [11]:
result.iloc[76344627]

user_id     NaN
genre_id    2.0
rating      NaN
Name: 75619014, dtype: float64

In [12]:
print(len(result))
print(len(train))

76344785
76344627


In [13]:
result = result.dropna()
print(len(result))

76344627


In [14]:
result["user_id"] = result["user_id"].astype(int)
result.head()

Unnamed: 0,user_id,genre_id,rating
0,0,0,5.0
1094002,2941,0,5.0
1292795,3476,0,5.0
1378966,3748,0,1.0
1505164,4086,0,2.0


In [15]:
my_count = result.groupby(["user_id", "genre_id"]).size().reset_index(name="Count")

In [16]:
my_rating = result.groupby(["user_id", "genre_id"])['rating'].mean().reset_index(name="Rating")

In [17]:
my_rating.head()

Unnamed: 0,user_id,genre_id,Rating
0,0,0,4.25
1,0,134,4.0
2,0,135,5.0
3,1,0,3.203822
4,1,54,2.777778


In [18]:
user_cnt = 200000
genre_cnt = 216
svdtable = np.zeros((user_cnt, genre_cnt))

In [19]:
svdtable[19999][200]

0.0

In [28]:
for x in my_rating.index:
    dfrow = my_rating.iloc[x]
    svdtable[int(dfrow['user_id'])][int(dfrow['genre_id'])] = dfrow['Rating']

In [21]:
#def fillSVDTable(x):
#    dfrow = my_rating.iloc[x]
#    svdtable[int(dfrow['user_id'])][int(dfrow['genre_id'])] = dfrow['Rating']
#a = np.arange(0, len(my_rating), 1)
#map(lambda x: fillSVDTable(x), a)
#map(fillSVDTable, a)

In [29]:
svdtable[0][0]

4.25

In [31]:
np.savetxt("./tables/svdtable.txt", svdtable, delimiter="\t")

In [32]:
a_table = np.genfromtxt('./tables/svdtable.txt', delimiter='\t')

In [34]:
a_table[1][0]

3.2038216560509554

#### Some other tests

In [5]:
forth_ind.iloc[[1,1]]

Unnamed: 0_level_0,lvl_id,highestlvl
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,129016,134
1,129016,134


In [6]:
temp_pd = forth_ind.iloc[train["song_id"]]

In [7]:
temp_pd.head()

Unnamed: 0_level_0,lvl_id,highestlvl
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
166,84430,0
2245,83668,0
3637,85343,0
5580,80703,0
5859,79486,0


In [9]:
temp_pd = temp_pd.reset_index()
temp_pd.head()

In [12]:
train = train.assign(lvl_id=temp_pd["lvl_id"])

In [13]:
train.head()

Unnamed: 0,user_id,song_id,rating,lvl_id
0,0,166,5,84430
1,0,2245,4,83668
2,0,3637,4,85343
3,0,5580,4,80703
4,0,5859,4,79486
