In [3]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
import faiss
import joblib

### Загрузка данных

In [4]:
import gdown
url = 'https://drive.google.com/uc?id=1UgDwPShtfqq9QAk5GI5IX-2hhyErVbze'
table_name = "rec_test_assignment_playlist2track.csv"
gdown.download(url, table_name)

Downloading...
From: https://drive.google.com/uc?id=1UgDwPShtfqq9QAk5GI5IX-2hhyErVbze
To: /home/elias/zvuk/zvuk_test/rec_test_assignment_playlist2track.csv
100%|██████████| 98.6M/98.6M [00:02<00:00, 42.7MB/s]


'rec_test_assignment_playlist2track.csv'

 ### Чтение данных

In [5]:
df_full = pd.read_csv(table_name)

In [6]:
print(df_full.shape)
print(df_full.nunique())

(2000000, 3)
playlist_id    714818
track_id       169548
track_uri      169548
dtype: int64


 ### Сборка словарей для матчинга track_id и track_uri

In [7]:
df_tracks = df_full.drop(columns=['playlist_id'])
df_unique = df_tracks.drop_duplicates(['track_id'])
df_unique = df_unique.set_index('track_id')
unique = df_unique.to_dict()

id_to_track = unique['track_uri']
track_to_id = dict((v,k) for k,v in id_to_track.items())
joblib.dump(id_to_track, 'binary/id_to_track.pickle')
joblib.dump(track_to_id, 'binary/track_to_id.pickle')

['track_to_id.pickle']

 ### SVD

In [8]:
df = df_full.drop(columns=['track_uri'])

In [None]:
n_track = df['track_id'].unique().shape[0]
n_playlist = df['playlist_id'].unique().shape[0]
print('tracks: {}, playlists: {}'.format(n_track, n_playlist))
# создаём tracks-playlists матрицу
ratio = np.zeros((n_track, n_playlist))
for line in df.itertuples():
    ratio[line[2], line[1]] = 1

In [58]:
# определяем размерность матрицы треков
dimension = 100

In [59]:
%%time
# вычисление svd
u, s, vt = svds(ratio, k=dimension)

CPU times: user 13h 56min 42s, sys: 2h 1min 51s, total: 15h 58min 33s
Wall time: 17min 21s


In [61]:
# размерности выходных матриц
print(u.shape, s.shape, vt.shape)

(169548, 100) (100,) (100, 714818)


### Построение индекса в faiss

In [62]:
index = faiss.IndexFlat(dimension)   
index.add(u)          
print(index.ntotal)

169548


In [63]:
# проверка что для любого вектора ближайший он сам
for vec_id in range(5):
    vec_0 = index.reconstruct_batch(vec_id)
    D, I = index.search(vec_0, k=1) 
    assert I[0] == vec_id, 'created index is broken'

In [64]:
# запись
faiss.write_index(index, "binary/index_spotify_d100.bin")