In [1]:
import pickle

d = pickle.load(open('ml_latest-small_core_10_type_hete.pkl', 'rb'))

In [2]:
d.keys()

dict_keys(['unique_uids', 'num_uids', 'unique_iids', 'num_iids', 'unique_genres', 'num_genres', 'unique_years', 'num_years', 'unique_directors', 'num_directors', 'unique_actors', 'num_actors', 'unique_writers', 'num_writers', 'unique_tids', 'num_tids', 'num_nodes', 'num_node_types', 'e2nid_dict', 'nid2e_dict', 'rating_np', 'edge_index_nps', 'test_pos_unid_inid_map', 'neg_unid_inid_map', 'edge_type_dict', 'num_edge_types', 'item_nid_occs', 'types', 'num_nodes_dict', 'type_accs'])

In [3]:
import numpy as np

unique, counts = np.unique(d['edge_index_nps']['user2item'][0], return_counts=True)

In [4]:
np.std(counts)

175.9464800539701

In [5]:
import pandas as pd

In [6]:
movies = pd.read_csv('../raw/raw_movies.csv', sep=';').fillna('')
ratings = pd.read_csv('../raw/raw_ratings.csv', sep=';')
tagging = pd.read_csv('../raw/raw_tagging.csv', sep=';')

In [7]:
def reindex_df_mlsmall(movies, ratings, tagging):
    """

    Args:
        movies:
        ratings:
        tagging:
        genome_tagging:
        genome_tags:

    Returns:

    """
    # Reindex uid
    unique_uids = np.sort(ratings.uid.unique()).astype(np.int)
    uids = np.arange(unique_uids.shape[0]).astype(np.int)
    raw_uid2uid = {raw_uid: uid for raw_uid, uid in zip(unique_uids, uids)}
    ratings['uid'] = np.array([raw_uid2uid[raw_uid] for raw_uid in ratings.uid], dtype=np.int)
    tagging['uid'] = np.array([raw_uid2uid[raw_uid] for raw_uid in tagging.uid], dtype=np.int)

    # Reindex iid
    unique_iids = np.sort(movies.iid.unique()).astype(np.int)
    iids = np.arange(unique_iids.shape[0]).astype(np.int)
    raw_iid2iid = {raw_iid: iid for raw_iid, iid in zip(unique_iids, iids)}
    movies['iid'] = np.array([raw_iid2iid[raw_iid] for raw_iid in movies.iid], dtype=np.int)
    ratings['iid'] = np.array([raw_iid2iid[raw_iid] for raw_iid in ratings.iid], dtype=np.int)
    tagging['iid'] = np.array([raw_iid2iid[raw_iid] for raw_iid in tagging.iid], dtype=np.int)

    # Create tid
    unique_tags = np.sort(tagging.tag.unique()).astype(np.str)
    tids = np.arange(unique_tags.shape[0]).astype(np.int)
    tags = pd.DataFrame({'tid': tids, 'tag': unique_tags})
    tag2tid = {tag: tid for tag, tid in zip(unique_tags, tids)}
    tagging['tid'] = np.array([tag2tid[tag] for tag in tagging.tag], dtype=np.int)
    tagging = tagging.drop(columns=['tag'])

    return movies, ratings, tagging, tags

num_feat_core = 10
num_core = 10

movies = movies.drop_duplicates()
ratings = ratings.drop_duplicates()
tagging = tagging.drop_duplicates()

movies = movies[movies.iid.isin(ratings.iid.unique())]
ratings = ratings[ratings.iid.isin(movies.iid.unique())]
tagging = tagging[tagging.iid.isin(ratings.iid.unique())]
tagging = tagging[tagging.uid.isin(ratings.uid.unique())]

movie_count = ratings['iid'].value_counts()
movie_count.name = 'movie_count'
ratings = ratings[ratings.join(movie_count, on='iid').movie_count > num_core]

user_count = ratings['uid'].value_counts()
user_count.name = 'user_count'
ratings = ratings[ratings.join(user_count, on='uid').user_count > num_core]

movies = movies[movies.iid.isin(ratings.iid.unique())]
tagging = tagging[tagging.iid.isin(ratings.iid.unique())]
tagging = tagging[tagging.uid.isin(ratings.uid.unique())]

tag_count = tagging['tag'].value_counts()
tag_count.name = 'tag_count'
tagging = tagging[tagging.join(tag_count, on='tag').tag_count > num_feat_core]

years = movies.year.to_numpy()
years[years < 1950] = 1950
movies['year'] = years

years = movies.year.to_numpy().astype(np.int)
min_year = min(years)
max_year = max(years)
num_years = (max_year - min_year) // 10
discretized_years = [min_year + i * 10 for i in range(num_years + 1)]
for i in range(len(discretized_years) - 1):
    years[(discretized_years[i] <= years) & (years < discretized_years[i + 1])] = str(
    discretized_years[i])
    years[years < discretized_years[0]] = discretized_years[0]
    years[years >= discretized_years[-1]] = discretized_years[-1]

movies['year'] = years

movies, ratings, tagging, tags = reindex_df_mlsmall(
                    movies, ratings, tagging)

In [8]:
ratings = ratings.sort_values('timestamp')

In [14]:
tf_size = 3000

for i in range(25):
    tf = ratings[i*tf_size:(i+1)*tf_size]
    print(len(tf['uid'].unique()))
    counts = tf['uid'].value_counts().to_numpy()
    print(f'std: {np.std(counts)} mean: {np.mean(counts)}')

55
std: 26.63478121269131 mean: 54.54545454545455
55
std: 39.146725478671634 mean: 54.54545454545455
44
std: 66.7451982363024 mean: 68.18181818181819
29
std: 101.18388389015473 mean: 103.44827586206897
18
std: 158.06046662942353 mean: 166.66666666666666
31
std: 105.44275611865437 mean: 96.7741935483871
39
std: 88.71168645454479 mean: 76.92307692307692
29
std: 107.94365906115148 mean: 103.44827586206897
31
std: 136.89735021291455 mean: 96.7741935483871
28
std: 108.5441695103605 mean: 107.14285714285714
36
std: 132.79223538211028 mean: 83.33333333333333
40
std: 112.98694614865913 mean: 75.0
33
std: 152.88743405889494 mean: 90.9090909090909
34
std: 150.0445147327422 mean: 88.23529411764706
35
std: 91.40977485354348 mean: 85.71428571428571
40
std: 125.48485964450055 mean: 75.0
42
std: 109.71741687000222 mean: 71.42857142857143
54
std: 75.02263033064352 mean: 55.55555555555556
41
std: 99.53842313374473 mean: 73.17073170731707
44
std: 64.19121744206302 mean: 68.18181818181819
34
std: 138.198

In [10]:
counts = ratings['uid'].value_counts().to_numpy()
print(f'cv: {np.mean(counts)/np.std(counts)}')

cv: 0.7442735006202479


In [15]:
len(ratings['uid'].unique())

608

In [21]:
ratings.groupby("uid").count().iid.values

array([ 209,   26,   24,  177,   44,  220,  136,   47,   29,  102,   60,
         32,   28,   46,  130,   95,  105,  422,  513,  206,  342,  114,
         95,  108,   23,   21,  120,  417,   76,   34,   44,   97,  120,
         77,   23,   50,   21,   78,   97,   93,  201,  358,  100,   41,
        359,   42,  103,   32,   21,  180,  273,  116,   20,   30,   25,
         46,  408,  112,   91,   22,   39,  314,  253,  485,   34,  281,
         32, 1064,   44,   59,   34,   43,  170,  113,   64,  114,   29,
         61,   55,  137,   26,  193,  105,  236,   20,   68,   21,   50,
        174,   29,  458,   24,   93,   56,  145,   75,   34,   87,   48,
        136,   54,   56,  333,  208,  460,   32,   34,   50,  120,   45,
        391,   65,  101,   28,  107,   85,  157,   22,  171,   22,   58,
        268,   50,   50,  202,   38,   22,   27,  133,   28,   64,  315,
         35,   35,  258,   98,  123,   19,  193,  423,  168,   37,   51,
        125,   23,   30,   20,   45,   54,   26,   