In [1]:
import pandas as pd
import numpy as np
import warnings
import time
import itertools
import copy
# Suppress every Python warning raised after this point in the session.
warnings.filterwarnings("ignore")
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Render all DataFrame columns and allow up to 100 characters per cell value.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',100)

In [2]:
# Dataset identifier string; NOTE(review): not referenced by any cell visible in this notebook.
dataset_name = "/Movie"

# 1. Read Data

In [3]:
# Load the ratings file: "::"-delimited, no header row, one rating per line.
# A multi-character separator is only handled by pandas' Python parser, so the
# engine is requested explicitly instead of relying on the implicit fallback
# (whose ParserWarning is hidden by the global warnings filter).
rating_df = pd.read_csv(
    './ml-1m/ratings.dat',
    sep="::",
    names=["userid", "itemid", "rating", "timestamp"],
    engine="python",
)
rating_df
# Fraction of missing values per column, largest first.
(rating_df.isna().sum()/rating_df.shape[0]).sort_values(ascending=False)
# Largest user/item ids; item_max is reused later to size the distance matrix.
user_max = rating_df["userid"].max()
item_max = rating_df["itemid"].max()
print(user_max)
print(item_max)
# Number of distinct users and items that actually appear in the ratings.
print(rating_df["userid"].nunique())
print(rating_df["itemid"].nunique())

Unnamed: 0,userid,itemid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


userid       0.0
itemid       0.0
rating       0.0
timestamp    0.0
dtype: float64

6040
3952
6040
3706


In [4]:
# Load the user table: "::"-delimited, no header row.
# The Python parser is requested explicitly because the C parser cannot handle
# a multi-character separator (pandas would otherwise fall back with a warning).
user_df = pd.read_csv(
    './ml-1m/users.dat',
    sep="::",
    names=["userid", "gender", "age", "occupation", "zip_code"],
    engine="python",
)
user_df
# Fraction of missing values per column, largest first.
(user_df.isna().sum()/user_df.shape[0]).sort_values(ascending=False)

Unnamed: 0,userid,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


userid        0.0
gender        0.0
age           0.0
occupation    0.0
zip_code      0.0
dtype: float64

In [5]:
# Load the movie table: "::"-delimited, no header row, decoded as latin-1.
# The Python parser is requested explicitly because the C parser cannot handle
# a multi-character separator (pandas would otherwise fall back with a warning).
item_df = pd.read_csv(
    './ml-1m/movies.dat',
    sep="::",
    names=["itemid", "title", "genres"],
    encoding="latin-1",
    engine="python",
)
item_df
# Fraction of missing values per column, largest first.
(item_df.isna().sum()/item_df.shape[0]).sort_values(ascending=False)

Unnamed: 0,itemid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


itemid    0.0
title     0.0
genres    0.0
dtype: float64

In [6]:
def process_movielens_name(s):
    """Normalise a raw MovieLens title such as ``"American President, The (1995)"``.

    Steps:
      1. Drop the last 7 characters (the trailing ``" (YYYY)"`` year suffix).
      2. Drop everything from the first ``" ("`` onwards (e.g. an alternative
         title given in parentheses).
      3. Move a trailing article (``", The"`` / ``", A"``) to the front.

    Args:
        s: Raw title string; assumed to end with a 7-character year suffix.

    Returns:
        The cleaned title string.
    """
    s = s[:-7]
    s = s.split(" (")[0]
    for pattern in [", The", ", A"]:
        if s.endswith(pattern):
            article = pattern.split(", ")[1]
            # Strip only the trailing suffix. str.replace would also delete any
            # earlier occurrence of the same text inside the title
            # (e.g. "Good, The Bad and the Ugly, The").
            s = article + " " + s[:-len(pattern)]
    return s
# Overwrite the raw titles with their normalised form.
# NOTE: this cell is not idempotent -- running it twice strips another 7
# characters from every title; reload item_df before re-running.
item_df["title"] = item_df["title"].apply(process_movielens_name)
item_df

Unnamed: 0,itemid,title,genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy
...,...,...,...
3878,3948,Meet the Parents,Comedy
3879,3949,Requiem for a Dream,Drama
3880,3950,Tigerland,Drama
3881,3951,Two Family House,Drama


In [7]:
import json
def split_genres(genres):
    """Return the individual genre names of a '|'-delimited genre string."""
    separator = '|'
    parts = genres.split(separator)
    return parts

# Build a {title: [genre, ...]} dictionary from the DataFrame, splitting the genres column.
# NOTE(review): normalised titles are not unique (the next cell reports 3883 ids
# but only 3833 distinct titles), so for a repeated title only the genres of
# its last row are kept.
list_genre = {
    title: split_genres(genre_string)
    for title, genre_string in zip(item_df['title'], item_df['genres'])
}
# NOTE(review): absolute, machine-specific output path.
with open("/data1/shiwt/FireAct/env/movie/list_genre.json", "w") as file:
    json.dump(list_genre, file)

In [8]:
# Forward lookup: item id -> normalised title (one entry per movie row).
id2item_dict = pd.Series(item_df["title"].values, index=item_df["itemid"]).to_dict()
id2item_dict
len(id2item_dict)
# Reverse lookup: normalised title -> item id. Titles that collide after
# normalisation keep only the id of their last row, so this dict is smaller.
item2id_dict = pd.Series(item_df["itemid"].values, index=item_df["title"]).to_dict()
item2id_dict
len(item2id_dict)

{1: 'Toy Story',
 2: 'Jumanji',
 3: 'Grumpier Old Men',
 4: 'Waiting to Exhale',
 5: 'Father of the Bride Part II',
 6: 'Heat',
 7: 'Sabrina',
 8: 'Tom and Huck',
 9: 'Sudden Death',
 10: 'GoldenEye',
 11: 'The American President',
 12: 'Dracula: Dead and Loving It',
 13: 'Balto',
 14: 'Nixon',
 15: 'Cutthroat Island',
 16: 'Casino',
 17: 'Sense and Sensibility',
 18: 'Four Rooms',
 19: 'Ace Ventura: When Nature Calls',
 20: 'Money Train',
 21: 'Get Shorty',
 22: 'Copycat',
 23: 'Assassins',
 24: 'Powder',
 25: 'Leaving Las Vegas',
 26: 'Othello',
 27: 'Now and Then',
 28: 'Persuasion',
 29: 'The City of Lost Children',
 30: 'Shanghai Triad',
 31: 'Dangerous Minds',
 32: 'Twelve Monkeys',
 33: 'Wings of Courage',
 34: 'Babe',
 35: 'Carrington',
 36: 'Dead Man Walking',
 37: 'Across the Sea of Time',
 38: 'It Takes Two',
 39: 'Clueless',
 40: 'Cry, the Beloved Country',
 41: 'Richard III',
 42: 'Dead Presidents',
 43: 'Restoration',
 44: 'Mortal Kombat',
 45: 'To Die For',
 46: 'How to 

3883

{'Toy Story': 1,
 'Jumanji': 2,
 'Grumpier Old Men': 3,
 'Waiting to Exhale': 4,
 'Father of the Bride Part II': 5,
 'Heat': 6,
 'Sabrina': 915,
 'Tom and Huck': 8,
 'Sudden Death': 9,
 'GoldenEye': 10,
 'The American President': 11,
 'Dracula: Dead and Loving It': 12,
 'Balto': 13,
 'Nixon': 14,
 'Cutthroat Island': 15,
 'Casino': 16,
 'Sense and Sensibility': 17,
 'Four Rooms': 18,
 'Ace Ventura: When Nature Calls': 19,
 'Money Train': 20,
 'Get Shorty': 21,
 'Copycat': 22,
 'Assassins': 23,
 'Powder': 24,
 'Leaving Las Vegas': 25,
 'Othello': 2848,
 'Now and Then': 27,
 'Persuasion': 28,
 'The City of Lost Children': 29,
 'Shanghai Triad': 30,
 'Dangerous Minds': 31,
 'Twelve Monkeys': 32,
 'Wings of Courage': 33,
 'Babe': 34,
 'Carrington': 35,
 'Dead Man Walking': 36,
 'Across the Sea of Time': 37,
 'It Takes Two': 38,
 'Clueless': 39,
 'Cry, the Beloved Country': 40,
 'Richard III': 41,
 'Dead Presidents': 42,
 'Restoration': 43,
 'Mortal Kombat': 44,
 'To Die For': 45,
 'How to 

3833

# Generate the distance matrix

In [9]:
# Square placeholder matrix indexed by (itemid - 1); it stays all-zero because
# the code that fills it is commented out below.
distance = np.zeros((item_max, item_max))
genres = item_df['genres'].tolist()
# Distinct genre names in order of first appearance (dict keys preserve insertion order).
genre_list = list(dict.fromkeys(
    name
    for genre_string in genres
    for name in genre_string.split('|')
))
print(genre_list)
# for i in range(len(genres)):
#     for j in range(len(genres)):
#         genre1 = genres[i].split('|')
#         genre2 = genres[j].split('|')
#         common_genres = set(genre1) & set(genre2)
#         count = len(common_genres)
#         x = item_df.loc[i, 'itemid'] -1
#         y = item_df.loc[j, 'itemid'] -1
#         distance[x, y] = count

# np.save('./distance.mat', distance)

['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir', 'Western']


# 2. Filter

In [10]:
# Ids of every movie present in the item table (a list, despite the name).
item_title_exist_idset = item_df.itemid.tolist()
# Keep only ratings whose item id appears in the item table, renumbering rows from 0.
rating_df_fliter = rating_df.loc[
    rating_df["itemid"].isin(item_title_exist_idset)
].reset_index(drop=True)
rating_df_fliter

Unnamed: 0,userid,itemid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [11]:
# Binary feedback label: 1 when the rating is at least 4, otherwise 0.
# Computed as a vectorised comparison rather than a per-row Python lambda;
# the resulting column is int64, as before.
rating_df_fliter["label"] = rating_df_fliter["rating"].ge(4).astype("int64")
rating_df_fliter

Unnamed: 0,userid,itemid,rating,timestamp,label
0,1,1193,5,978300760,1
1,1,661,3,978302109,0
2,1,914,3,978301968,0
3,1,3408,4,978300275,1
4,1,2355,5,978824291,1
...,...,...,...,...,...
1000204,6040,1091,1,956716541,0
1000205,6040,1094,5,956704887,1
1000206,6040,562,5,956704746,1
1000207,6040,1096,4,956715648,1


In [12]:
# Share of ratings labelled positive (label == 1).
int(rating_df_fliter["label"].eq(1).sum()) / len(rating_df_fliter)

0.5751607913945985

In [13]:
# Popularity table: number of positive (label == 1) ratings per item,
# ordered from most to least popular by value_counts.
item_pop = (
    rating_df_fliter.loc[rating_df_fliter["label"] == 1, "itemid"]
    .value_counts()
    .rename_axis("itemid")
    .reset_index(name="pop")
)
item_pop

Unnamed: 0,itemid,pop
0,2858,2853
1,260,2622
2,1196,2510
3,2028,2260
4,1198,2260
...,...,...
3528,396,1
3529,2955,1
3530,1102,1
3531,687,1


In [14]:
# Order each user's ratings chronologically before collecting them.
rating_df_fliter = rating_df_fliter.sort_values(["userid", "timestamp"], ascending=True)
# One row per user: the time-ordered item ids and their matching labels.
sequence_df = (
    rating_df_fliter
    .groupby("userid")
    .agg(itemid_seq=("itemid", list), label_seq=("label", list))
    .reset_index()
)
sequence_df

Unnamed: 0,userid,itemid_seq,label_seq
0,1,"[3186, 1270, 1721, 1022, 2340, 1836, 3408, 2804, 1207, 1193, 720, 260, 919, 608, 2692, 1961, 202...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,..."
1,2,"[1198, 1210, 1217, 2717, 1293, 2943, 1225, 1193, 318, 3030, 2858, 1213, 1945, 1207, 593, 3095, 3...","[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,3,"[593, 2858, 3534, 1968, 1431, 1961, 1266, 1378, 1379, 3671, 590, 260, 1196, 2871, 1197, 1198, 31...","[0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,..."
3,4,"[1210, 1097, 3468, 480, 3527, 260, 1196, 1198, 1387, 2028, 2366, 1201, 2692, 2947, 1214, 3418, 3...","[0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,5,"[2717, 908, 919, 1250, 356, 2858, 1127, 2188, 2683, 3051, 2997, 2770, 2355, 2908, 3786, 3016, 27...","[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,..."
...,...,...,...
6035,6036,"[1721, 2428, 3438, 1883, 2376, 2492, 2826, 2827, 2858, 2572, 2683, 2699, 2706, 2707, 3005, 2842,...","[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,..."
6036,6037,"[1882, 3508, 702, 1267, 2028, 3148, 858, 562, 912, 3543, 1221, 923, 2804, 2858, 2132, 1193, 318,...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,..."
6037,6038,"[920, 3396, 1210, 2146, 356, 1387, 1079, 1148, 3548, 1276, 2716, 3088, 232, 1136, 1223, 1296, 13...","[0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1]"
6038,6039,"[282, 111, 2067, 930, 1230, 3022, 947, 3088, 3133, 1294, 3421, 2804, 1269, 955, 1244, 1276, 2622...","[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,..."


In [15]:
def get_seq(row):
    """Split one user's interaction sequence into positive and negative items.

    Args:
        row: Object exposing ``itemid_seq`` and ``label_seq`` (parallel lists).

    Returns:
        Tuple ``(pos_seq, neg_seq, seq_length, pos_seq_length, neg_seq_length,
        pos_seq_name)`` where ``pos_seq`` holds the item ids labelled 1,
        ``neg_seq`` all other item ids, and ``pos_seq_name`` the titles of the
        positive items looked up in the module-level ``id2item_dict``.
    """
    liked_ids, other_ids, liked_titles = [], [], []
    for position, label in enumerate(row.label_seq):
        item_id = row.itemid_seq[position]
        if label == 1:
            liked_ids.append(item_id)
            liked_titles.append(id2item_dict[item_id])
        else:
            other_ids.append(item_id)
    return (
        liked_ids,
        other_ids,
        len(row.itemid_seq),
        len(liked_ids),
        len(other_ids),
        liked_titles,
    )

# Expand the 6-tuple returned by get_seq into six new per-user columns.
derived_columns = [
    "pos_seq",
    "neg_seq",
    "seq_length",
    "pos_seq_length",
    "neg_seq_length",
    "pos_seq_name",
]
sequence_df[derived_columns] = sequence_df.apply(get_seq, axis=1, result_type="expand")
sequence_df

Unnamed: 0,userid,itemid_seq,label_seq,pos_seq,neg_seq,seq_length,pos_seq_length,neg_seq_length,pos_seq_name
0,1,"[3186, 1270, 1721, 1022, 2340, 1836, 3408, 2804, 1207, 1193, 720, 260, 919, 608, 2692, 1961, 202...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,...","[3186, 1270, 1721, 1022, 1836, 3408, 2804, 1207, 1193, 260, 919, 608, 2692, 1961, 2028, 3105, 93...","[2340, 720, 914, 661, 2321, 1197, 2687, 745]",53,45,8,"[Girl, Interrupted, Back to the Future, Titanic, Cinderella, The Last Days of Disco, Erin Brocko..."
1,2,"[1198, 1210, 1217, 2717, 1293, 2943, 1225, 1193, 318, 3030, 2858, 1213, 1945, 1207, 593, 3095, 3...","[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1198, 1210, 1293, 2943, 1225, 1193, 318, 3030, 2858, 1945, 1207, 593, 3095, 3468, 1873, 515, 25...","[1217, 2717, 1213, 1090, 1084, 3654, 3735, 2728, 1968, 1103, 902, 2852, 2312, 1253, 1244, 3699, ...",129,73,56,"[Raiders of the Lost Ark, Star Wars: Episode VI - Return of the Jedi, Gandhi, Indochine, Amadeus..."
2,3,"[593, 2858, 3534, 1968, 1431, 1961, 1266, 1378, 1379, 3671, 590, 260, 1196, 2871, 1197, 1198, 31...","[0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...","[2858, 1968, 1961, 1266, 1378, 1379, 3671, 590, 260, 1196, 2871, 1197, 1198, 3168, 1210, 1291, 2...","[593, 3534, 1431, 1580, 1261, 2617, 648, 3114, 2997, 3619, 1270, 1265, 1641, 3868]",51,37,14,"[American Beauty, The Breakfast Club, Rain Man, Unforgiven, Young Guns, Young Guns II, Blazing S..."
3,4,"[1210, 1097, 3468, 480, 3527, 260, 1196, 1198, 1387, 2028, 2366, 1201, 2692, 2947, 1214, 3418, 3...","[0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1097, 3468, 480, 260, 1198, 1387, 2028, 2366, 1201, 2692, 2947, 1214, 3418, 3702, 1240, 2951, 1...","[1210, 3527, 1196]",21,18,3,"[E.T. the Extra-Terrestrial, The Hustler, Jurassic Park, Star Wars: Episode IV - A New Hope, Rai..."
4,5,"[2717, 908, 919, 1250, 356, 2858, 1127, 2188, 2683, 3051, 2997, 2770, 2355, 2908, 3786, 3016, 27...","[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,...","[908, 919, 1250, 2858, 2997, 2770, 2355, 2908, 3016, 2599, 2959, 2333, 2580, 2318, 2390, 913, 27...","[2717, 356, 1127, 2188, 2683, 3051, 3786, 2759, 1093, 3113, 3408, 2428, 3409, 2716, 2607, 2734, ...",198,82,116,"[North by Northwest, The Wizard of Oz, The Bridge on the River Kwai, American Beauty, Being John..."
...,...,...,...,...,...,...,...,...,...
6035,6036,"[1721, 2428, 3438, 1883, 2376, 2492, 2826, 2827, 2858, 2572, 2683, 2699, 2706, 2707, 3005, 2842,...","[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...","[1721, 1883, 2858, 2997, 2710, 223, 3203, 1093, 3408, 2575, 2599, 2336, 3159, 2580, 2712, 2757, ...","[2428, 3438, 2376, 2492, 2826, 2827, 2572, 2683, 2699, 2706, 2707, 3005, 2842, 2555, 2975, 3285,...",888,399,489,"[Titanic, Bulworth, American Beauty, Being John Malkovich, The Blair Witch Project, Clerks, Dead..."
6036,6037,"[1882, 3508, 702, 1267, 2028, 3148, 858, 562, 912, 3543, 1221, 923, 2804, 2858, 2132, 1193, 318,...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,...","[1267, 2028, 3148, 858, 562, 912, 1221, 923, 2804, 2858, 1193, 318, 527, 3006, 908, 3196, 1225, ...","[1882, 3508, 702, 3543, 2132, 1250, 1299, 1237, 1949, 2728, 3362, 1233, 2289, 3270, 2890, 2919, ...",202,120,82,"[The Manchurian Candidate, Saving Private Ryan, The Cider House Rules, The Godfather, Welcome to..."
6037,6038,"[920, 3396, 1210, 2146, 356, 1387, 1079, 1148, 3548, 1276, 2716, 3088, 232, 1136, 1223, 1296, 13...","[0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1]","[1210, 2146, 356, 1079, 1148, 3548, 3088, 232, 1136, 1223, 1296, 1419, 1183]","[920, 3396, 1387, 1276, 2716, 1354, 2700]",20,13,7,"[Star Wars: Episode VI - Return of the Jedi, St. Elmo's Fire, Forrest Gump, A Fish Called Wanda,..."
6038,6039,"[282, 111, 2067, 930, 1230, 3022, 947, 3088, 3133, 1294, 3421, 2804, 1269, 955, 1244, 1276, 2622...","[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,...","[282, 111, 2067, 930, 3022, 3088, 2804, 955, 1276, 2791, 2300, 2396, 1028, 1197, 3548, 951, 1211...","[1230, 947, 3133, 1294, 3421, 1269, 1244, 2622, 2863, 3072, 1066, 935, 2671, 1014, 909, 1265, 21...",123,90,33,"[Nell, Taxi Driver, Doctor Zhivago, Notorious, The General, Harvey, A Christmas Story, Bringing ..."


In [16]:
import json

# Keep only users with at least 10 positively labelled items.
sequence_df_fliter = sequence_df.loc[sequence_df["pos_seq_length"] >= 10].reset_index(drop=True)
# sequence_df_fliter.to_csv(f"./ml-1m/processed/full_sequence_df.csv", sep="\t", index=False)
# sequence_df_fliter
# One record per user: the user id plus the titles of all positive items.
sequence = sequence_df_fliter[['userid', 'pos_seq_name']].to_dict('records')
for task in sequence:
    # The prompt shows only the 15 most recent positive titles.
    recent_titles = task['pos_seq_name'][-15:]
    task['question'] = f'''The user's viewing history is {recent_titles}, please recommend item for this user'''
with open("./train.json", "w") as json_file:
    json.dump(sequence, json_file)

# 4. Generate Datamaps

In [17]:
# def process_movielens_name(s):
#     s = s[:-7]
#     s = s.split(" (")[0]
#     for pattern in [", The", ", A"]:
#         if s.endswith(pattern):
#             s = pattern.split(", ")[1] + " " + s.replace(pattern, "")
#     return s
# item_df["title"] = item_df["title"].apply(lambda x: process_movielens_name(x))
# item_df

In [18]:
# Rebuild the two lookup tables from item_df (same construction as the earlier cell).
# Forward lookup: item id -> normalised title.
id2item_dict = pd.Series(item_df["title"].values, index=item_df["itemid"]).to_dict()
id2item_dict
# Reverse lookup: normalised title -> item id (a repeated title keeps its last id).
item2id_dict = pd.Series(item_df["itemid"].values, index=item_df["title"]).to_dict()
item2id_dict

{1: 'Toy Story',
 2: 'Jumanji',
 3: 'Grumpier Old Men',
 4: 'Waiting to Exhale',
 5: 'Father of the Bride Part II',
 6: 'Heat',
 7: 'Sabrina',
 8: 'Tom and Huck',
 9: 'Sudden Death',
 10: 'GoldenEye',
 11: 'The American President',
 12: 'Dracula: Dead and Loving It',
 13: 'Balto',
 14: 'Nixon',
 15: 'Cutthroat Island',
 16: 'Casino',
 17: 'Sense and Sensibility',
 18: 'Four Rooms',
 19: 'Ace Ventura: When Nature Calls',
 20: 'Money Train',
 21: 'Get Shorty',
 22: 'Copycat',
 23: 'Assassins',
 24: 'Powder',
 25: 'Leaving Las Vegas',
 26: 'Othello',
 27: 'Now and Then',
 28: 'Persuasion',
 29: 'The City of Lost Children',
 30: 'Shanghai Triad',
 31: 'Dangerous Minds',
 32: 'Twelve Monkeys',
 33: 'Wings of Courage',
 34: 'Babe',
 35: 'Carrington',
 36: 'Dead Man Walking',
 37: 'Across the Sea of Time',
 38: 'It Takes Two',
 39: 'Clueless',
 40: 'Cry, the Beloved Country',
 41: 'Richard III',
 42: 'Dead Presidents',
 43: 'Restoration',
 44: 'Mortal Kombat',
 45: 'To Die For',
 46: 'How to 

{'Toy Story': 1,
 'Jumanji': 2,
 'Grumpier Old Men': 3,
 'Waiting to Exhale': 4,
 'Father of the Bride Part II': 5,
 'Heat': 6,
 'Sabrina': 915,
 'Tom and Huck': 8,
 'Sudden Death': 9,
 'GoldenEye': 10,
 'The American President': 11,
 'Dracula: Dead and Loving It': 12,
 'Balto': 13,
 'Nixon': 14,
 'Cutthroat Island': 15,
 'Casino': 16,
 'Sense and Sensibility': 17,
 'Four Rooms': 18,
 'Ace Ventura: When Nature Calls': 19,
 'Money Train': 20,
 'Get Shorty': 21,
 'Copycat': 22,
 'Assassins': 23,
 'Powder': 24,
 'Leaving Las Vegas': 25,
 'Othello': 2848,
 'Now and Then': 27,
 'Persuasion': 28,
 'The City of Lost Children': 29,
 'Shanghai Triad': 30,
 'Dangerous Minds': 31,
 'Twelve Monkeys': 32,
 'Wings of Courage': 33,
 'Babe': 34,
 'Carrington': 35,
 'Dead Man Walking': 36,
 'Across the Sea of Time': 37,
 'It Takes Two': 38,
 'Clueless': 39,
 'Cry, the Beloved Country': 40,
 'Richard III': 41,
 'Dead Presidents': 42,
 'Restoration': 43,
 'Mortal Kombat': 44,
 'To Die For': 45,
 'How to 

In [19]:
# Bundle both lookup tables into a single structure for export.
datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}

# Size check: the reverse map is smaller whenever several ids share one title.
print(len(id2item_dict))
print(len(item2id_dict))
import json
# JSON object keys are always strings, so the integer item ids in
# id2item_dict are written out as strings.
json_str = json.dumps(datamaps)
with open("./ml-1m/processed/datamaps.json", 'w') as out:
    out.write(json_str)


3883
3833


203126

In [20]:
# Popularity lookup: item id -> number of positive ratings.
item2pop_dict = item_pop.set_index("itemid")["pop"].to_dict()
# Items with no positive rating are absent from item_pop; give them a count of 0.
item2pop_dict.update({
    item_id: 0
    for item_id in id2item_dict
    if item_id not in item2pop_dict
})
item2pop_dict

# JSON object keys are always strings, so item ids are written out as strings.
json_str = json.dumps(item2pop_dict)
with open("./ml-1m/processed/popularity_datamaps.json", 'w') as out:
    out.write(json_str)

{2858: 2853,
 260: 2622,
 1196: 2510,
 2028: 2260,
 1198: 2260,
 593: 2252,
 2571: 2171,
 2762: 2163,
 1210: 2127,
 608: 2074,
 527: 2071,
 318: 2046,
 589: 2044,
 858: 1989,
 110: 1977,
 1197: 1924,
 1270: 1910,
 2396: 1877,
 1617: 1876,
 296: 1770,
 2997: 1759,
 480: 1730,
 1240: 1683,
 356: 1668,
 1265: 1661,
 1: 1655,
 1580: 1644,
 1097: 1643,
 1214: 1623,
 457: 1615,
 50: 1608,
 2716: 1548,
 1193: 1519,
 3578: 1508,
 541: 1485,
 1221: 1444,
 912: 1434,
 1200: 1421,
 1259: 1420,
 919: 1385,
 1213: 1370,
 1136: 1364,
 1036: 1332,
 3114: 1302,
 1291: 1297,
 1610: 1297,
 2791: 1284,
 924: 1270,
 1387: 1266,
 1704: 1256,
 2916: 1248,
 34: 1234,
 1307: 1211,
 750: 1207,
 2355: 1192,
 908: 1170,
 1304: 1170,
 2000: 1168,
 2918: 1168,
 1225: 1155,
 3175: 1145,
 2628: 1132,
 2599: 1130,
 2987: 1111,
 2804: 1105,
 2959: 1096,
 32: 1083,
 1968: 1067,
 1394: 1066,
 3471: 1064,
 1230: 1061,
 2797: 1051,
 1784: 1047,
 1247: 1043,
 3793: 1042,
 1961: 1028,
 1219: 1028,
 1127: 1028,
 223: 1013,
 

45821