In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from datetime import datetime
import numpy as np
from tqdm import tqdm

In [3]:
users = pd.read_csv('/content/drive/MyDrive/ml-1m/users.dat', sep='::',engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip'])

ratings = pd.read_csv('/content/drive/MyDrive/ml-1m/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])

movies = pd.read_csv('/content/drive/MyDrive/ml-1m/movies.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False)

In [4]:
movies['movieid_norm'] = np.arange(len(movies.movieid))
ratings = ratings.merge(movies, how='left', left_on='movieid', right_on='movieid')

In [5]:
ratings

Unnamed: 0,userid,movieid,rating,timestamp,title,genre,movieid_norm
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1176
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,655
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,902
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,3339
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,2286
...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,1075
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,1078
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,558
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,1080


In [6]:
len(ratings.movieid_norm.unique())

3706

In [7]:
# ratings['rating_date'] = ratings.timestamp.apply(
#     lambda x: datetime.fromtimestamp(x)
# )

In [8]:
ratings.userid = ratings.userid - 1

In [9]:
mintime = min(ratings.timestamp)

In [10]:
maxtime = max(ratings.timestamp)

In [11]:
movies_list = [movie for movie in ratings.movieid.unique() if len(ratings[ratings.movieid == movie].userid.unique()) >= 10]

In [12]:
len(movies_list)

3260

In [13]:
ratings['choose'] = ratings.movieid.apply(
    lambda x: 1 if x in movies_list else 0
)

In [14]:
cnt = 0
for movie in ratings.movieid.unique():
    if movie in movies_list:
        ratings.loc[ratings.movieid == movie, 'movieid_norm'] = cnt
        cnt += 1
    else:
        ratings.loc[ratings.movieid == movie, 'movieid_norm'] = -1


In [15]:
users_list = np.array(ratings.userid.unique())
# users_list = [user for user in ratings.userid.unique() if len(ratings[ratings.userid == user].movieid) >= 10]

with open('user_item.txt', 'w') as f:
  for user in tqdm(users_list):
    movie_list = ratings[(ratings.userid == user) & (ratings.movieid_norm > 0) & (ratings.rating >= 3)].movieid_norm.unique()
    for movie in movie_list:
      f.write(f'{user}\t{movie}\n')


100%|██████████| 6040/6040 [00:41<00:00, 146.55it/s]


In [16]:
ratings['timestamp_norm'] = ratings.timestamp.apply(
    lambda x: x - mintime
)

In [17]:
month_time = 30 * 24 * 60 * 60
week_time = 7 * 24 * 60 * 60
three_day = 3 * 24 * 60 * 60
one_day = 24 * 60 * 60
h_hour = 8 * 60 * 60
a_hour = 60 * 60

In [18]:
ratings['month'] = ratings.timestamp_norm.apply(
    lambda x: int(x/month_time) + 1
)

In [19]:
ratings['week'] = ratings.timestamp_norm.apply(
    lambda x: int(x/week_time) + 1
)

In [20]:
ratings['three_days'] = ratings.timestamp_norm.apply(
    lambda x: int(x/three_day) + 1
)

In [21]:
ratings['one_day'] = ratings.timestamp_norm.apply(
    lambda x: int(x/one_day) + 1
)

In [22]:
ratings['h_hour'] = ratings.timestamp_norm.apply(
    lambda x: int(x/h_hour) + 1
)

In [23]:
ratings['a_hour'] = ratings.timestamp_norm.apply(
    lambda x: int(x/a_hour) + 1
)

In [26]:
import random
bundle_dict = {}
users_list = np.array(ratings.userid.unique())
random.seed(2024)

with open('/content/drive/MyDrive/ml-1m/bundle_item.txt', 'w') as f2, open('/content/drive/MyDrive/ml-1m/user_bundle_train.txt', 'w') as f3, open('/content/drive/MyDrive/ml-1m/user_bundle_tune.txt', 'w') as f4, open('/content/drive/MyDrive/ml-1m/user_bundle_test.txt', 'w') as f5:
  for user in tqdm(users_list):
    bundles = []
    for one_day in ratings[ratings.userid == user].one_day.unique():
      bundle = ratings[(ratings.userid == user) & (ratings.one_day == one_day) & (ratings.movieid_norm > 0) & (ratings.rating >= 3)].movieid_norm.unique()
      bundle.sort()
      bundle = tuple(bundle)

      if len(bundle) > 1:
        if bundle not in bundle_dict:
          bundle_dict[bundle] = len(bundle_dict)
          for item in bundle:
            f2.write(f'{bundle_dict[bundle]}\t{item}\n')
        bundles.append(bundle)

    random.shuffle(bundles)
    if len(bundles) >= 10:
      for bundle in bundles[:int(0.6*len(bundles))]:
      # for bundle in bundles[:-2]:
        f3.write(f'{user}\t{bundle_dict[bundle]}\n')

      for bundle in bundles[int(0.6*len(bundles)):int(0.8*len(bundles))]:
        f4.write(f'{user}\t{bundle_dict[bundle]}\n')

      for bundle in bundles[int(0.8*len(bundles)):]:
        f5.write(f'{user}\t{bundle_dict[bundle]}\n')


100%|██████████| 6040/6040 [03:45<00:00, 26.80it/s]


In [29]:
len(bundle_dict)

15913

In [None]:
norm = movies.movieid_norm.unique()
with open('/content/drive/MyDrive/ml-1m/movieid_lookup.txt', 'w') as f:
  for id in tqdm(norm):
    f.write(f'{id}\t{movies[movies.movieid_norm==id].movieid}\n')


100%|██████████| 3883/3883 [00:02<00:00, 1420.13it/s]
