1-imports

In [1]:
import pandas as pd
import numpy as np

2-Configuration and Loading the Ratings Data

In [2]:

# --- 1. CONFIGURATION ---
# Length of each embedding vector; every user and every movie gets one vector
# of this size. Small values (e.g. 3) are easier to inspect by eye, larger
# ones (100, 256) are common in real systems.
EMBEDDING_DIMENSION = 50

# --- 2. LOAD THE RATINGS TABLE ---
# Reads the ratings CSV from the local data folder. Later cells use the
# 'userId', 'movieId' and 'rating' columns of this frame.
print("--- Initial Data (Ratings Table) ---")
df = pd.read_csv('./data/rating.csv')
# A single print call: sep="\n" supplies the line break that would otherwise
# come from the end of a separate print(df).
print(df, "\n" + "=" * 50 + "\n", sep="\n")

--- Initial Data (Ratings Table) ---
          userId  movieId  rating            timestamp
0              1        2     3.5  2005-04-02 23:53:47
1              1       29     3.5  2005-04-02 23:31:16
2              1       32     3.5  2005-04-02 23:33:39
3              1       47     3.5  2005-04-02 23:32:07
4              1       50     3.5  2005-04-02 23:29:40
...          ...      ...     ...                  ...
20000258  138493    68954     4.5  2009-11-13 15:42:00
20000259  138493    69526     4.5  2009-12-03 18:31:48
20000260  138493    69644     3.0  2009-12-07 18:10:57
20000261  138493    70286     5.0  2009-11-13 15:42:24
20000262  138493    71619     2.5  2009-10-17 20:25:36

[20000263 rows x 4 columns]




3-Create Id-To-Index mapping

In [3]:

# --- 3. CREATE ID-TO-INDEX MAPPINGS ---
# Raw IDs are labels that need not be contiguous, so they cannot be used
# directly as row numbers. Each distinct ID is therefore assigned a position
# 0, 1, 2, ... in the order in which it first appears in the ratings table.

unique_users = df['userId'].unique()
unique_movies = df['movieId'].unique()

# { original ID : array index } for users, e.g. first ID seen -> 0
user_to_idx = {
    user_id: position for position, user_id in enumerate(unique_users)
}

# { original ID : array index } for movies, built the same way
movie_to_idx = {
    movie_id: position for position, movie_id in enumerate(unique_movies)
}

# Spot-check: array index assigned to user ID 2 and to movie ID 2
print("Mapping User IDs to Array Index:")
print(user_to_idx[2])
print("Mapping Movie IDs to Array Index:")
print(movie_to_idx[2])
print("\n" + "=" * 50 + "\n")



Mapping User IDs to Array Index:
1
Mapping Movie IDs to Array Index:
0




4-Initialize random embedding matrices

In [4]:

# --- 4. INITIALIZE RANDOM EMBEDDING MATRICES (W) ---
# Each row in these matrices is the random starting vector for one unique
# entity; row i belongs to the ID that the matching *_to_idx dict maps to i.

num_users = len(unique_users)
num_movies = len(unique_movies)

# Seeded generator: without a fixed seed every kernel restart produces
# different matrices, so printed vectors and anything trained on them could
# not be reproduced with Restart Kernel -> Run All.
EMBEDDING_SEED = 42
rng = np.random.default_rng(EMBEDDING_SEED)

# Matrix of size (Number of Unique Users) x (Dimension),
# values drawn uniformly from the half-open interval [-1, 1).
users_embed = rng.uniform(
    low=-1.0, high=1.0,
    size=(num_users, EMBEDDING_DIMENSION)
)

# Matrix of size (Number of Unique Movies) x (Dimension), same distribution.
movies_embed = rng.uniform(
    low=-1.0, high=1.0,
    size=(num_movies, EMBEDDING_DIMENSION)
)

print(f"User Embedding Matrix (W_user) Shape: {users_embed.shape}")
print(f"Movie Embedding Matrix (W_movie) Shape: {movies_embed.shape}")
# Look up the row for raw user ID 101 through the ID -> index mapping
print("\nExample of User of ID = 101 (vector format):")
print(users_embed[user_to_idx[101]])
print("\n" + "="*50 + "\n")

User Embedding Matrix (W_user) Shape: (138493, 50)
Movie Embedding Matrix (W_movie) Shape: (26744, 50)

Example of User of ID = 101 (vector format):
[ 0.43564307  0.97449836 -0.37775465 -0.95418875 -0.23995074  0.48664576
  0.45864161 -0.93614468 -0.93332675 -0.22691458  0.69949299 -0.464135
  0.66474404 -0.96346142  0.73242426 -0.46800001  0.98787577  0.8561139
 -0.6231519  -0.65145213  0.10275017  0.18954588 -0.06239443  0.27914489
  0.52363129 -0.12688866 -0.07940699 -0.03169153 -0.93526102  0.37147882
  0.67400882  0.31008283 -0.03693043  0.24533074 -0.66290096 -0.46751943
 -0.81646319 -0.16601179 -0.80262719  0.82980696  0.14026866 -0.62550372
  0.62211017 -0.72869238 -0.89877266 -0.75431576 -0.71831733  0.34819833
  0.73009655 -0.16366827]




5-Build feature matrix X and target Y

In [5]:

# --- 5. BUILD FINAL FEATURE MATRIX (X) AND TARGET (Y) ---
# For every rating row, the user's embedding and the movie's embedding are
# placed side by side, giving one feature row whose width is
# users_embed.shape[1] + movies_embed.shape[1].
#
# The lookups are done on whole columns rather than with df.iterrows(): a
# Python-level loop over every row of the ratings table did not finish in a
# reasonable time.
#
# NOTE: X_features is a dense matrix with one row per rating, so it needs
# len(df) * width * itemsize bytes. On the full ratings table this is very
# large; subsample df before this cell if it does not fit in memory.

print("Building X and Y by Iterating and Concatenating...")

# 1. Translate each row's raw userId / movieId into embedding-matrix row numbers
user_rows = df['userId'].map(user_to_idx).to_numpy(dtype=np.intp)
movie_rows = df['movieId'].map(movie_to_idx).to_numpy(dtype=np.intp)

# 2. Allocate the result once, then fill the left half with user vectors and
#    the right half with movie vectors (same layout as concatenating per row)
user_dim = users_embed.shape[1]
movie_dim = movies_embed.shape[1]
X_features = np.empty((len(df), user_dim + movie_dim), dtype=users_embed.dtype)
X_features[:, :user_dim] = users_embed[user_rows]
X_features[:, user_dim:] = movies_embed[movie_rows]

# 3. The target is the rating column, in the same row order as X_features
Y_ratings = df['rating'].to_numpy()

Building X and Y by Iterating and Concatenating...


KeyboardInterrupt: 

6-printing

In [None]:

# Make sure features and targets are NumPy arrays. np.asarray converts a list
# exactly like np.array would, but does not copy data that is already an array.
X = np.asarray(X_features)
Y = np.asarray(Y_ratings)

print("\n" + "="*50 + "\n")
print(f"FINAL FEATURE MATRIX X (Input for Model) Shape: {X.shape}")
print(f"FINAL TARGET VECTOR Y (Ratings) Shape: {Y.shape}")
# Report the real row width instead of a hard-coded number
print(f"\nFirst row of X ({X.shape[1]}-dimensional feature vector):")
print(X[0])
print("\nFirst value of Y (the rating):")
print(Y[0])

# X and Y are now ready to be split into training/testing sets and fed into
# a model such as LinearRegression.
# NOTE: a regressor fitted on X learns its own coefficients only; it does not
# change 'users_embed' or 'movies_embed'. Those matrices keep their random
# initial values unless they are trained by a model that updates them
# (e.g. matrix factorization or a network with embedding layers).