# **영화 추천 시스템**
MovieLens 데이터로 사용자의 영화 평점 패턴 학습

→ 비어 있는 평점 예측 (사용자가 아직 보지 않은 영화에 대해 어떻게 평가할지 예측)

In [1]:
import pandas as pd
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

### **1. Load dataset**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load dataset
ratings = pd.read_csv('/content/drive/MyDrive/Samples/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
display(ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### **2. Data Preprocessing**

In [5]:
# Select some columns
ratings = ratings[['userId', 'movieId', 'rating']]

In [6]:
# Create rating matrix
ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
print(ratings_matrix.info())

<class 'pandas.core.frame.DataFrame'>
Index: 610 entries, 1 to 610
Columns: 9724 entries, 1 to 193609
dtypes: float64(9724)
memory usage: 45.3 MB
None


In [7]:
display(ratings_matrix.head())

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Fill review value of unseen movie as 0
ratings_matrix.fillna(0, inplace=True)

In [9]:
# Normalize review value
ratings_matrix = ratings_matrix.values / 5.0

### **3. Modeling**

In [11]:
# Split the dataset
x_train, x_val = train_test_split(ratings_matrix, test_size=0.2, random_state=42)

In [12]:
# Build autoencoder model
input_layer = Input(shape=(ratings_matrix.shape[1],))                            # input layer
encoded = Dense(9724, activation='relu')(input_layer)
encoded = Dense(1024, activation='relu')(encoded)
encoded = Dense(512, activation='relu')(encoded)                            # encoder (dimension reduction)
encoded = Dense(256, activation='relu')(encoded)
encoded = Dense(128, activation='relu')(encoded)
encoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)                                # decoder (dimension expansion)
decoded = Dense(256, activation='relu')(decoded)
decoded = Dense(512, activation='relu')(decoded)
decoded = Dense(1024, activation='relu')(decoded)
decoded = Dense(9724, activation='relu')(decoded)
output_layer = Dense(ratings_matrix.shape[1], activation='sigmoid')(decoded)    # output layer
autoencoder = Model(input_layer, output_layer)
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 9724)]            0         
                                                                 
 dense (Dense)               (None, 9724)              94565900  
                                                                 
 dense_1 (Dense)             (None, 1024)              9958400   
                                                                 
 dense_2 (Dense)             (None, 512)               524800    
                                                                 
 dense_3 (Dense)             (None, 256)               131328    
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 64)                8256  

In [13]:
# Define a loss function that calculates loss by only using real rating value
def masked_mse(y_true, y_pred):                         # masking technique
    mask = K.cast(K.not_equal(y_true, 0), K.floatx())   # if real rating value then 1, else fake rating value then 0
    error = K.square((y_true - y_pred) * mask)          # loss of fake rating value becomes 0 (only taking loss of real rating value)
    return K.sum(error) / K.sum(mask)                   # calculate MSE

In [14]:
# Compile
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss=masked_mse)

In [15]:
# Fit the model
autoencoder.fit(x_train, x_train, epochs=50, batch_size=256, validation_data=(x_val, x_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7d18d4c25a50>

In [16]:
# Evaluate the model
val_loss = autoencoder.evaluate(x_val, x_val)
print('Validation loss:', val_loss)

Validation loss: 0.037209950387477875


### **4. Prediction**

In [17]:
# Predict ratings
predicted_ratings = autoencoder.predict(ratings_matrix) * 5.0



In [18]:
# Recommend movies
def generate_recommendations(user_id, predicted_ratings, ratings_matrix):
    user_ratings = ratings_matrix[user_id-1]
    unseen_movies_idx = user_ratings == 0
    predicted_ratings_for_user = predicted_ratings[user_id-1]
    recommended_movies = predicted_ratings_for_user[unseen_movies_idx]
    top_recommended_movie_ids = recommended_movies.argsort()[::-1][:10]
    return top_recommended_movie_ids

user_id = 1
recommendations = generate_recommendations(user_id, predicted_ratings, ratings_matrix)
print('Recommended movie IDs for user {}: {}'.format(user_id, recommendations))

Recommended movie IDs for user 1: [8711 3269 5335 5244 6195 2442 5164 7371 5802 9368]
