# Movie-Lens 100k Dataset

## Data Import

In [1]:
import pandas as pd
data = pd.read_csv('https://raw.githubusercontent.com/han942/vscode/refs/heads/main/datafile/Recsys/ml-100k/u.data',sep='\t',
                   names=['userID','movieID','ratings','timestamp'])
data.head()

Unnamed: 0,userID,movieID,ratings,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
a="unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |"
a.replace(' |','').split()

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [3]:
#각 index가 ID가 됨.
items = pd.read_csv('https://raw.githubusercontent.com/han942/vscode/refs/heads/main/datafile/Recsys/ml-100k/u.item',encoding='latin-1',
                    sep='|',header=None,names=['movieID','title','release_date','video_release_date','URL','unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'])

In [4]:
items

Unnamed: 0,movieID,title,release_date,video_release_date,URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
users = pd.read_csv('https://raw.githubusercontent.com/han942/vscode/refs/heads/main/datafile/Recsys/ml-100k/u.user',sep='|',
                   names=['userID','age','gender','occupation','zipcode'])
users.head()

Unnamed: 0,userID,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
df = pd.merge(data,items.iloc[:,:2],on='movieID')
user_movie_matrix = pd.pivot_table(data=df,index='userID',columns='movieID',values='ratings')
user_movie_matrix_na = user_movie_matrix.fillna(0)


## Collaborative Filtering

### User-Based Filtering
1. Find Similarity Score btw different users
2. For Target User, Set Neighbors ($K$) to refer
3. Find the Neighbor Similarity Score
4. For Target Items, Predict Ratings for the unknown
    - Compution only needs to be done using observed ratings

$$prediction(a,p) = \bar{r_a} + \frac{\sum{sim(a,b)}*(r_{b,p}-\bar{r_b})}{\sum{|sim(a,b)|}}$$

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
user_movie_matrix = pd.pivot_table(data=df,index='userID',columns='movieID',values='ratings')
user_movie_matrix_na = user_movie_matrix.fillna(0)

user_cos = cosine_similarity(user_movie_matrix_na)

In [9]:
print('영화1의 rating 평균',user_movie_matrix[1].mean())
print('user1의 rating 평균',user_movie_matrix.loc[1].mean())

영화1의 rating 평균 3.8783185840707963
user1의 rating 평균 3.610294117647059


In [10]:
user_movie_matrix_nu = user_movie_matrix_na.to_numpy()

In [38]:
#특정 user_id에 대해 전체 참고
import numpy as np
new_p = user_movie_matrix.copy()

def prediction(user_id,p,b,new_p):
    new_p.loc[user_id,p] = user_movie_matrix_nu[user_id-1].mean() + np.sum(user_cos[user_id-1,b-1] * (user_movie_matrix_nu[b-1,p-1] - user_movie_matrix_nu[b-1].mean())) / np.sum(np.abs(user_cos[user_id-1,b-1]))
user_id = 13
for p in range(1,(len(user_movie_matrix.columns.values)+1)):
    for b in range(1,(len(user_movie_matrix.index.values)+1)):
        if user_movie_matrix_na.loc[user_id,p] == 0:
            if b == user_id:
                pass
            prediction(user_id,p,b,new_p)

KeyboardInterrupt: 

### Item Based CF

## Latent Factor Models

### SVD(MF,Matrix Factorization)

#### 방법1. 기존의 MF 방법
1. 정규분포를 따르는 무작위의 숫자로 구성된 User / Item Latent Matrix P,Q를 구성
2. Nan값이 아닌 Rating을 Test $P,Q^T$의 내적값을 predict로 정의
3. 두 값 사이의 Error를 구해서 Objective Function에 맞는 Optimization 진행
4. 모든 값을 Update 후에 2)~4)의 과정 반복
$$\text{Dimension Guide : } \\ R(m*n),P(m*k),Q(n*k) $$

In [12]:
import numpy as np
R = np.array(user_movie_matrix)
n_users,n_items = user_movie_matrix.shape
k = 20
P = np.random.normal(scale=0.1/k, size=(n_users,k))
Q = np.random.normal(scale=0.1/k, size=(n_items,k))
epochs = 500
lr = 0.03   #높이면 에러 뜨는듯
reg_param = 0.1 #0.01~0.1
not_nan_ind = [(i,j) for i in range(n_users) for j in range(n_items) if R[i,j]>0]

for epoch in range(epochs):
    np.random.shuffle(not_nan_ind)
    for i,j in not_nan_ind:
        error = R[i,j] - np.dot(P[i,:],Q[j,:].T)
        P[i,:] += lr * (error * Q[j,:] - reg_param * P[i,:])
        Q[j,:] += lr * (error * P[i,:] - reg_param * Q[j,:])
    if epoch % 100 == 0:
        loss = np.sum([((R[i,j]-np.dot(P[i,:],Q[j,:].T))**2) for i,j in not_nan_ind])
        rmse = np.sqrt(loss / len(not_nan_ind))
        print(f"Epoch: {epoch}, Loss: {loss:.4f}, RMSE: {rmse:.4f}")

R_hat = np.dot(P,Q.T)
print("\n예측된 평점 행렬")
print(np.round(R_hat,2))

Epoch: 0, Loss: 113548.4872, RMSE: 0.0034
Epoch: 100, Loss: 53524.1984, RMSE: 0.0023
Epoch: 200, Loss: 53516.6915, RMSE: 0.0023
Epoch: 300, Loss: 53129.4864, RMSE: 0.0023
Epoch: 400, Loss: 52910.0832, RMSE: 0.0023

예측된 평점 행렬
[[3.7  3.03 3.27 ... 1.32 2.7  2.99]
 [4.15 3.59 2.41 ... 1.17 2.87 3.02]
 [2.93 1.93 2.78 ... 0.98 1.76 2.14]
 ...
 [4.25 3.61 2.96 ... 1.31 2.98 2.87]
 [4.59 3.87 2.9  ... 1.11 3.07 2.68]
 [3.37 3.08 2.3  ... 1.23 2.41 2.15]]


#### 방법2. Tensorflow 이용
$$ \text{update rule} \\  b_u \leftarrow b_u + \eta*(e_{u,i}-\lambda b_u)\\ p_u \leftarrow p_u+\eta(e_{u,i}q_i-\lambda p_u) $$

In [8]:
import numpy as np
R = np.array(user_movie_matrix)
mask = (R>0).astype(np.float32)

In [11]:
import tensorflow as tf
class MF(tf.keras.models):
    R = np.array(user_movie_matrix)
    n_users,n_items = R.shape
    lr = 0.01
    K = 10
    reg_param = 0.01

    mask = (R>0).astype(np.float32) # Boolean 값을 float로 바꾸면 (0,1)로 표현됨됨

    P = tf.Variable(tf.random.normal([n_users,K],stddev=0.1),dtype=np.float32)
    Q = tf.Variable(tf.random.normal([n_items,K],stddev=0.1),dtype=np.float32)

    def predict():
        return tf.matmul(P,Q.T)
    def loss():
        pred = predict()
        error = tf.multiply(mask,(R-pred))
        loss = tf.reduce_sum(tf.square(error) / tf.reduce_sum(mask)) #Loss 함수만 제대로 정의하면, Gradient Descent는 따로 정의할 필요 X
        reg_loss = 

SyntaxError: invalid syntax (134692799.py, line 20)