# 準備

In [70]:
import pandas as pd
import numpy as np
import json
import joblib

from sklearn.decomposition import PCA

from tqdm import tqdm
tqdm.pandas()

In [71]:
movie_df = pd.read_csv('../../../data/movies_df.csv')
rating_df = pd.read_csv('../../../data/Eval.csv')

In [72]:
np.random.seed(42)

# データ

### リスト作成
- ユーザーID、映画ID、ジャンルのリストを作成
- 映画ごとのジャンルのデータをワンホットで作成

In [73]:
user_list = rating_df['SessionId'].unique().tolist()
user_list[0:3]

[119, 142, 156]

In [74]:
item_list = sorted(rating_df['ItemId'].unique())
item_list[0:3]

[5, 11, 12]

In [75]:
selected_columns = movie_df.filter(regex='^(tmdbId|genre_)')
for col in ['genre_names', 'genre_ids']:
    if col in selected_columns:
        selected_columns = selected_columns.drop(columns=[col])
selected_columns.columns = selected_columns.columns.str.replace('genre_', '')
movie_genres_onehot = selected_columns.rename(columns={'tmdbId': 'ItemId'})
movie_genres_onehot.head(3)

Unnamed: 0,ItemId,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,3,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
columns = movie_genres_onehot.columns.tolist()
genre_list = [col for col in columns if col != 'ItemId']
genre_list[0:3]

['Action', 'Adventure', 'Animation']

### 

### データフレームを作成
- ユーザーごとの視聴したジャンルをカウント
- ユーザーが視聴した映画のジャンルの上位3つを取得しワンホットで保存

In [77]:
merged_df = pd.merge(rating_df, movie_genres_onehot, on='ItemId')
user_genre_count = merged_df.groupby('SessionId')[genre_list].sum()
user_genre_count.head(3)

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,29,35,2,15,9,0,15,7,26,2,15,0,4,3,17,0,16,3,1
2,14,14,0,2,2,0,11,1,0,2,5,1,3,1,16,0,10,5,2
3,36,38,2,29,20,0,41,13,10,3,16,2,14,12,59,0,38,6,2


In [78]:
def select_top_3_genres(row):
    random_tiebreaker = np.random.rand(len(row))
    top_3_indices = (row + random_tiebreaker).nlargest(3).index
    new_row = pd.Series(0, index=row.index)
    new_row[top_3_indices] = 1
    return new_row

user_genre_matrix_df = user_genre_count.progress_apply(select_top_3_genres, axis=1)
user_genre_matrix_df.head(3)
print("Shape of user-genre matrix:", user_genre_matrix_df.shape)

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0


In [88]:
user_movie_matrix_df = rating_df.pivot_table(index='SessionId', columns='ItemId', aggfunc='size', fill_value=0)
user_movie_matrix_df.head(3)

ItemId,5,11,12,13,14,15,16,18,19,20,...,294690,295315,297222,297596,297608,302376,304023,306650,307663,326359
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
first_row = user_movie_matrix_df.loc[1]

count_zeros = (first_row == 0).sum()
count_ones = (first_row == 1).sum()

print("Number of 0s on 1st row:", count_zeros)
print("Number of 1s on 1st row:", count_ones)

Number of 0s on 1st row: 4645
Number of 1s on 1st row: 72


### PCAを実行
- PCAで次元削減を実行

In [43]:
n_components = 100  
pca = PCA(n_components=n_components)
user_movie_matrix_df_pca = pca.fit_transform(user_movie_matrix_df)
print("Shape of user-movie matrix after pca:", user_movie_matrix_df_pca.shape)

(137872, 100)


In [45]:
# joblib.dump(pca, 'pca_model.pkl')
# pca_loaded = joblib.load('pca_model.pkl')

In [46]:
# pca_loaded = joblib.load('pca_model.pkl')
# new_user_history_pca = pca_loaded.transform(movie_matrix)

### マトリックスを作成
- マトリックスをfloat64で作成する

In [52]:
user_genre_matrix = user_genre_matrix_df.to_numpy().astype('float64')
print(user_genre_matrix)

[[1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
(137872, 19)


In [53]:
user_movie_matrix = user_movie_matrix_df_pca.to_numpy().astype('float64')
print(user_movie_matrix)

[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]
(137872, 4717)


### データの長さを確認

In [51]:
num_users = len(user_list)
num_genres = len(genre_list)
num_items = n_components

print("Number of users:", num_users)
print("Number of genres:", num_genres)
print("Number of items:", num_items)

137872
19
4717


# モデル

In [None]:
class LinUCB:
    def __init__(self, alpha, num_users, num_genres, num_items):
        self.alpha = alpha
        self.num_users = num_users
        self.num_genres = num_genres
        self.num_items = num_items
        self.d = num_genres + num_items
        self.A = np.repeat(np.identity(self.d, dtype=np.float32)[np.newaxis, :, :], num_genres, axis=0)
        self.b = np.zeros((num_genres, self.d), dtype=np.float32)

    def fit(self, user_genre_matrix, user_movie_matrix, num_epochs, batch_size=50):
        avg_rewards = []
        for epoch in tqdm(range(num_epochs)):
            rewards = []
            A_a_inv = np.array([np.linalg.inv(self.A[a]) for a in range(self.num_genres)])

            for batch_start in range(0, self.num_users, batch_size):
                batch_end = min(batch_start + batch_size, self.num_users)
                batch_user_features = np.concatenate((user_genre_matrix[batch_start:batch_end], 
                                                      user_movie_matrix[batch_start:batch_end]), axis=1)
                
                for a in range(self.num_genres):
                    theta_a = A_a_inv[a].dot(self.b[a])
                    p_t_batch = batch_user_features.dot(theta_a) + \
                                self.alpha * np.sqrt(np.sum(batch_user_features.dot(A_a_inv[a]) * batch_user_features, axis=1))
                    
                    for i, user_id in enumerate(range(batch_start, batch_end)):
                        a_t = a if p_t_batch[i] == max(p_t_batch) else None
                        if a_t is not None:
                            r_t = 1 if user_genre_matrix[user_id, a_t] == 1 else 0
                            rewards.append(r_t)

                            x_t_at = batch_user_features[i].reshape(-1, 1)
                            self.A[a_t] = self.A[a_t] + x_t_at.dot(x_t_at.T)
                            self.b[a_t] = self.b[a_t] + r_t * x_t_at.flatten()

                            A_a_inv[a_t] = np.linalg.inv(self.A[a_t])

            avg_rewards.append(np.mean(rewards))

        return avg_rewards

    def predict(self, user_features, context_features):
        p_t = np.zeros(self.num_genres)
        
        for genre_id in range(self.num_genres):
            user_features_vector = user_features.reshape(-1)
            context_features_vector = context_features.reshape(-1)
        
            combined_features = np.concatenate((user_features_vector, context_features_vector))
        
            x_ta = combined_features.reshape(-1, 1)
            A_a_inv = np.linalg.inv(self.A[genre_id])
            theta_a = A_a_inv.dot(self.b[genre_id])
        
            p_t[genre_id] = theta_a.T.dot(x_ta) + self.alpha * np.sqrt(x_ta.T.dot(A_a_inv).dot(x_ta))
    
        recommended_genres = np.argsort(-p_t)
        return recommended_genres

# 学習

In [19]:
alpha = 1.0
num_epochs = 30
batch_size = 10

In [20]:
linucb_model = LinUCB(alpha=alpha, num_users=num_users, num_genres=num_genres, num_items=num_items)
avg_rewards = linucb_model.fit(user_genre_matrix, user_movie_matrix, num_epochs=num_epochs, batch_size=batch_size)

start epoch 0
start epoch 1
start epoch 2
start epoch 3
start epoch 4
start epoch 5
start epoch 6
start epoch 7
start epoch 8
start epoch 9
start epoch 10
start epoch 11
start epoch 12
start epoch 13
start epoch 14
start epoch 15
start epoch 16
start epoch 17
start epoch 18
start epoch 19
start epoch 20
start epoch 21
start epoch 22
start epoch 23
start epoch 24
start epoch 25
start epoch 26
start epoch 27
start epoch 28
start epoch 29


# 結果

In [21]:
avg_rewards

[0.1631578947368421,
 0.14736842105263157,
 0.1736842105263158,
 0.1736842105263158,
 0.1736842105263158,
 0.15263157894736842,
 0.16842105263157894,
 0.15263157894736842,
 0.1631578947368421,
 0.15263157894736842,
 0.49473684210526314,
 0.4842105263157895,
 0.5052631578947369,
 0.5,
 0.5052631578947369,
 0.5,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369,
 0.5052631578947369]

# 予測

In [22]:
selected_user_row = 0

In [23]:
selected_user_features = user_genre_matrix[selected_user_row]
print('User features:', selected_user_features)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
      dtype=int64)

In [24]:
selected_contex_features = user_movie_matrix[selected_user_row]
print('Contex features:', selected_contex_features)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
predicted_items = linucb_model.predict(selected_user_features, selected_contex_features)
print(predicted_items)

[ 6 16 17 14  3  0  8 13  1  4  9  2 12 10 11  5 15 18  7]


In [18]:
unique_session_ids = np.random.choice(rating_matrix.index.unique(), 100, replace=False)
print(len(unique_session_ids))

100


In [19]:
filtered_rating_matrix = rating_matrix.loc[unique_session_ids]
filtered_movie_matrix = movie_matrix.loc[unique_session_ids]

In [20]:
item_counts = filtered_movie_matrix.sum()

In [21]:
top_100_items = item_counts.nlargest(100)

In [22]:
top_100_items

ItemId
680    38
278    36
424    32
11     31
13     30
       ..
154    11
155    11
235    11
429    11
492    11
Length: 100, dtype: int64

In [23]:
top_100_columns = top_100_items.index

In [24]:
filtered_movie_matrix = filtered_movie_matrix[top_100_columns]

In [25]:
filtered_movie_matrix

ItemId,680,278,424,11,13,85,274,1891,862,197,...,36657,24,77,98,101,154,155,235,429,492
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45986,0,0,1,1,0,1,0,1,1,1,...,0,0,0,1,0,1,0,0,0,1
96028,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
92108,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
71071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138106,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
81468,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
20742,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
86915,0,1,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
file_path = 'rating_matrix.csv'
filtered_rating_matrix.to_csv(file_path)

In [27]:
file_path = 'movie_matrix.csv'
filtered_movie_matrix.to_csv(file_path)

In [30]:
filtered_rating_matrix

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
51997,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0
45986,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
96028,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
92108,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
71071,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138106,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
81468,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
20742,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
86915,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


In [31]:
filtered_movie_matrix

ItemId,680,278,424,11,13,85,274,1891,862,197,...,36657,24,77,98,101,154,155,235,429,492
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45986,0,0,1,1,0,1,0,1,1,1,...,0,0,0,1,0,1,0,0,0,1
96028,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
92108,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
71071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138106,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
81468,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
20742,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
86915,0,1,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [38]:
genre_dict = {col: idx for idx, col in enumerate(filtered_rating_matrix.columns)}

with open('genre_dict.json', 'w') as file:
    json.dump(genre_dict, file)

In [39]:
genre_dict

{'Action': 0,
 'Adventure': 1,
 'Animation': 2,
 'Comedy': 3,
 'Crime': 4,
 'Documentary': 5,
 'Drama': 6,
 'Family': 7,
 'Fantasy': 8,
 'History': 9,
 'Horror': 10,
 'Music': 11,
 'Mystery': 12,
 'Romance': 13,
 'Science Fiction': 14,
 'TV Movie': 15,
 'Thriller': 16,
 'War': 17,
 'Western': 18}

In [40]:
movie_dict = {col: idx for idx, col in enumerate(filtered_movie_matrix.columns)}

with open('movie_dict.json', 'w') as file:
    json.dump(movie_dict, file)

In [41]:
movie_dict

{680: 0,
 278: 1,
 424: 2,
 11: 3,
 13: 4,
 85: 5,
 274: 6,
 1891: 7,
 862: 8,
 197: 9,
 603: 10,
 120: 11,
 5503: 12,
 629: 13,
 807: 14,
 1892: 15,
 14: 16,
 121: 17,
 275: 18,
 329: 19,
 2493: 20,
 78: 21,
 238: 22,
 812: 23,
 857: 24,
 568: 25,
 8587: 26,
 63: 27,
 122: 28,
 762: 29,
 9598: 30,
 289: 31,
 550: 32,
 935: 33,
 500: 34,
 630: 35,
 639: 36,
 712: 37,
 745: 38,
 10020: 39,
 103: 40,
 105: 41,
 115: 42,
 240: 43,
 510: 44,
 562: 45,
 607: 46,
 620: 47,
 62: 48,
 89: 49,
 268: 50,
 280: 51,
 642: 52,
 769: 53,
 11072: 54,
 36955: 55,
 28: 56,
 137: 57,
 194: 58,
 393: 59,
 581: 60,
 600: 61,
 627: 62,
 1934: 63,
 9427: 64,
 15: 65,
 146: 66,
 168: 67,
 218: 68,
 348: 69,
 378: 70,
 387: 71,
 389: 72,
 409: 73,
 539: 74,
 583: 75,
 585: 76,
 628: 77,
 651: 78,
 679: 79,
 792: 80,
 813: 81,
 858: 82,
 963: 83,
 1585: 84,
 2118: 85,
 4584: 86,
 8012: 87,
 8963: 88,
 9331: 89,
 36657: 90,
 24: 91,
 77: 92,
 98: 93,
 101: 94,
 154: 95,
 155: 96,
 235: 97,
 429: 98,
 492: 99}