In [1]:
import os
import ast
import json
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def extract_genre(x):
    """Return the name of the first genre in a genre list, or ``None``.

    Parameters
    ----------
    x : str | list | object
        Either a list whose first element is a mapping with a ``'name'`` key,
        or the string form of such a list (parsed with ``ast.literal_eval``).
        Any other value is treated as "no genre".

    Returns
    -------
    object | None
        ``x[0]['name']`` when it can be read; ``None`` when the input is not
        a non-empty list, cannot be parsed, or its first element has no
        ``'name'`` entry.
    """
    try:
        # A string is parsed into a Python literal first.
        parsed = ast.literal_eval(x) if isinstance(x, str) else x
        if isinstance(parsed, list) and parsed:
            return parsed[0]['name']
    except (ValueError, SyntaxError, IndexError, TypeError, KeyError, RecursionError):
        # KeyError: first element is a dict without a 'name' key.
        # RecursionError: literal_eval on pathologically nested input.
        # Both are treated like any other unparsable value instead of
        # aborting the whole DataFrame.apply call.
        return None
    return None

In [3]:
# Load the three raw CSV tables used by this notebook.
# `links` provides the imdbId -> movieId lookup used further down.
links = pd.read_csv('../Database/attributes/links.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
movies = pd.read_csv(
    '../Database/attributes/movies_metadata.csv',
    low_memory=False,
)
# Despite the variable name, this frame is read from ratings.csv.
users = pd.read_csv('../Database/attributes/ratings.csv')

In [4]:
# Keep only the columns used downstream and discard incomplete rows.
rating_columns = ['userId', 'movieId', 'rating']
movie_columns = ['imdb_id', 'genres', 'original_title', 'release_date', 'popularity', 'runtime', 'revenue']

users = users[rating_columns].dropna()
movies = (
    movies[movie_columns]
    .dropna()
    # Reduce each genre list to the name of its first entry
    # (extract_genre returns None when that is not possible) ...
    .assign(genres=lambda frame: frame['genres'].apply(extract_genre))
    # ... and drop the rows for which no genre was extracted.
    .dropna()
)

In [5]:
# Drop the first two characters of every IMDb id (presumably the "tt" prefix —
# confirm against the data) and convert the remainder to int so it can be
# looked up against links["imdbId"].
movies['imdb_id'] = movies['imdb_id'].str[2:].astype(int)

# Lookup Series: index = imdbId, values = movieId.
links_dict = links.set_index("imdbId")["movieId"]

# Translate imdb_id -> movieId; ids missing from `links` become NaN and are dropped.
movies["movieId"] = movies["imdb_id"].map(links_dict)
movies = movies.dropna()
movies["movieId"] = movies["movieId"].astype(int)

# Keep exactly one attribute row per movieId. This table later becomes the
# right-hand index of a left merge with the ratings; a repeated movieId would
# duplicate every rating of that movie, and identical rows could then land in
# both the train and the test split. keep="last" matches the attribute
# dictionary built later, where a later row overwrites an earlier one.
# No further dropna() is needed here: the frame is already NaN-free.
movies = (
    movies[['movieId', 'genres', 'original_title', 'release_date', 'popularity', 'runtime', 'revenue']]
    .drop_duplicates(subset="movieId", keep="last")
)

In [8]:
# Stage 0: replace release_date by the number of days between it and 2021-12-16.
movies_df_0 = movies.copy()
reference_day = pd.to_datetime("2021-12-16")
release_days = pd.to_datetime(movies_df_0["release_date"])
movies_df_0["release_date"] = (reference_day - release_days).dt.days

# Stage 1: integer-encode the genre label.
encoder = LabelEncoder()
movies_df_1 = movies_df_0.copy()
movies_df_1['genres'] = encoder.fit_transform(movies_df_1['genres'])

# Stage 2: the title is not used as a feature.
movies_df_2 = movies_df_1.drop(columns=['original_title'])

# Stage 3: min-max scale the numeric columns (genres is left as its integer
# code), then index the table by movieId with plain Python ints.
scaler = MinMaxScaler()
scaled_cols = ['release_date', 'popularity', 'runtime', 'revenue']
movies_df_3 = movies_df_2.copy()
movies_df_3[scaled_cols] = scaler.fit_transform(movies_df_3[scaled_cols].values)
movies_df_3 = movies_df_3.set_index('movieId', drop=True)
movies_df_3.index = movies_df_3.index.map(int)

In [9]:
# Map every movieId (as a plain int) to its first five attribute values, in
# column order. When a movieId occurs more than once, the last row wins.
att_dict = {
    int(movie_id): [attrs[0], attrs[1], attrs[2], attrs[3], attrs[4]]
    for movie_id, attrs in zip(movies_df_3.index, movies_df_3.values)
}

In [10]:
# Attach the movie attributes to every rating row. The left join keeps all
# ratings; the dropna() then removes rows with missing values, which includes
# ratings whose movieId has no entry in movies_df_3.
users_df_0 = users.copy()
df_merged = (
    users_df_0
    .merge(movies_df_3, left_on="movieId", right_index=True, how="left")
    .dropna()
)

In [11]:
# Hold out 10% of the merged rows as the test set (fixed seed).
df_4, test = train_test_split(df_merged, test_size=0.1, random_state=42)
df_4 = df_4.reset_index(drop=True)
test = test.reset_index(drop=True)

# Split the remaining 90% into train / validation (15% of it goes to validation).
x, y = df_4.drop(columns='rating'), df_4['rating']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)

# Features / target for the held-out test set.
x_test, y_test = test.drop(columns='rating'), test['rating']

In [21]:
# (user, item) pairs of the held-out test split, with the id columns renamed
# to 'user' and 'item'.
users_df_1 = x_test[['userId', 'movieId']].rename(
    columns={'userId': 'user', 'movieId': 'item'}
)

In [19]:
# # Splits to persist (train / validation / test)
# train_val_test_split = {
#     "x_train": x_train,
#     "y_train": y_train,
#     "x_val": x_val,
#     "y_val": y_val,
#     "x_test": x_test,
#     "y_test": y_test
# }
# 
# # Save the splits as a pickle file
# file_path = "../Database/train_val_test_for_moiverating.pkl"
# with open(file_path, "wb") as f:
#     pickle.dump(train_val_test_split, f)

In [23]:
# Write the (user, item) pairs to CSV; the row index is written as the first,
# unnamed column.
# NOTE(review): users_df_1 is derived from x_test, yet the file is called
# train_items.csv — confirm this is what the consumer of the file expects.
users_df_1.to_csv('../Database/train_items.csv', index=True)

In [14]:
# Persist the movieId -> attribute-list mapping as pretty-printed UTF-8 JSON.
# JSON object keys are always strings, so the integer movie ids are written
# as strings and must be converted back by whoever reads the file.
with open("../Database/Ml_item2att.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(att_dict, ensure_ascii=False, indent=4))

In [15]:
# user_dict = {}
# user_path = '../Database/users/'
# for i, filename in enumerate(os.listdir(user_path)):
#     user_dict[i] = pd.read_csv(user_path + filename)
# 
# user_df = pd.DataFrame()
# for i in range(len(os.listdir(user_path))):
#     user_dict[i]['user'] = i
#     user_df = pd.concat([user_df, user_dict[i]], axis=0)
# 
# user_df.columns = [
#     'Title',
#     'Content',
#     'Rating',
#     'Like',
#     'User'
# ]
# 
# # Title
# encoder = LabelEncoder()
# user_df['Title'] = encoder.fit_transform(user_df['Title'])
# 
# # Rating
# user_df['Rating'] = pd.to_numeric(user_df['Rating'], errors='coerce')
# 
# # user_df.loc[user_df['Rating'].isna(), :]
# user_df = user_df.dropna(how='any')
# 
# # Format Conversion
# df_0 = pd.DataFrame(data=user_df[['User', 'Title']].values, columns=['user', 'item'])
# df_1 = pd.DataFrame(data=user_df[['Title', 'Rating', 'Like']].values, columns=['Title', 'Att1', 'Att2'])
# df_1['Title'] = df_1['Title'].astype(int)
# df_1 = df_1.groupby('Title').mean()
# att_dict = {}
# 
# # [[row.values[0], row.values[1]] for row in df_1.iloc[x,:] for x in range(len(df_1))]
# 
# for i, row in enumerate(df_1.values):
#     att_dict[df_1.index[i]] = [row[0], row[1]]
# 
# att_dict = {int(key): value for key, value in att_dict.items()}
# 
# df_0.to_csv('../Database/train_items.csv')
# # att_dict.to_json('../Database/item2att.json')
# 
# import json
# 
# with open("../Database/Ml_item2att.json", "w", encoding="utf-8") as f:
#     json.dump(att_dict, f, ensure_ascii=False, indent=4)
# 
# print("JSON 파일 저장 완료!")