In [3]:
import time
import argparse
import pandas as pd
import numpy as np
import json
import pickle 
import re


from src.utils import Logger, Setting, models_load
from src.data import process_context_data, context_data_load
from src.train import train, test

import torch
import torch.nn as nn
import torch.optim as optim
import copy
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, Dataset

from sklearn.model_selection import train_test_split

import copy
import json

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [4]:
train = pd.read_csv('~/data/inters4.csv')
users = pd.read_csv('~/data/users4.csv')
books = pd.read_csv('~/data/books4.csv')
test = pd.read_csv('~/data/test_ratings.csv')
sub = pd.read_csv('~/data/sample_submission.csv')

ids = pd.concat([train['user_id'], sub['user_id']]).unique()
isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

idx2user = {idx:id for idx, id in enumerate(ids)}
idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}

user2idx = {id:idx for idx, id in idx2user.items()}
isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

train['user_id'] = train['user_id'].map(user2idx)
sub['user_id'] = sub['user_id'].map(user2idx)
test['user_id'] = test['user_id'].map(user2idx)
users['user_id'] = users['user_id'].map(user2idx)

train['isbn'] = train['isbn'].map(isbn2idx)
sub['isbn'] = sub['isbn'].map(isbn2idx)
test['isbn'] = test['isbn'].map(isbn2idx)
books['isbn'] = books['isbn'].map(isbn2idx)

In [5]:
books.columns

Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path'],
      dtype='object')

In [6]:
def process_text(text):
    if not isinstance(text, str):
        return text
    text = text.strip().lower()
    text = re.sub(r'\s+', '', text)  # 모든 연속 공백 문자를 제거합니다.
    return text

In [7]:
books = books[['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path']]
books = (pd.concat([books, books['category'].str.replace(r'[^0-9a-zA-Z:,]+', '', regex=True).str.split(',', expand=True)], axis=1).drop(['category',1,2,3,4], axis=1).rename(columns={0:'category'}))

# 다시 한 번 lower 등의 오탈자 제거를 해줍니다.
columns_to_modify = ['category', 'publisher', 'language', 'book_author']
for column_name in columns_to_modify:
    books[column_name] = books[column_name].apply(process_text)

books2 = books[['publisher', 'language']]
# publisher와 language 별로 그룹화하여 카운트 계산
publisher_language_count = books2.groupby(['publisher', 'language']).size().reset_index(name='count')

# publisher별로 가장 많은 언어 찾기
max_language_by_publisher = publisher_language_count.loc[publisher_language_count.groupby('publisher')['count'].idxmax()]

# books 데이터프레임의 language 결측치 행 찾기
missing_language_rows = books[books['language'].isna()]

# 결측치를 채우기 위한 함수 정의
def fill_missing_language(row):
    publisher = row['publisher']
    if publisher in max_language_by_publisher['publisher'].values:
        return max_language_by_publisher.loc[max_language_by_publisher['publisher'] == publisher, 'language'].iloc[0]
    else:
        return 'en'

# 결측치를 채우기 위해 apply 함수 사용
books.loc[books['language'].isna(), 'language'] = missing_language_rows.apply(fill_missing_language, axis=1)

books3 = books[['category', 'book_author']]
book_author_category_count = books3.groupby(['category', 'book_author']).size().reset_index(name='count')

# 저자별로 가장 많은 카테고리 찾기
max_category_by_author = book_author_category_count.loc[book_author_category_count.groupby('book_author')['count'].idxmax()]

# books 데이터프레임의 category 결측치 행 찾기
missing_category_rows = books[books['category'].isna()]

# 결측치를 채우기 위한 함수 정의
def fill_missing_category(row):
    author = row['book_author']
    if author in max_category_by_author['book_author'].values:
        return max_category_by_author.loc[max_category_by_author['book_author'] == author, 'category'].iloc[0]
    else:
        return np.nan

# 결측치를 채우기 위해 apply 함수 사용
books.loc[books['category'].isna(), 'category'] = missing_category_rows.apply(fill_missing_category, axis=1)

In [8]:
train.head()

Unnamed: 0,user_id,isbn,rating
0,0,0,4
1,1,0,7
2,2,0,8
3,3,0,8
4,4,0,9


In [9]:
test.head()

Unnamed: 0,user_id,isbn,rating
0,13,0,0
1,13426,0,0
2,26761,1,0
3,16495,2,0
4,6225,3,0


In [10]:
users.head()

Unnamed: 0,user_id,age,location_country
0,0.0,,canada
1,2334.0,49.0,canada
2,13.0,,usa-california
3,1.0,30.0,canada
4,4789.0,36.0,canada


In [11]:
ratings = pd.concat([train, test]).reset_index(drop=True)
context_df = ratings.merge(users, on='user_id', how='left').merge(books, on='isbn', how='left')
print(context_df.shape)

with open('preprocessed_context_df_with_nan_age.pkl', 'wb') as file:
    pickle.dump(context_df, file)

(383494, 14)


In [34]:

with open('preprocessed_context_df_with_nan_age.pkl', 'rb') as file:
    context_df = pickle.load(file)
    


In [35]:
context_df.head()

Unnamed: 0,user_id,isbn,rating,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category
0,0,0,4,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
1,1,0,7,30.0,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
2,2,0,8,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
3,3,0,8,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
4,4,0,9,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses


In [36]:
df1 = context_df.drop(['rating'], axis=1)
df1.head()

Unnamed: 0,user_id,isbn,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category
0,0,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
1,1,0,30.0,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
2,2,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
3,3,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
4,4,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses


In [37]:
df1['category'] = df1['category'].fillna('None')
df1.isna().sum()

user_id                     0
isbn                        0
age                    115787
location_country            0
book_title                  0
book_author                 0
year_of_publication         0
publisher                   0
img_url                     0
language                    0
summary                     4
img_path                    0
category                    0
dtype: int64

In [38]:
df_no_missing = df1.dropna()
missing_age_rows_index = df1[df1['age'].isna()].index
missing_age_data = df1.loc[missing_age_rows_index].drop(['user_id'], axis=1)
missing_age_data.head()

Unnamed: 0,isbn,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category
0,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
2,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
3,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
4,0,,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
12,2,,usa-arizona,flu the story of the great influenza pandemic ...,gbkolata,1999.0,farrarstrausgiroux,http://images.amazon.com/images/P/0374157065.0...,en,describes the great flu epidemic of 1918 an ou...,images/0374157065.01.THUMBZZZ.jpg,medical


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

In [40]:
X_train_cat, X_valid_cat, y_train_cat, y_valid_cat = train_test_split(df_no_missing.drop(['user_id', 'img_url', 'summary','age'], axis=1), df_no_missing.drop(['user_id', 'img_url', 'summary'], axis=1)['age'], test_size=0.1, shuffle=True, random_state=42)
X_test_cat = missing_age_data.drop(['age','img_url','summary'],axis=1)
cat_list = [x for x in X_train_cat.columns.tolist() if x not in ['age', 'year_of_publication']]

In [41]:
X_train_cat.head()

Unnamed: 0,isbn,location_country,book_title,book_author,year_of_publication,publisher,language,img_path,category
102086,11115,usa-oregon,silent night a christmas suspense story,mhclark,1996.0,pocket,en,images/067100042X.01.THUMBZZZ.jpg,fiction
53673,3955,usa-alabama,wish you well,dbaldacci,2001.0,warnerbooks,en,images/0446610100.01.THUMBZZZ.jpg,fiction
74383,6769,usa-virginia,funny in farsi a memoir of growing up iranian ...,fdumas,2004.0,randomhousetradepaperbacks,en,images/0812968379.01.THUMBZZZ.jpg,biographyautobiography
194763,43507,usa-iowa,uppity women of ancient times,vleon,1995.0,conaripress,en,images/1573240109.01.THUMBZZZ.jpg,history
105218,11766,usa-california,wish you were here mrs murphy mysteries paperb...,rmbrown,1991.0,bantam,en,images/0553287532.01.THUMBZZZ.jpg,fiction


In [42]:
X_test_cat.head()

Unnamed: 0,isbn,location_country,book_title,book_author,year_of_publication,publisher,language,img_path,category
0,0,canada,clara callan,rbwright,2001.0,harperflamingocanada,en,images/0002005018.01.THUMBZZZ.jpg,actresses
2,0,canada,clara callan,rbwright,2001.0,harperflamingocanada,en,images/0002005018.01.THUMBZZZ.jpg,actresses
3,0,canada,clara callan,rbwright,2001.0,harperflamingocanada,en,images/0002005018.01.THUMBZZZ.jpg,actresses
4,0,canada,clara callan,rbwright,2001.0,harperflamingocanada,en,images/0002005018.01.THUMBZZZ.jpg,actresses
12,2,usa-arizona,flu the story of the great influenza pandemic ...,gbkolata,1999.0,farrarstrausgiroux,en,images/0374157065.01.THUMBZZZ.jpg,medical


In [43]:
catboost_reg = CatBoostRegressor(
            iterations= 3000,
            loss_function= 'RMSE',  # 회귀 문제를 위한 손실 함수로 RMSE를 사용합니다.
            eval_metric= 'RMSE',  # 평가 지표로 RMSE를 사용합니다.
            verbose= 200,
            early_stopping_rounds= 200,
            cat_features= cat_list,
            task_type= 'GPU',  # GPU를 사용하여 학습합니다.
            devices= '0:1',  # 0번과 1번 장치를 사용합니다.
            learning_rate = 0.12,
            depth = 10,
            use_best_model = True,
            dev_score_calc_obj_block_size = 5000000
        )

catboost_reg.fit(
            X_train_cat,
            y_train_cat,
            eval_set=(X_valid_cat, y_valid_cat),  # 검증 세트를 지정하여 모델의 성능을 평가합니다.
            use_best_model = True,
        )

0:	learn: 12.1027474	test: 12.0538876	best: 12.0538876 (0)	total: 61.8ms	remaining: 3m 5s
200:	learn: 10.6825552	test: 10.8446516	best: 10.8446516 (200)	total: 7.36s	remaining: 1m 42s
400:	learn: 10.3436025	test: 10.8311754	best: 10.8309068 (399)	total: 14.9s	remaining: 1m 36s
bestTest = 10.83090683
bestIteration = 399
Shrink model to first 400 iterations.


<catboost.core.CatBoostRegressor at 0x7fa9a2e2ffa0>

In [44]:
y_last = catboost_reg.predict(X_test_cat)

In [45]:
y_last_df = pd.DataFrame(y_last, index=missing_age_rows_index, columns=['age'])
context_df.update(y_last_df)

with open('age_filled_context_df.pkl', 'wb') as file:
    pickle.dump(context_df, file)


In [46]:
with open('age_filled_context_df.pkl', 'rb') as file:
    context_df = pickle.load(file)

In [47]:
context_df.head()

Unnamed: 0,user_id,isbn,rating,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category
0,0,0,4,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
1,1,0,7,30.0,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
2,2,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
3,3,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses
4,4,0,9,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,actresses


In [48]:
context_df.shape

(383494, 14)

In [49]:
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

def image_vector(path):
    
    path = '/opt/ml/data/' + path
    img = Image.open(path)
    scale = transforms.Resize((32, 32))
    tensor = transforms.ToTensor()
    img_fe = Variable(tensor(scale(img)))
    
    return np.array(img_fe)

context_df['img_path'] = context_df['img_path'].apply(image_vector)
    


In [50]:
import torch

def numpy_arrays_to_combined_tensor(numpy_arrays):
    tensor_list = []
    for arr in numpy_arrays:
        if arr.shape[0] == 1:  # 흑백 이미지일 경우
            arr = np.repeat(arr, 3, axis=0)  # 채널 수를 3으로 만듭니다.
        tensor_list.append(torch.from_numpy(arr))
    combined_tensor = torch.stack(tensor_list, dim=0)
    return combined_tensor

# DataFrame에서 이미지 NumPy 배열을 가져옵니다.
image_numpy_arrays = context_df['img_path'].tolist()

# 이미지 NumPy 배열을 하나의 텐서로 결합합니다.
combined_tensor = numpy_arrays_to_combined_tensor(image_numpy_arrays)

print("Combined tensor shape:", combined_tensor.shape)

Combined tensor shape: torch.Size([383494, 3, 32, 32])


In [52]:
class AutoRec(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoRec, self).__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.encoder(x)
        x = self.relu(x)
        x = self.decoder(x)
        x = self.sigmoid(x)
        return x
    
def train_autorec(model, data, epochs, batch_size, learning_rate, device):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    data_tensor = torch.tensor(data, dtype=torch.float32).to(device)

    for epoch in range(epochs):
        permutation = torch.randperm(data_tensor.size()[0])
        for i in range(0, data_tensor.size()[0], batch_size):
            optimizer.zero_grad()
            indices = permutation[i:i + batch_size]
            batch = data_tensor[indices]

            output = model(batch)
            loss = criterion(output, batch)
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

input_size = 3 * 32 * 32
hidden_size = 10
num_data = 383494
epochs = 30
batch_size = 256
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 임의의 데이터 생성
data = combined_tensor
data = data.reshape(num_data, -1)

# 모델 초기화 및 GPU 사용
autorec = AutoRec(input_size, hidden_size).to(device)

# 모델 학습
train_autorec(autorec, data, epochs, batch_size, learning_rate, device)

  data_tensor = torch.tensor(data, dtype=torch.float32).to(device)


Epoch [1/30], Loss: 0.1302
Epoch [2/30], Loss: 0.0894
Epoch [3/30], Loss: 0.0852
Epoch [4/30], Loss: 0.0969
Epoch [5/30], Loss: 0.1314
Epoch [6/30], Loss: 0.0967
Epoch [7/30], Loss: 0.0876
Epoch [8/30], Loss: 0.0684
Epoch [9/30], Loss: 0.0895
Epoch [10/30], Loss: 0.1160
Epoch [11/30], Loss: 0.0855
Epoch [12/30], Loss: 0.0749
Epoch [13/30], Loss: 0.0904
Epoch [14/30], Loss: 0.0879
Epoch [15/30], Loss: 0.1137
Epoch [16/30], Loss: 0.1095
Epoch [17/30], Loss: 0.1067
Epoch [18/30], Loss: 0.1014
Epoch [19/30], Loss: 0.0857
Epoch [20/30], Loss: 0.0851
Epoch [21/30], Loss: 0.1152
Epoch [22/30], Loss: 0.0974
Epoch [23/30], Loss: 0.0959
Epoch [24/30], Loss: 0.0754
Epoch [25/30], Loss: 0.1503
Epoch [26/30], Loss: 0.0957
Epoch [27/30], Loss: 0.0929
Epoch [28/30], Loss: 0.0669
Epoch [29/30], Loss: 0.0546
Epoch [30/30], Loss: 0.1133


In [53]:
def encode_data(model, data, device):
    data_tensor = torch.tensor(data, dtype=torch.float32).to(device)
    encoded_data = model.encoder(data_tensor)
    return encoded_data.detach().cpu().numpy()

encoded_vectors = encode_data(autorec, data, device)
print("Encoded vectors shape:", encoded_vectors.shape)

  data_tensor = torch.tensor(data, dtype=torch.float32).to(device)


Encoded vectors shape: (383494, 10)


In [54]:
# 먼저 encoded_vectors를 DataFrame으로 변환합니다.
encoded_vectors_df = pd.DataFrame(encoded_vectors, columns=[f'feature_{i}' for i in range(encoded_vectors.shape[1])])

# context_df의 img_path 열을 encoded_vectors_df로 교체합니다.
context_df = pd.concat([context_df.drop(columns=['img_path']), encoded_vectors_df], axis=1)

context_df.head()

Unnamed: 0,user_id,isbn,rating,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,...,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,0,0,4,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
1,1,0,7,30.0,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
2,2,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
3,3,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
4,4,0,9,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344


In [55]:
with open('encoded_vector_filled_context_df.pkl', 'wb') as file:
    pickle.dump(context_df, file)

In [71]:
with open('encoded_vector_filled_context_df.pkl', 'rb') as file:
    context_df = pickle.load(file)

In [72]:
context_df.head()

Unnamed: 0,user_id,isbn,rating,age,location_country,book_title,book_author,year_of_publication,publisher,img_url,...,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,0,0,4,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
1,1,0,7,30.0,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
2,2,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
3,3,0,8,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
4,4,0,9,36.307149,canada,clara callan,rbwright,2001.0,harperflamingocanada,http://images.amazon.com/images/P/0002005018.0...,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344


In [78]:
context_df.columns

Index(['user_id', 'isbn', 'rating', 'age', 'location_country', 'book_title',
       'book_author', 'year_of_publication', 'publisher', 'img_url',
       'language', 'summary', 'category', 'feature_0', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'feature_9'],
      dtype='object')

In [79]:
# 혹시 모르니 인코딩해주는 함수. 이때 위에 있는 context_df를 보고, 카테고리화 해 줄 컬럼을 직접 적는다.
for col in ['user_id', 'isbn', 'location_country', 'book_title', 'book_author', 'year_of_publication', 'publisher'
            , 'category']:
    col2idx = {value: idx for idx, value in enumerate(context_df[col].unique())}
    context_df[col] = context_df[col].map(col2idx)

context_df = context_df.drop(['img_url','summary'], axis=1)
    

train_df = context_df.iloc[:len(train), :]
test_df = context_df.iloc[len(train):, :]

data_cat = {}
data_cat['train'] = train_df
data_cat['test'] = test_df

In [80]:
data_cat['train'].head()

Unnamed: 0,user_id,isbn,rating,age,location_country,book_title,book_author,year_of_publication,publisher,language,...,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,0,0,4,36.307149,0,0,0,0,0,en,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
1,1,0,7,30.0,0,0,0,0,0,en,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
2,2,0,8,36.307149,0,0,0,0,0,en,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
3,3,0,8,36.307149,0,0,0,0,0,en,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344
4,4,0,9,36.307149,0,0,0,0,0,en,...,-8.298337,-8.343067,-8.453951,-8.627418,-8.342608,-8.436244,-8.190323,-6.870789,-8.412859,-8.051344


In [81]:
with open('encoded_vector_filled_data_fin.pkl', 'wb') as file:
    pickle.dump(data_cat, file)

In [70]:
#CatBoost_encoded_vector에서 보려고 쓰는 것
context_df.columns

Index(['user_id', 'isbn', 'rating', 'age', 'location_country', 'book_title',
       'book_author', 'year_of_publication', 'publisher', 'language',
       'category', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8',
       'feature_9'],
      dtype='object')

In [90]:
'''#클러스터링 수행하는 함수

from sklearn.cluster import KMeans

# context_df에서 필요한 열만 선택하여 새로운 DataFrame을 생성합니다.
columns_to_select = ['user_id', 'rating', 'age'] + [f'feature_{i}' for i in range(10)]
selected_df = context_df[columns_to_select]

# K-means 클러스터링을 수행합니다.
num_clusters = 10  # 클러스터 수를 설정합니다.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
selected_df['cluster'] = kmeans.fit_predict(selected_df)

# context_df에서 feature_0부터 feature_9까지의 열을 삭제합니다.
columns_to_drop = [f'feature_{i}' for i in range(10)]
context_df = context_df.drop(columns=columns_to_drop)

# context_df에 cluster 열을 추가합니다.
context_df['cluster'] = selected_df['cluster']

with open('img_cluster_finished.pkl', 'wb') as file:
    pickle.dump(context_df, file)

with open('img_cluster_finished.pkl', 'rb') as file:
    context_df = pickle.load(file)

#활용 안할 column 지우기
context_df = context_df.drop(['summary', 'img_url'], axis=1)

# age 매핑함수
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6
    
context_df['age'] = context_df['age'].apply(age_map)'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['cluster'] = kmeans.fit_predict(selected_df)


Unnamed: 0,user_id,rating,age,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,cluster
0,0,4,36.307395,-6.861934,-8.435457,-8.262573,-7.769368,-8.153619,-8.275292,-7.86638,-8.344859,-8.358066,-8.637857,1
1,1,7,30.0,-6.861934,-8.435457,-8.262573,-7.769368,-8.153619,-8.275292,-7.86638,-8.344859,-8.358066,-8.637857,1
2,2,8,36.307395,-6.861934,-8.435457,-8.262573,-7.769368,-8.153619,-8.275292,-7.86638,-8.344859,-8.358066,-8.637857,1
3,3,8,36.307395,-6.861934,-8.435457,-8.262573,-7.769368,-8.153619,-8.275292,-7.86638,-8.344859,-8.358066,-8.637857,1
4,4,9,36.307395,-6.861934,-8.435457,-8.262573,-7.769368,-8.153619,-8.275292,-7.86638,-8.344859,-8.358066,-8.637857,1
