In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Load the training frame first, then the test frame, from the working directory.
data, test = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))

In [None]:
def _merge_features(frame, features, key):
    """Left-join ``features`` (indexed by ``key``) onto ``frame``.

    Columns of ``frame`` that share a name with a feature column are dropped
    first, so calling the pipeline again on an already-augmented frame
    refreshes the features instead of creating ``_x``/``_y`` duplicates.
    """
    stale = [col for col in features.columns if col in frame.columns]
    return pd.merge(frame.drop(columns=stale), features, on=key, how='left')


def _attach_avg_count(frame, user_stats, movie_stats, default_rating):
    """Add the per-user and per-movie mean/count columns to ``frame``."""
    frame = _merge_features(frame, user_stats, 'user')
    frame = _merge_features(frame, movie_stats, 'movie')
    # Keys that never occur in the training data get the default mean and a zero count.
    frame = frame.fillna({
        'user_avg_rating': default_rating,
        'user_count': 0,
        'movie_avg_rating': default_rating,
        'movie_count': 0,
    })
    return frame.astype({'user_count': int, 'movie_count': int})


def _relative_rate(frame, key, baseline_col, out_col):
    """Per-``key`` sum, count and mean of ``rating - baseline_col``.

    Returns a DataFrame indexed by ``key`` with columns ``sum``, ``count``
    and ``out_col``.
    """
    deviation = frame['rating'] - frame[baseline_col]
    # sort=False keeps the groups in order of first appearance.
    stats = deviation.groupby(frame[key], sort=False).agg(['sum', 'count'])
    stats[out_col] = stats['sum'] / stats['count']
    stats.index.name = key
    return stats


def _attach_relative(frame, user_relative_rate, movie_relative_rate):
    """Add ``user_relative`` / ``movie_relative`` to ``frame`` (0 for unseen keys)."""
    frame = _merge_features(frame, user_relative_rate[['user_relative']], 'user')
    frame = _merge_features(frame, movie_relative_rate[['movie_relative']], 'movie')
    return frame.fillna({'user_relative': 0, 'movie_relative': 0})


def add_avg_count(data, test, default_rating=3):
    """Augment the train and test frames with rating statistics from ``data``.

    All statistics are computed from ``data`` only and then joined onto both
    frames by the ``user`` and ``movie`` columns.

    Added columns:
        user_avg_rating, user_count   -- mean and number of ratings per user
        movie_avg_rating, movie_count -- mean and number of ratings per movie
        user_relative  -- per user, mean of (rating - movie_avg_rating)
        movie_relative -- per movie, mean of (rating - user_avg_rating)

    Args:
        data: training frame with ``user``, ``movie`` and ``rating`` columns.
        test: frame with ``user`` and ``movie`` columns.
        default_rating: mean rating assigned to users/movies that do not occur
            in ``data`` (their counts become 0 and their relative rates 0).

    Returns:
        ``(data, test, (user_avg_rate, user_relative_rate, movie_avg_rate,
        movie_relative_rate))``. The ``*_avg_rate`` items are dicts of the form
        ``{key: {'<kind>_avg_rating': ..., '<kind>_count': ...}}``; the
        ``*_relative_rate`` items are DataFrames indexed by the key with
        columns ``sum``, ``count`` and ``<kind>_relative``.
    """
    avg_rating_per_user = data.groupby('user')['rating'].agg(['mean', 'count'])
    avg_rating_per_user.columns = ['user_avg_rating', 'user_count']
    avg_rating_per_movie = data.groupby('movie')['rating'].agg(['mean', 'count'])
    avg_rating_per_movie.columns = ['movie_avg_rating', 'movie_count']

    data = _attach_avg_count(data, avg_rating_per_user, avg_rating_per_movie, default_rating)
    test = _attach_avg_count(test, avg_rating_per_user, avg_rating_per_movie, default_rating)

    user_avg_rate = avg_rating_per_user.to_dict('index')
    movie_avg_rate = avg_rating_per_movie.to_dict('index')

    # Columns are addressed by name: a movie's deviation is measured against the
    # rating user's mean, and a user's deviation against the rated movie's mean.
    movie_relative_rate = _relative_rate(data, 'movie', 'user_avg_rating', 'movie_relative')
    user_relative_rate = _relative_rate(data, 'user', 'movie_avg_rating', 'user_relative')

    data = _attach_relative(data, user_relative_rate, movie_relative_rate)
    test = _attach_relative(test, user_relative_rate, movie_relative_rate)

    return data, test, (user_avg_rate, user_relative_rate, movie_avg_rate, movie_relative_rate)

In [None]:
# Rebind data/test to the augmented frames returned by add_avg_count; tmp keeps
# the returned tuple (user_avg_rate, user_relative_rate, movie_avg_rate, movie_relative_rate).
data, test, tmp = add_avg_count(data, test)

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Display the test rows whose user id is greater than 6039.
# NOTE(review): 6039 is presumably the largest user id seen in training — confirm.
test.loc[test['user']>6039]

In [None]:
# Display the training frame.
data

In [None]:
# Reshape the ratings to wide form: one row per user, one column per movie,
# cell value = rating. DataFrame.pivot raises if a (user, movie) pair is duplicated.
pivot_ratings = data.pivot(index='user', columns='movie', values='rating')
pivot_ratings

In [None]:
# Per-user lookup Series: GroupBy.first() of the relative-rate and mean-rating
# columns, renamed to user_bias / user_avg.
user_bias, user_avg = (
    data.groupby('user')[column].first().rename(alias)
    for column, alias in (('user_relative', 'user_bias'), ('user_avg_rating', 'user_avg'))
)

# Per-movie counterparts, renamed to movie_bias / movie_avg.
movie_bias, movie_avg = (
    data.groupby('movie')[column].first().rename(alias)
    for column, alias in (('movie_relative', 'movie_bias'), ('movie_avg_rating', 'movie_avg'))
)


In [None]:
# Small feature set: the raw user and movie ids plus the movie's mean rating.
feature = ['user', 'movie', 'movie_avg_rating']
X = data[feature]
y = data['rating']


In [None]:
# Full feature set: ids plus every engineered statistic.
X = data[[
    'user', 'movie',
    'user_avg_rating', 'user_count',
    'movie_avg_rating', 'movie_count',
    'user_relative', 'movie_relative',
]]
y = data['rating']
# Columns handed to CatBoost as categorical features.
cat_features = ['user', 'movie', 'user_count', 'movie_count']

In [None]:
# Engineered statistics only (no raw ids), so no categorical features.
X = data[[
    'user_avg_rating', 'user_count',
    'movie_avg_rating', 'movie_count',
    'user_relative', 'movie_relative',
]]
y = data['rating']
cat_features = []

In [None]:
# Hold out 20% of the rows to evaluate the CatBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=8,
    loss_function='RMSE',
    cat_features=cat_features,
    verbose=100
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the printed
# number really is the RMSE that the label and the training loss refer to.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse:.4f}")


In [None]:
# Score the test set with exactly the columns (and column order) the model was
# fitted on; a hard-coded list drifts out of sync with whichever X was used for training.
pred = model.predict(test[model.feature_names_])

In [None]:
# Show the fitted model's feature-importance values.
model.get_feature_importance()

In [None]:
# Pair each test ID with its predicted rating.
# Note: this rebinds `pred` from the prediction array to a DataFrame.
pred = pd.DataFrame(dict(ID=test['ID'], rating=pred))

In [None]:
# Write the ID/rating table to pred.csv without the index column.
pred.to_csv('pred.csv', index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Select the compute device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split first so the validation rows stay unseen by the scaler.
# X itself is left untouched, which keeps this cell safe to re-run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y.values, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the same
# transformation to the validation split (no leakage of validation statistics).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Convert to tensors; targets get a trailing dimension to match the (N, 1) model output.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

# Build the data loaders; only the training loader shuffles.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024)

# MLP model definition
class MLP(nn.Module):
    """Fully connected regressor: two ReLU hidden layers and a single linear output.

    Args:
        input_dim: number of input features.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Layers are created in forward order and stored under `self.model`,
        # so parameter names are model.0.*, model.2.* and model.4.*.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 1) prediction."""
        return self.model(x)

# Instantiate the network with one input per feature column and move it to the device.
# Note: this rebinds `model`, which earlier referred to the CatBoost regressor.
model = MLP(input_dim=X.shape[1])
model.to(device)

# Loss function and optimizer.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
import copy

# Early-stopping state: best validation loss so far, how many non-improving
# epochs to tolerate, the current streak of non-improving epochs, and a copy
# of the weights that produced the best validation loss.
best_val_loss = float('inf')
patience = 10
counter = 0
best_model_wts = copy.deepcopy(model.state_dict())

# Train for at most 100 epochs.
for epoch in range(100):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Weight each batch loss by its size so the final value is a
            # per-sample average even when the last batch is smaller.
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}")

    # Early-stopping check: snapshot the weights on improvement, otherwise
    # count the epoch and stop once `patience` epochs passed without improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# Restore the best weights and save them to disk.
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), 'best_mlp_model.pth')


In [None]:
# Display the training frame.
data

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Select the same feature columns the scaler (and therefore the MLP) was fitted on.
# `feature` holds only a three-column subset, so it is used solely as a fallback
# for a scaler that was fitted on a plain array and recorded no column names.
test_columns = list(getattr(scaler, 'feature_names_in_', feature))

# Apply the training-time scaling; the result is a NumPy array.
X_test_np = scaler.transform(test[test_columns])

# Convert to a tensor.
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)

# Wrap in a dataset/loader; no shuffling so predictions stay in test-row order.
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

# Run the model batch by batch and collect the outputs on the CPU.
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch[0].to(device)
        outputs = model(inputs)
        predictions.append(outputs.cpu())
predictions = torch.cat(predictions, dim=0)

In [None]:
# Convert the stacked model outputs to a NumPy array.
predictions_np = predictions.cpu().numpy()

# IDs of the test rows.
id_series = test['ID']

# Flatten the predictions to one dimension and name the Series 'rating'.
pred_series = pd.Series(predictions_np.flatten(), name='rating')

# Place IDs and predictions side by side. concat(axis=1) aligns on the index;
# assumes test carries a default 0..n-1 index — TODO confirm.
submission_df = pd.concat([id_series, pred_series], axis=1)

# Write the submission file without the index column.
submission_df.to_csv('submission_ID_movie_avg.csv', index=False)

In [None]:
# Display the test frame.
test

In [None]:
# Baseline prediction: the midpoint of the user's and the movie's mean rating.
test['rating'] = test['user_avg_rating'].add(test['movie_avg_rating']).div(2)
# Disabled variant: for users with no training ratings, use the movie mean alone.
#test.loc[test['user_count']==0, 'rating'] = test.loc[test['user_count']==0, 'movie_avg_rating']

In [None]:
# Write the baseline submission to its own file. Do not write to 'test.csv':
# that is the input file read at the top of the notebook, and overwriting it
# would drop the user/movie columns needed on the next run.
test[['ID', 'rating']].to_csv('submission_user_movie_avg.csv', index=False)