In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
data = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
def add_avg_count(data, test):
    avg_rating_per_user = data.groupby('user')['rating'].agg(['mean', 'count'])
    avg_rating_per_user.columns = ['user_avg_rating', 'user_count']
    avg_rating_per_movie = data.groupby('movie')['rating'].agg(['mean', 'count'])
    avg_rating_per_movie.columns = ['movie_avg_rating', 'movie_count']

    data = pd.merge(data, avg_rating_per_user, on='user', how='left')
    data = pd.merge(data, avg_rating_per_movie, on='movie', how='left')
    data['user_avg_rating'] = data['user_avg_rating'].fillna(3)
    data['user_count'] = data['user_count'].fillna(0).astype(int)
    data['movie_avg_rating'] = data['movie_avg_rating'].fillna(3)
    data['movie_count'] = data['movie_count'].fillna(0).astype(int)

    test = pd.merge(test, avg_rating_per_user, on='user', how='left')
    test = pd.merge(test, avg_rating_per_movie, on='movie', how='left')
    test['user_avg_rating'] = test['user_avg_rating'].fillna(3)
    test['user_count'] = test['user_count'].fillna(0).astype(int)
    test['movie_avg_rating'] = test['movie_avg_rating'].fillna(3)
    test['movie_count'] = test['movie_count'].fillna(0).astype(int)

    user_avg_rate = avg_rating_per_user.to_dict('index')
    movie_avg_rate = avg_rating_per_movie.to_dict('index')

    movie_relative_rate = defaultdict(lambda: {'sum':0, 'count':0})
    for row in data.itertuples():
        movie_relative_rate[row[2]]['sum'] += row[4] - user_avg_rate[row[3]]['user_avg_rating']
        movie_relative_rate[row[2]]['count'] += 1
        
    for key, val in movie_relative_rate.items():
        movie_relative_rate[key]['movie_relative'] = val['sum']/val['count']

    movie_relative_rate = pd.DataFrame.from_dict(movie_relative_rate, orient='index')
    movie_relative_rate.index.name = 'movie'

    user_relative_rate = defaultdict(lambda: {'sum':0, 'count':0})
    for row in data.itertuples():
        user_relative_rate[row[3]]['sum'] += row[4] - movie_avg_rate[row[3]]['movie_avg_rating']
        user_relative_rate[row[3]]['count'] += 1
        
    for key, val in user_relative_rate.items():
        user_relative_rate[key]['user_relative'] = val['sum']/val['count']

    user_relative_rate = pd.DataFrame.from_dict(user_relative_rate, orient='index')
    user_relative_rate.index.name = 'user'

    data = pd.merge(data, user_relative_rate[['user_relative']], on='user', how='left')
    data = pd.merge(data, movie_relative_rate[['movie_relative']], on='movie', how='left')
    data['user_relative'] = data['user_relative'].fillna(0)
    data['movie_relative'] = data['movie_relative'].fillna(0)

    test = pd.merge(test, user_relative_rate[['user_relative']], on='user', how='left')
    test = pd.merge(test, movie_relative_rate[['movie_relative']], on='movie', how='left')
    test['user_relative'] = test['user_relative'].fillna(0)
    test['movie_relative'] = test['movie_relative'].fillna(0)

    return data, test

In [4]:
data, test = add_avg_count(data, test)

In [5]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
data

Unnamed: 0,ID,movie,user,rating,user_avg_rating,user_count,movie_avg_rating,movie_count,user_relative,movie_relative
0,610739,3704,3784,3,3.212598,127,4.395062,162,-0.649471,0.627800
1,324753,1924,802,3,3.535714,336,3.088235,68,-0.407875,-0.487233
2,808218,4837,1387,4,4.084652,1264,3.284000,250,0.467509,0.128105
3,133808,867,1196,4,4.300133,2259,3.705882,34,0.444711,-0.030625
4,431858,2631,3072,5,3.794837,736,4.338235,68,0.403811,0.435923
...,...,...,...,...,...,...,...,...,...,...
750151,259179,1586,1077,5,3.978062,547,4.424437,311,-0.110433,0.598645
750152,365839,2129,2700,5,3.778959,941,3.258865,282,0.402812,-0.131217
750153,131933,854,3102,3,3.670213,188,3.072193,374,0.205927,-0.360123
750154,671156,4033,3479,5,3.657635,406,3.765748,508,0.048940,0.376630


In [28]:
feature = ['user', 'movie', 'movie_avg_rating']
X, y = data[feature], data['rating']


In [None]:
X, y = data[['user', 'movie', 'user_avg_rating', 'user_count', 'movie_avg_rating', 'movie_count', 'user_relative', 'movie_relative']], data['rating']
cat_features = ['user', 'movie', 'user_count', 'movie_count']

In [None]:
X, y = data[['user_avg_rating', 'user_count', 'movie_avg_rating', 'movie_count', 'user_relative', 'movie_relative']], data['rating']
cat_features = []

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=8,
    loss_function='RMSE',
    cat_features=cat_features,
    verbose=100
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")


In [None]:
pred = model.predict(test[['user', 'movie', 'user_avg_rating', 'user_count', 'movie_avg_rating', 'movie_count', 'user_relative', 'movie_relative']])

In [None]:
model.get_feature_importance()

In [None]:
pred = pd.DataFrame({
    'ID': test['ID'],
    'rating': pred
})

In [None]:
pred.to_csv('pred.csv', index=False)

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(
    X, y.values, test_size=0.2, random_state=42
)

# 텐서 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

# 데이터로더 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        return self.model(x)

# 모델 초기화 및 디바이스로 이동
model = MLP(input_dim=X.shape[1]).to(device)

# 손실 함수 및 옵티마이저 정의
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [30]:
import copy

best_val_loss = float('inf')
patience = 10
counter = 0
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(100):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # 검증 단계
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}")

    # Early Stopping 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# 최상의 모델 저장
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), 'best_mlp_model.pth')


Epoch 1, Validation Loss: 1.0571
Epoch 2, Validation Loss: 1.0483
Epoch 3, Validation Loss: 1.0454
Epoch 4, Validation Loss: 1.0444
Epoch 5, Validation Loss: 1.0431
Epoch 6, Validation Loss: 1.0442
Epoch 7, Validation Loss: 1.0408
Epoch 8, Validation Loss: 1.0375
Epoch 9, Validation Loss: 1.0384
Epoch 10, Validation Loss: 1.0378
Epoch 11, Validation Loss: 1.0408
Epoch 12, Validation Loss: 1.0396
Epoch 13, Validation Loss: 1.0419
Epoch 14, Validation Loss: 1.0368
Epoch 15, Validation Loss: 1.0362
Epoch 16, Validation Loss: 1.0333
Epoch 17, Validation Loss: 1.0335
Epoch 18, Validation Loss: 1.0339
Epoch 19, Validation Loss: 1.0372
Epoch 20, Validation Loss: 1.0318
Epoch 21, Validation Loss: 1.0322
Epoch 22, Validation Loss: 1.0309
Epoch 23, Validation Loss: 1.0305
Epoch 24, Validation Loss: 1.0337
Epoch 25, Validation Loss: 1.0387
Epoch 26, Validation Loss: 1.0299
Epoch 27, Validation Loss: 1.0350
Epoch 28, Validation Loss: 1.0307
Epoch 29, Validation Loss: 1.0295
Epoch 30, Validation Lo

In [36]:
data

Unnamed: 0,ID,movie,user,rating,user_avg_rating,user_count,movie_avg_rating,movie_count,user_relative,movie_relative
0,610739,3704,3784,3,3.212598,127,4.395062,162,-0.649471,0.627800
1,324753,1924,802,3,3.535714,336,3.088235,68,-0.407875,-0.487233
2,808218,4837,1387,4,4.084652,1264,3.284000,250,0.467509,0.128105
3,133808,867,1196,4,4.300133,2259,3.705882,34,0.444711,-0.030625
4,431858,2631,3072,5,3.794837,736,4.338235,68,0.403811,0.435923
...,...,...,...,...,...,...,...,...,...,...
750151,259179,1586,1077,5,3.978062,547,4.424437,311,-0.110433,0.598645
750152,365839,2129,2700,5,3.778959,941,3.258865,282,0.402812,-0.131217
750153,131933,854,3102,3,3.670213,188,3.072193,374,0.205927,-0.360123
750154,671156,4033,3479,5,3.657635,406,3.765748,508,0.048940,0.376630


In [31]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# NumPy 배열로 변환
X_test_np = test[feature]

X_test_np = scaler.transform(X_test_np)

#X_test_np = test[['user', 'movie', 'user_avg_rating', 'user_count',
#                  'movie_avg_rating', 'movie_count', 'user_relative', 'movie_relative']].values

# 텐서로 변환
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)

# TensorDataset 및 DataLoader 생성
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

# 모델 예측
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch[0].to(device)
        outputs = model(inputs)
        predictions.append(outputs.cpu())
predictions = torch.cat(predictions, dim=0)

In [33]:
predictions_np = predictions.cpu().numpy()

id_series = test['ID']

pred_series = pd.Series(predictions_np.flatten(), name='rating')

submission_df = pd.concat([id_series, pred_series], axis=1)

submission_df.to_csv('submission_ID_movie_avg.csv', index=False)