In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import classification_report, confusion_matrix
import copy
import seaborn as sns
import matplotlib.pyplot as plt


# Load the raw interaction log and the article metadata.
# NOTE(review): both paths are absolute and machine-specific; adjust them when
# running on another machine.
interaction_data = pd.read_csv('/mnt/c/Users/80753/Desktop/MNS_data_full(1)(1) (1).csv')
article_data = pd.read_csv('/mnt/c/Users/80753/Desktop/articles_2020_flexible_difficulty.csv')

# Fill missing numeric values with the per-column median. numeric_only=True
# restricts the median to numeric columns, so the call keeps working when the
# frame also holds text columns (recent pandas versions raise a TypeError
# instead of silently skipping them). Non-numeric columns are left untouched.
interaction_data.fillna(interaction_data.median(numeric_only=True), inplace=True)

# Articles with any missing field are discarded rather than imputed.
article_data.dropna(inplace=True)

# Align the key name with the interaction table before joining.
article_data.rename(columns={'id': 'article_id'}, inplace=True)

# Inner join: interactions whose article has no metadata row are dropped.
merged_data = pd.merge(interaction_data, article_data, on='article_id', how='inner')

# Map raw user / article identifiers to contiguous integer indices; these are
# later used to index the embedding tables of the model.
user_encoder = LabelEncoder()
merged_data['user_idx'] = user_encoder.fit_transform(merged_data['user_id'])

item_encoder = LabelEncoder()
merged_data['item_idx'] = item_encoder.fit_transform(merged_data['article_id'])

num_users = merged_data['user_idx'].nunique()
num_items = merged_data['item_idx'].nunique()

# Columns that get standardised.
numerical_features = ['reading_time', 'SleepHours', 'Tired', 'Excited', 'Motivated',
                      'Depression', 'Anxiety', 'Extraversion', 'Agreeableness',
                      'Conscientiousness', 'Neuroticism', 'OpennessToExperience',
                      'HowOftenNews', 'TimeWeekNews', 'flexible_difficulty']

user_feature_cols = ['SleepHours', 'Tired', 'Excited', 'Motivated',
                     'Depression', 'Anxiety', 'Extraversion', 'Agreeableness',
                     'Conscientiousness', 'Neuroticism', 'OpennessToExperience',
                     'HowOftenNews', 'TimeWeekNews']

item_feature_cols = ['reading_time', 'flexible_difficulty']

# Binary target: rounded likability above 2 is class 1, everything else class 0.
# (An earlier multi-class formulation that used the rounded rating itself as
# the class id was left commented out in the original and is not reproduced.)
target_col = 'likability'
merged_data['likability'] = merged_data['likability'].round().astype(int)

merged_data['likability_class'] = (merged_data[target_col] > 2).astype(int)

num_classes = merged_data['likability_class'].nunique()

X = merged_data[user_feature_cols + item_feature_cols]
y = merged_data['likability_class']

# Stratified 80/20 split. It is done BEFORE scaling so that the scaler below
# can be fitted without looking at validation rows.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on training rows only: fitting on the full frame would leak
# the validation set's mean/variance into the training features. The fitted
# statistics are then applied to every row.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
merged_data[numerical_features] = scaler.transform(merged_data[numerical_features])

# Re-derive the feature frames from the scaled data, keeping exactly the rows
# (and row order) chosen by the split above.
X = merged_data[user_feature_cols + item_feature_cols]
X_train = X.loc[X_train.index]
X_val = X.loc[X_val.index]

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 'balanced' class weights are derived from the training labels only and are
# created directly on the target device so the loss can use them as-is.
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(y_train), y=y_train),
    dtype=torch.float32,
    device=device,
)


def _split_to_tensors(features, labels):
    """Convert one split into (user ids, item ids, user feats, item feats, targets).

    The id columns are looked up in ``merged_data`` through the index of
    ``features``; id tensors keep the dtype of the underlying NumPy arrays,
    feature tensors are float32 and targets are int64.
    """
    row_labels = features.index
    return (
        torch.tensor(merged_data.loc[row_labels, 'user_idx'].values),
        torch.tensor(merged_data.loc[row_labels, 'item_idx'].values),
        torch.tensor(features[user_feature_cols].values, dtype=torch.float32),
        torch.tensor(features[item_feature_cols].values, dtype=torch.float32),
        torch.tensor(labels.values, dtype=torch.long),
    )


# Training split
(train_user_ids, train_item_ids,
 train_user_features, train_item_features,
 train_targets) = _split_to_tensors(X_train, y_train)

# Validation split
(val_user_ids, val_item_ids,
 val_user_features, val_item_features,
 val_targets) = _split_to_tensors(X_val, y_val)

train_dataset = TensorDataset(
    train_user_ids, train_item_ids, train_user_features, train_item_features, train_targets
)
val_dataset = TensorDataset(
    val_user_ids, val_item_ids, val_user_features, val_item_features, val_targets
)

# Only the training loader reshuffles; validation batches keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

def add_noise(X, noise_level=0.5):
    """Return ``X`` plus zero-mean Gaussian noise of the same shape.

    The noise is drawn from NumPy's global random state, so seed it with
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        X: Array-like exposing ``.shape`` and supporting ``+`` with an ndarray.
        noise_level: Standard deviation of the Gaussian noise.

    Returns:
        The element-wise sum of ``X`` and the sampled noise.
    """
    return X + np.random.normal(loc=0, scale=noise_level, size=X.shape)


# Augmentation: a random 10% of the training rows is duplicated with Gaussian
# noise added to every feature column; the labels of the copies are unchanged.
noise_ratio = 0.1
num_noisy_samples = int(noise_ratio * len(X_train))

# Seed NumPy's global RNG so both the row selection and the noise are repeatable.
np.random.seed(42)
noisy_indices = np.random.choice(len(X_train), num_noisy_samples, replace=False)
X_noisy = X_train.iloc[noisy_indices].copy()
y_noisy = y_train.iloc[noisy_indices].copy()

_noised_cols = user_feature_cols + item_feature_cols
X_noisy[_noised_cols] = add_noise(X_noisy[_noised_cols].values)

# Noisy copies keep the index labels of the rows they were derived from, which
# is what lets the id lookups below find the matching user / item index.
X_train_aug = pd.concat([X_train, X_noisy])
y_train_aug = pd.concat([y_train, y_noisy])
print("Original Data:\n", X_train)
print("Noisy Data:\n", X_noisy)

_aug_rows = X_train_aug.index
train_user_ids_aug = torch.tensor(merged_data.loc[_aug_rows, 'user_idx'].values)
train_item_ids_aug = torch.tensor(merged_data.loc[_aug_rows, 'item_idx'].values)
train_user_features_aug = torch.tensor(
    X_train_aug[user_feature_cols].values, dtype=torch.float32
)
train_item_features_aug = torch.tensor(
    X_train_aug[item_feature_cols].values, dtype=torch.float32
)
train_targets_aug = torch.tensor(y_train_aug.values, dtype=torch.long)

train_dataset_aug = TensorDataset(
    train_user_ids_aug,
    train_item_ids_aug,
    train_user_features_aug,
    train_item_features_aug,
    train_targets_aug,
)

train_loader_aug = DataLoader(train_dataset_aug, shuffle=True, batch_size=batch_size)

# Module
class NCF(nn.Module):
    """Neural collaborative-filtering classifier using id embeddings and side features.

    Each example is described by a user id, an item id, a user feature vector
    and an item feature vector. Ids are embedded, both feature vectors are
    linearly projected to ``feature_dim``, and the element-wise product of the
    two projections is added as an interaction term. The concatenation of all
    five parts is fed to an MLP that outputs one logit per class.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        user_feature_dim: Width of the user feature vector.
        item_feature_dim: Width of the item feature vector.
        num_classes: Number of output logits.
        embed_dim: Size of each id embedding.
        feature_dim: Size of each projected feature vector.
        hidden_dims: Widths of the hidden MLP layers, in order. The default
            reproduces the original fixed 512-256-128-64 stack.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, num_users, num_items,
                 user_feature_dim, item_feature_dim, num_classes, embed_dim=64, feature_dim=32,
                 hidden_dims=(512, 256, 128, 64), dropout=0.2):
        super().__init__()

        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)

        self.user_feature_transform = nn.Linear(user_feature_dim, feature_dim)
        self.item_feature_transform = nn.Linear(item_feature_dim, feature_dim)

        # Two embeddings + two projected feature vectors + their product.
        interaction_dim = feature_dim
        input_dim = embed_dim * 2 + feature_dim * 2 + interaction_dim

        # Hidden blocks are Linear -> LeakyReLU -> LayerNorm -> Dropout. The
        # modules are created in the same order as the original hand-written
        # stack, so state-dict keys (fc_layers.0, fc_layers.2, ...) match
        # for the default hidden_dims.
        layers = []
        in_dim = input_dim
        for width in hidden_dims:
            layers.extend([
                nn.Linear(in_dim, width),
                nn.LeakyReLU(),
                nn.LayerNorm(width),
                nn.Dropout(dropout),
            ])
            in_dim = width
        layers.append(nn.Linear(in_dim, num_classes))
        self.fc_layers = nn.Sequential(*layers)

    def forward(self, user_id, item_id, user_features, item_features):
        """Return unnormalised class logits for a batch of (user, item) pairs.

        Args:
            user_id: Integer tensor of user indices.
            item_id: Integer tensor of item indices.
            user_features: Float tensor whose last dimension is ``user_feature_dim``.
            item_features: Float tensor whose last dimension is ``item_feature_dim``.

        Returns:
            Tensor whose last dimension is ``num_classes``.
        """
        user_embed = self.user_embedding(user_id)
        item_embed = self.item_embedding(item_id)

        user_feat_transformed = self.user_feature_transform(user_features)
        item_feat_transformed = self.item_feature_transform(item_features)

        # Element-wise product of the projected features acts as an explicit
        # user x item interaction signal.
        interaction_features = user_feat_transformed * item_feat_transformed

        x = torch.cat([
            user_embed, item_embed,
            user_feat_transformed, item_feat_transformed, interaction_features
        ], dim=-1)

        return self.fc_layers(x)

# Build the network; feature widths follow the column lists used for the tensors.
model = NCF(
    num_users,
    num_items,
    len(user_feature_cols),
    len(item_feature_cols),
    num_classes,
)

# Class-weighted cross-entropy (the unweighted alternative would be
# nn.CrossEntropyLoss() without arguments).
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5, lr=0.00001)
# Halve the learning rate every 10 scheduler steps.
scheduler = StepLR(optimizer, gamma=0.5, step_size=10)

# Place the model on the chosen device; the optimizer keeps referring to the
# same parameter objects after the in-place move.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
class_weights = class_weights.to(device)

# Early-stopping configuration: stop after `patience` consecutive epochs
# without a new best validation loss.
num_epochs = 100
patience = 10
best_val_loss = float('inf')
epochs_no_improve = 0

# Per-epoch history of loss and accuracy for both splits.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch in range(num_epochs):
    # ---- training pass over the noise-augmented training set ----
    model.train()
    total_loss, correct, total = 0, 0, 0
    for batch in train_loader_aug:
        user_id, item_id, user_feat, item_feat, target = (t.to(device) for t in batch)

        optimizer.zero_grad()
        output = model(user_id, item_id, user_feat, item_feat)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(output.data, 1).indices
        total += target.size(0)
        correct += (predicted == target).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    avg_loss = total_loss / len(train_loader_aug)
    train_accuracy = correct / total
    train_losses.append(avg_loss)
    train_accuracies.append(train_accuracy)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')

    scheduler.step()

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            user_id, item_id, user_feat, item_feat, target = (t.to(device) for t in batch)

            output = model(user_id, item_id, user_feat, item_feat)
            loss = criterion(output, target)
            val_loss += loss.item()

            predicted = torch.max(output.data, 1).indices
            val_total += target.size(0)
            val_correct += (predicted == target).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = val_correct / val_total
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    # New best validation loss: checkpoint the weights and reset the counter.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pt')
        continue

    # No improvement this epoch.
    epochs_no_improve += 1
    if epochs_no_improve >= patience:
        print('Early stopping!')
        break


# Restore the checkpoint written at the best validation loss.
model.load_state_dict(torch.load('best_model.pt'))

# Gather predicted and true labels for the whole validation set.
y_pred, y_true = [], []
model.eval()
with torch.no_grad():
    for batch in val_loader:
        user_id, item_id, user_feat, item_feat, target = (t.to(device) for t in batch)

        output = model(user_id, item_id, user_feat, item_feat)
        predicted = torch.max(output.data, 1).indices
        # Move to CPU / NumPy so the scikit-learn metrics can consume the labels.
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(target.cpu().numpy())


# Text report of per-class precision / recall / F1 on the validation set.
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

cm = confusion_matrix(y_true, y_pred)

# Confusion-matrix heatmap with integer cell annotations.
plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[f'Predicted {i}' for i in range(num_classes)],
    yticklabels=[f'Actual {i}' for i in range(num_classes)],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Learning curves: loss on the left panel, accuracy on the right panel.
plt.figure(figsize=(12, 5))

_panels = (
    (1, 'Loss', train_losses, val_losses, 'Training loss', 'Validation loss'),
    (2, 'Accuracy', train_accuracies, val_accuracies, 'Training Accuracy', 'Validation Accuracy'),
)
for _position, _metric, _train_series, _val_series, _train_label, _val_label in _panels:
    plt.subplot(1, 2, _position)
    # Epochs are numbered from 1 on the x axis.
    plt.plot(range(1, len(_train_series) + 1), _train_series, label=_train_label)
    plt.plot(range(1, len(_val_series) + 1), _val_series, label=_val_label)
    plt.title(f'Training and Validation {_metric}')
    plt.xlabel('Epochs')
    plt.ylabel(_metric)
    plt.legend()

plt.tight_layout()
plt.show()



In [None]:
def evaluate_full(model, user_ids, item_ids, user_features, item_features, targets, criterion, device='cpu'):
    """Evaluate ``model`` on a complete set of inputs in one forward pass.

    The model is switched to eval mode and gradients are disabled. All input
    tensors are moved to ``device`` before the call; the model itself is not
    moved by this function.

    Args:
        model: Callable module taking (user ids, item ids, user features, item features).
        user_ids, item_ids, user_features, item_features: Input tensors.
        targets: Tensor of true class indices.
        criterion: Loss function called as ``criterion(output, targets)``.
        device: Device the tensors are moved to.

    Returns:
        ``(loss, accuracy)`` as Python floats.
    """
    model.eval()
    targets_on_device = targets.to(device)
    with torch.no_grad():
        logits = model(
            user_ids.to(device),
            item_ids.to(device),
            user_features.to(device),
            item_features.to(device),
        )
        loss = criterion(logits, targets_on_device).item()
        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(logits.data, 1).indices
        n_correct = (predicted == targets_on_device).sum().item()
    return loss, n_correct / targets.size(0)


# Every input whose importance is measured: the two id columns plus each
# individual user / item feature column.
feature_names = ['user_id', 'item_id', *user_feature_cols, *item_feature_cols]

# Importance score per feature, filled in by the permutation loop.
feature_importance = dict.fromkeys(feature_names, 0.0)

# The full validation tensors are evaluated as one batch.
val_user_ids_all, val_item_ids_all = val_user_ids, val_item_ids
val_user_features_all, val_item_features_all = val_user_features, val_item_features
val_targets_all = val_targets

# NumPy copies of the validation tensors, used as the source for shuffling.
val_user_ids_np = val_user_ids_all.cpu().numpy()
val_item_ids_np = val_item_ids_all.cpu().numpy()
val_user_features_np = val_user_features_all.cpu().numpy()
val_item_features_np = val_item_features_all.cpu().numpy()
val_targets_np = val_targets_all.cpu().numpy()

# DataLoader
def create_dataloader_np(user_ids, item_ids, user_features, item_features, targets, batch_size=32):
    """Wrap five aligned arrays in a non-shuffling ``DataLoader``.

    Ids and targets are converted to int64 tensors, both feature arrays to
    float32 tensors. Each batch yields the tensors in argument order.

    Args:
        user_ids, item_ids: Integer id arrays.
        user_features, item_features: Feature arrays.
        targets: Integer class labels.
        batch_size: Number of rows per batch.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the converted tensors.
    """
    columns = (
        (user_ids, torch.long),
        (item_ids, torch.long),
        (user_features, torch.float32),
        (item_features, torch.float32),
        (targets, torch.long),
    )
    tensors = [torch.tensor(values, dtype=dtype) for values, dtype in columns]
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size)


# Permutation feature importance on the validation set: shuffle one input at a
# time, re-evaluate, and record how much accuracy drops against the baseline.
base_loss, base_acc = evaluate_full(model, val_user_ids_all, val_item_ids_all, val_user_features_all, val_item_features_all, val_targets_all, criterion, device)
print(f'Baseline Validation Loss: {base_loss:.4f}, Baseline Validation Accuracy: {base_acc:.4f}')


for feature in feature_names:
    # Start every feature from fresh copies so that shuffles do not accumulate.
    shuffled_user_ids = val_user_ids_np.copy()
    shuffled_item_ids = val_item_ids_np.copy()
    shuffled_user_features = val_user_features_np.copy()
    shuffled_item_features = val_item_features_np.copy()
    shuffled_targets = val_targets_np.copy()

    if feature == 'user_id':
        # Permute ids together with their feature rows so that each id keeps
        # its own features and only the pairing with items/targets is broken.
        shuffle_indices = np.random.permutation(len(shuffled_user_ids))
        shuffled_user_ids = shuffled_user_ids[shuffle_indices]
        shuffled_user_features = shuffled_user_features[shuffle_indices]
    elif feature == 'item_id':
        shuffle_indices = np.random.permutation(len(shuffled_item_ids))
        shuffled_item_ids = shuffled_item_ids[shuffle_indices]
        shuffled_item_features = shuffled_item_features[shuffle_indices]
    elif feature in user_feature_cols:
        # Permute a single user feature column in isolation.
        col = user_feature_cols.index(feature)
        shuffled_user_features[:, col] = np.random.permutation(shuffled_user_features[:, col])
    elif feature in item_feature_cols:
        # Permute a single item feature column in isolation.
        col = item_feature_cols.index(feature)
        shuffled_item_features[:, col] = np.random.permutation(shuffled_item_features[:, col])
    else:
        print(f'Unknown feature: {feature}')
        continue

    # The whole validation set is scored in one forward pass, so no DataLoader
    # is needed here (the original built one per feature and never used it).
    shuffled_loss, shuffled_acc = evaluate_full(
        model,
        torch.tensor(shuffled_user_ids, dtype=torch.long),
        torch.tensor(shuffled_item_ids, dtype=torch.long),
        torch.tensor(shuffled_user_features, dtype=torch.float32),
        torch.tensor(shuffled_item_features, dtype=torch.float32),
        torch.tensor(shuffled_targets, dtype=torch.long),
        criterion,
        device,
    )

    # Importance is the accuracy drop; the loss increase is printed for reference only.
    loss_increase = shuffled_loss - base_loss
    acc_decrease = base_acc - shuffled_acc
    feature_importance[feature] = acc_decrease
    print(f'Feature: {feature}, Accuracy Decrease: {acc_decrease:.4f}, Loss Increase: {loss_increase:.4f}')

# Rank features from the largest to the smallest accuracy drop.
sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
print("\nFeature Importance (based on accuracy decrease):")
for feature, importance in sorted_importance:
    print(f"{feature}: {importance:.4f}")