# **Study Double Model**
*Mohammad Hijazi and Kamal Dbouk*

### Generating a Synthetic Dataset for Training

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Seed both RNGs so the synthetic dataset is reproducible. Every draw below
# consumes the NumPy random stream, so reordering statements changes the data.
np.random.seed(42)
torch.manual_seed(42)

num_users = 1000
num_interactions = 5000  # number of sampling attempts, not the final row count

# Generate user profiles
user_ids = [f"user_{i}" for i in range(num_users)]
user_profiles = {}  # user_id -> dict of raw and index-encoded profile fields

majors = [
    "Accounting", "Biology", "Business Administration", "Chemistry",
    "Computer Science", "Economics", "Engineering", "English",
    "Environmental Science", "Finance", "History", "Information Technology",
    "Law", "Liberal Arts", "Mathematics", "Mechanical Engineering",
    "Nursing", "Philosophy", "Physics", "Political Science",
    "Psychology", "Sociology", "Theater", "Social Work"
]

communication_styles = ["Quiet Focus", "Occasional Check-ins", "Interactive Conversation"]
goals = ["Exam Prep", "Writing", "Problem Solving", "Research", "Brainstorming", "General"]

# Dictionaries to map categorical features to numeric values
# (the index is the position of the label in the lists above)
major_to_idx = {major: idx for idx, major in enumerate(majors)}
communication_to_idx = {style: idx for idx, style in enumerate(communication_styles)}
goal_to_idx = {goal: idx for idx, goal in enumerate(goals)}

# Generate profiles
for user_id in user_ids:
    # Drawn directly in normalised form, uniformly from [0.2, 0.8)
    age_norm = np.random.uniform(0.2, 0.8)

    # Categorical features
    major = np.random.choice(majors)
    communication_style = np.random.choice(communication_styles)
    goal = np.random.choice(goals)

    # Personality traits (between 0 and 1)
    extraversion = np.random.uniform(0, 1)
    agreeableness = np.random.uniform(0, 1)
    conscientiousness = np.random.uniform(0, 1)
    neuroticism = np.random.uniform(0, 1)
    openness = np.random.uniform(0, 1)

    # np.random.randint excludes its upper bound, so the ranges are 1-49 and 5-29
    totalSessions = np.random.randint(1, 50)
    totalGoals = np.random.randint(5, 30)
    # "+ 1" makes totalGoals itself reachable, i.e. 0..totalGoals inclusive
    totalTickedGoals = np.random.randint(0, totalGoals + 1)

    preferredStudyLength = np.random.randint(20, 120)  # in minutes, 20-119 (upper bound exclusive)
    preferredBreakLength = np.random.randint(5, 30)    # in minutes, 5-29 (upper bound exclusive)
    todayStudyLength = np.random.randint(0, 480)       # in minutes, 0-479, i.e. just under 8 hours

    # Store both the label and its integer index for each categorical field
    user_profiles[user_id] = {
        'age_norm': age_norm,
        'major': major,
        'major_idx': major_to_idx[major],
        'communication_style': communication_style,
        'communication_style_idx': communication_to_idx[communication_style],
        'goal': goal,
        'goal_idx': goal_to_idx[goal],
        'extraversion': extraversion,
        'agreeableness': agreeableness,
        'conscientiousness': conscientiousness,
        'neuroticism': neuroticism,
        'openness': openness,
        'totalSessions': totalSessions,
        'totalGoals': totalGoals,
        'totalTickedGoals': totalTickedGoals,
        'preferredStudyLength': preferredStudyLength,
        'preferredBreakLength': preferredBreakLength,
        'todayStudyLength': todayStudyLength
    }

# Column layout of the exported profile table. Apart from 'user_id' (the key
# of user_profiles), every column name is also a key of the per-user dict,
# so the rows can be assembled straight from this list.
profile_columns = [
    'user_id', 'age_norm', 'major', 'communication_style', 'goal',
    'extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness',
    'totalSessions', 'totalGoals', 'totalTickedGoals',
    'preferredStudyLength', 'preferredBreakLength', 'todayStudyLength'
]

# One row per user, in user_ids order; the *_idx fields are intentionally
# left out of the table.
profile_data = [
    [uid] + [user_profiles[uid][column] for column in profile_columns[1:]]
    for uid in user_ids
]

profile_df = pd.DataFrame(profile_data, columns=profile_columns)

# Generate interactions
# Each iteration samples an ordered (user, partner) pair and labels it with a
# binary rating derived from a hand-written compatibility score. The order of
# the random draws matters for reproducibility under the fixed seed.
interactions = []
for _ in range(num_interactions):
    # Randomly select two users
    user_id = np.random.choice(user_ids)
    partner_id = np.random.choice(user_ids)

    # Skip self-matches
    # (skipped attempts are not retried, so fewer than num_interactions rows
    # can be produced)
    if user_id == partner_id:
        continue

    # Get profiles
    user_profile = user_profiles[user_id]
    partner_profile = user_profiles[partner_id]

    # Calculate compatibility based on features
    # Despite its name, age_diff is a similarity: 1 minus the absolute gap
    age_diff = 1 - abs(user_profile['age_norm'] - partner_profile['age_norm'])

    same_major = 1.0 if user_profile['major'] == partner_profile['major'] else 0.0
    same_goal = 1.0 if user_profile['goal'] == partner_profile['goal'] else 0.0

    comm_compat = 1.0 if user_profile['communication_style'] == partner_profile['communication_style'] else 0.0

    # Personality compatibility
    # Symmetric in the two users: mean of each one's extraversion times the
    # other's agreeableness
    personality_compat = (user_profile['extraversion'] * partner_profile['agreeableness'] +
                       partner_profile['extraversion'] * user_profile['agreeableness']) / 2

    # Weighted sum of the five components; the weights add up to 1.0
    compatibility = (
        0.2 * age_diff +
        0.3 * same_major +
        0.2 * same_goal +
        0.1 * comm_compat +
        0.2 * personality_compat
    )

    # Threshold the score into a binary rating
    rating = 1 if compatibility > 0.4 else 0

    # Add some noise to make it more realistic
    if np.random.random() < 0.1:  # flip the label with probability 0.1
        rating = 1 - rating

    interactions.append((user_id, partner_id, rating))

interactions_df = pd.DataFrame(interactions, columns=['user_id', 'partner_id', 'rating'])

# Hold out 20% of the interactions for validation (fixed seed for repeatability).
train_df, val_df = train_test_split(interactions_df, test_size=0.2, random_state=42)

# Summary of what was generated.
total_ratings = len(interactions_df)
positive_ratings = interactions_df['rating'].sum()
print(f"Number of users: {num_users}")
print(f"Number of interactions: {total_ratings}")
print(f"Positive ratings: {positive_ratings}")
print(f"Negative ratings: {total_ratings - positive_ratings}")
print("\nUser profiles sample:")
print(profile_df.head())
print("\nInteractions sample:")
print(interactions_df.head())

# Persist the label -> index dictionaries next to the data.
for mapping_path, mapping in (
    ('major_mapping.npy', major_to_idx),
    ('communication_mapping.npy', communication_to_idx),
    ('goal_mapping.npy', goal_to_idx),
):
    np.save(mapping_path, mapping)

# Persist the tables themselves.
for csv_path, frame in (
    ('user_profiles.csv', profile_df),
    ('train_interactions.csv', train_df),
    ('val_interactions.csv', val_df),
):
    frame.to_csv(csv_path, index=False)
print("\nData saved to CSV files: user_profiles.csv, train_interactions.csv, val_interactions.csv")

# Index users in order of first appearance, scanning the interaction table
# row by row (user_id first, then partner_id).
unique_users = pd.unique(interactions_df[['user_id', 'partner_id']].values.ravel())
user2idx = dict(zip(unique_users, range(len(unique_users))))

print("\nUser to index mapping (first 5):")
for uid, idx in list(user2idx.items())[:5]:
    print(f"{uid} -> {idx}")
print(f"\nTotal users encoded: {len(user2idx)}")

Number of users: 1000
Number of interactions: 4997
Positive ratings: 1164
Negative ratings: 3833

User profiles sample:
  user_id  age_norm                   major       communication_style  \
0  user_0  0.424724             Mathematics  Interactive Conversation   
1  user_1  0.633199               Economics      Occasional Check-ins   
2  user_2  0.319804                 Theater               Quiet Focus   
3  user_3  0.610540             Mathematics  Interactive Conversation   
4  user_4  0.781751  Information Technology      Occasional Check-ins   

              goal  extraversion  agreeableness  conscientiousness  \
0    Brainstorming      0.596850       0.445833           0.099975   
1         Research      0.183405       0.304242           0.524756   
2  Problem Solving      0.859940       0.680308           0.450499   
3         Research      0.495177       0.034389           0.909320   
4          Writing      0.894827       0.597900           0.921874   

   neuroticism  open

## Tier 1:

### Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def cluster_users(user_profiles_df, n_clusters=4):
    """Group users into K-means clusters based on their study preferences.

    The three study-preference columns ('preferredStudyLength',
    'preferredBreakLength', 'todayStudyLength') are standardised and then
    clustered. A per-cluster summary is printed.

    Args:
        user_profiles_df: DataFrame holding the three study-preference
            columns. It is modified in place: a 'cluster' column is added
            (or overwritten).
        n_clusters: Number of clusters to create.

    Returns:
        The same DataFrame object that was passed in, now carrying the
        'cluster' column.
    """
    print("\nClustering users based on study preferences...")

    # Extract study preference features
    study_columns = ['preferredStudyLength', 'preferredBreakLength', 'todayStudyLength']
    study_features = user_profiles_df[study_columns].values

    # Standardize features so no single column dominates the distance metric
    scaler = StandardScaler()
    study_features_scaled = scaler.fit_transform(study_features)

    # Apply K-means clustering (fixed seed so cluster ids are reproducible)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(study_features_scaled)
    user_profiles_df['cluster'] = clusters

    print(f"Created {n_clusters} user clusters:")
    for cluster_id in range(n_clusters):
        # Named "members" rather than "cluster_users" so the local does not
        # shadow this function's own name.
        members = user_profiles_df[user_profiles_df['cluster'] == cluster_id]
        print(f"  Cluster {cluster_id}: {len(members)} users")
        print(f"    Avg preferredStudyLength: {members['preferredStudyLength'].mean():.1f} minutes")
        print(f"    Avg preferredBreakLength: {members['preferredBreakLength'].mean():.1f} minutes")
        print(f"    Avg todayStudyLength: {members['todayStudyLength'].mean():.1f} minutes")

    return user_profiles_df

# cluster_users adds the 'cluster' column to the frame it receives and returns
# that same object, so user_profiles_df and profile_df refer to one DataFrame.
user_profiles_df = cluster_users(profile_df)


Clustering users based on study preferences...
Created 4 user clusters:
  Cluster 0: 243 users
    Avg preferredStudyLength: 98.0 minutes
    Avg preferredBreakLength: 15.2 minutes
    Avg todayStudyLength: 360.8 minutes
  Cluster 1: 237 users
    Avg preferredStudyLength: 41.4 minutes
    Avg preferredBreakLength: 19.6 minutes
    Avg todayStudyLength: 340.0 minutes
  Cluster 2: 246 users
    Avg preferredStudyLength: 82.2 minutes
    Avg preferredBreakLength: 23.9 minutes
    Avg todayStudyLength: 126.4 minutes
  Cluster 3: 274 users
    Avg preferredStudyLength: 62.6 minutes
    Avg preferredBreakLength: 10.3 minutes
    Avg todayStudyLength: 141.9 minutes


## Tier 2:

### Neural Reciprocal Collaborative Filtering

In [None]:
class ReciprocalDataset(torch.utils.data.Dataset):
    """Dataset of (user, partner, rating) interactions with profile features.

    Each item combines the integer ids of both users, their numeric profile
    features and the integer indices of their categorical features.

    Args:
        interactions_df: DataFrame with 'user_id', 'partner_id' and 'rating'
            columns; one row per sample.
        user_profiles_df: DataFrame with a 'user_id' column plus the numeric
            feature columns and 'major', 'communication_style', 'goal'.
        user2idx: Mapping from user id to integer index.
        major_mapping: Mapping from major label to integer index.
        communication_mapping: Mapping from communication-style label to
            integer index.
        goal_mapping: Mapping from goal label to integer index.
    """

    def __init__(self, interactions_df, user_profiles_df, user2idx,
                 major_mapping, communication_mapping, goal_mapping):
        self.interactions = interactions_df
        # Index by user_id so a profile row can be fetched by label.
        self.user_profiles = user_profiles_df.set_index('user_id')
        self.user2idx = user2idx
        self.major_mapping = major_mapping
        self.communication_mapping = communication_mapping
        self.goal_mapping = goal_mapping

        self.numeric_features = ['age_norm', 'extraversion', 'agreeableness',
                                'conscientiousness', 'neuroticism', 'openness']

    def __len__(self):
        """Return the number of interaction rows."""
        return len(self.interactions)

    def _encode_profile(self, user_id):
        """Return (numeric float32 array, major idx, comm idx, goal idx) for one user."""
        # Fetch the profile row once instead of once per feature.
        profile = self.user_profiles.loc[user_id]
        numeric = profile[self.numeric_features].values.astype(np.float32)
        return (
            numeric,
            self.major_mapping[profile['major']],
            self.communication_mapping[profile['communication_style']],
            self.goal_mapping[profile['goal']],
        )

    def __getitem__(self, idx):
        """Return the idx-th sample as a tuple of 11 tensors.

        Order: user index, partner index, user numeric features, partner
        numeric features, user major/communication/goal indices, partner
        major/communication/goal indices, rating (float32).
        """
        row = self.interactions.iloc[idx]
        user_id = row['user_id']
        partner_id = row['partner_id']

        u = self.user2idx[user_id]
        v = self.user2idx[partner_id]

        y = row['rating']

        # Categorical features are returned as integer indices (not one-hot);
        # the model turns them into vectors with embedding layers.
        u_numeric, u_major_idx, u_comm_idx, u_goal_idx = self._encode_profile(user_id)
        v_numeric, v_major_idx, v_comm_idx, v_goal_idx = self._encode_profile(partner_id)

        return (
            torch.tensor(u),
            torch.tensor(v),
            torch.tensor(u_numeric),
            torch.tensor(v_numeric),
            torch.tensor(u_major_idx),
            torch.tensor(u_comm_idx),
            torch.tensor(u_goal_idx),
            torch.tensor(v_major_idx),
            torch.tensor(v_comm_idx),
            torch.tensor(v_goal_idx),
            torch.tensor(y, dtype=torch.float32)
        )

### NRCF Model

In [None]:
class NRCF(torch.nn.Module):
    """Pairwise match scorer built from id, numeric and categorical features.

    Both users share one id-embedding table and one embedding table per
    categorical feature; numeric features are projected by a separate linear
    layer for the user and for the partner. Element-wise products of these
    vectors are concatenated and fed to an MLP ending in a sigmoid.

    Args:
        num_users: Size of the user-id embedding table.
        num_majors: Size of the major embedding table.
        num_comm_styles: Size of the communication-style embedding table.
        num_goals: Size of the goal embedding table.
        numeric_dim: Number of numeric features per user.
        embed_dim: Width of every embedding / projection.
        hidden_dims: Widths of the two hidden MLP layers.
    """

    def __init__(self, num_users, num_majors, num_comm_styles, num_goals,
                 numeric_dim=6, embed_dim=32, hidden_dims=(64, 32)):
        super().__init__()

        # User embeddings
        self.user_embedding = torch.nn.Embedding(num_users, embed_dim)

        # Embedding layers for categorical features
        self.major_embedding = torch.nn.Embedding(num_majors, embed_dim)
        self.comm_embedding = torch.nn.Embedding(num_comm_styles, embed_dim)
        self.goal_embedding = torch.nn.Embedding(num_goals, embed_dim)

        # Linear layers for numeric features
        self.fc_numeric_user = torch.nn.Linear(numeric_dim, embed_dim)
        self.fc_numeric_partner = torch.nn.Linear(numeric_dim, embed_dim)

        # Prediction MLP; the input is the concatenation of the 8 interaction
        # vectors built in forward(), hence embed_dim * 8.
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(embed_dim * 8, hidden_dims[0]),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dims[0], hidden_dims[1]),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dims[1], 1),
            torch.nn.Sigmoid()
        )

    def forward(self, u_id, v_id, u_numeric, v_numeric,
                u_major, u_comm, u_goal, v_major, v_comm, v_goal):
        """Score a batch of (user, partner) pairs.

        The id and categorical arguments are integer index tensors with a
        leading batch dimension; the numeric arguments are float tensors of
        width ``numeric_dim`` per row.

        Returns:
            A 1-D tensor with one sigmoid output per pair. The batch
            dimension is kept even when the batch holds a single pair.
        """
        u_emb = self.user_embedding(u_id)
        v_emb = self.user_embedding(v_id)

        # Numeric feature projections
        u_numeric_proj = self.fc_numeric_user(u_numeric)
        v_numeric_proj = self.fc_numeric_partner(v_numeric)

        # Categorical feature embeddings
        u_major_emb = self.major_embedding(u_major)
        u_comm_emb = self.comm_embedding(u_comm)
        u_goal_emb = self.goal_embedding(u_goal)

        v_major_emb = self.major_embedding(v_major)
        v_comm_emb = self.comm_embedding(v_comm)
        v_goal_emb = self.goal_embedding(v_goal)

        # Combine user and partner feature embeddings
        id_interaction = u_emb * v_emb

        # For numeric features
        numeric_interaction = u_numeric_proj * v_numeric_proj

        # For categorical features
        major_interaction = u_major_emb * v_major_emb
        comm_interaction = u_comm_emb * v_comm_emb
        goal_interaction = u_goal_emb * v_goal_emb

        # Cross-feature interactions
        u_id_v_numeric = u_emb * v_numeric_proj
        v_id_u_numeric = v_emb * u_numeric_proj

        # Combine all interactions
        # NOTE(review): the last entry, u_numeric_proj * v_emb, is the same
        # product as v_id_u_numeric, so that feature appears twice. It is kept
        # so the MLP input width (embed_dim * 8) is unchanged; confirm whether
        # a different eighth interaction was intended.
        x = torch.cat([
            id_interaction,
            numeric_interaction,
            major_interaction,
            comm_interaction,
            goal_interaction,
            u_id_v_numeric,
            v_id_u_numeric,
            u_numeric_proj * v_emb
        ], dim=1)

        # Squeeze only the trailing size-1 output dimension. A bare squeeze()
        # would also drop the batch dimension for a single-pair batch and
        # return a 0-dim tensor, which breaks BCELoss shape checks and
        # torch.cat in the training and evaluation loops.
        return self.mlp(x).squeeze(-1)

In [None]:
def train_model(model, dataloader, val_dataloader, epochs=5, lr=1e-3):
    """Train ``model`` with Adam and binary cross-entropy.

    Args:
        model: Module called with the first ten tensors of each batch and
            returning probabilities.
        dataloader: Iterable of training batches; each batch is a sequence of
            eleven tensors whose last element is the label tensor. Must
            support ``len()``.
        val_dataloader: Iterable of validation batches in the same layout.
            When falsy, validation is skipped.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        ``(train_losses, val_accs, val_aucs)``: the mean training loss per
        epoch, and the validation accuracy and ROC AUC per epoch (both lists
        stay empty when validation is skipped).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.BCELoss()
    train_losses = []
    val_accs = []
    val_aucs = []

    # Import the metric once, and only when validation will actually run.
    run_validation = bool(val_dataloader)
    if run_validation:
        from sklearn.metrics import roc_auc_score

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in dataloader:
            u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal, y = batch

            optimizer.zero_grad()
            preds = model(u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal)
            # Flatten to 1-D: a model that squeezes its output returns a 0-dim
            # tensor for a batch of one, which BCELoss rejects against a
            # label tensor of shape (1,).
            loss = criterion(preds.reshape(-1), y.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        train_losses.append(avg_loss)
        print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")

        # Evaluate on validation set
        if run_validation:
            model.eval()
            val_preds, val_labels = [], []

            with torch.no_grad():
                for batch in val_dataloader:
                    u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal, y = batch
                    preds = model(u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal)
                    # torch.cat cannot concatenate 0-dim tensors, so keep
                    # every chunk 1-D.
                    val_preds.append(preds.reshape(-1))
                    val_labels.append(y.reshape(-1))

            preds = torch.cat(val_preds).cpu().numpy()
            labels = torch.cat(val_labels).cpu().numpy()

            # Calculate accuracy at the 0.5 decision threshold
            acc = ((preds > 0.5).astype(float) == labels).mean()
            val_accs.append(acc)

            # Calculate AUC
            auc_score = roc_auc_score(labels, preds)
            val_aucs.append(auc_score)

            print(f"Validation Accuracy: {acc:.4f}, AUC: {auc_score:.4f}")

    return train_losses, val_accs, val_aucs

In [None]:
def evaluate_model(model, dataloader):
    """Compute classification metrics for ``model`` over ``dataloader``.

    Args:
        model: Module called with the first ten tensors of each batch and
            returning probabilities.
        dataloader: Iterable of batches; each batch is a sequence of eleven
            tensors whose last element is the label tensor.

    Returns:
        ``(results, all_preds, all_labels)`` where ``results`` is a dict with
        the keys 'accuracy', 'auc', 'pr_auc', 'precision', 'recall' and 'f1',
        and the other two are NumPy arrays of predictions and labels.
    """
    from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal, y = batch
            preds = model(u, v, u_numeric, v_numeric, u_major, u_comm, u_goal, v_major, v_comm, v_goal)
            # Keep every chunk 1-D: torch.cat cannot concatenate the 0-dim
            # tensor a squeezing model returns for a batch of one.
            all_preds.append(preds.reshape(-1))
            all_labels.append(y.reshape(-1))

    all_preds = torch.cat(all_preds).cpu().numpy()
    all_labels = torch.cat(all_labels).cpu().numpy()

    # Binary accuracy at the 0.5 decision threshold
    predicted_positive = all_preds > 0.5
    accuracy = (predicted_positive.astype(float) == all_labels).mean()

    # AUC score
    roc_auc = roc_auc_score(all_labels, all_preds)

    # Precision-Recall AUC
    precision, recall, _ = precision_recall_curve(all_labels, all_preds)
    pr_auc = auc(recall, precision)

    # Recommendation metrics (vectorised counts instead of Python-level sum)
    true_positives = (predicted_positive & (all_labels == 1)).sum()
    false_positives = (predicted_positive & (all_labels == 0)).sum()
    false_negatives = (~predicted_positive & (all_labels == 1)).sum()

    # Each ratio falls back to 0 when its denominator is empty.
    precision_metric = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall_metric = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * precision_metric * recall_metric / (precision_metric + recall_metric) if (precision_metric + recall_metric) > 0 else 0

    results = {
        'accuracy': accuracy,
        'auc': roc_auc,
        'pr_auc': pr_auc,
        'precision': precision_metric,
        'recall': recall_metric,
        'f1': f1
    }

    return results, all_preds, all_labels


In [None]:
# predict_match lives at module level (not inside the __main__ guard below)
# because recommend_partners calls it as a global name.
def predict_match(model, user1, user2, user_profiles_df, user2idx,
                major_mapping, communication_mapping, goal_mapping):
    """Make a prediction for a pair of users.

    Args:
        model: Trained scorer, called with ten single-row tensors.
        user1: Id of the user in the "user" role.
        user2: Id of the user in the "partner" role.
        user_profiles_df: DataFrame with a 'user_id' column, the six numeric
            feature columns and 'major', 'communication_style', 'goal'.
        user2idx: Mapping from user id to integer index.
        major_mapping: Mapping from major label to integer index.
        communication_mapping: Mapping from communication-style label to
            integer index.
        goal_mapping: Mapping from goal label to integer index.

    Returns:
        The model output for the pair as a Python float.
    """
    model.eval()

    # Get user indices
    u_idx = torch.tensor([user2idx[user1]])
    v_idx = torch.tensor([user2idx[user2]])

    # Get user profiles (first matching row for each id)
    user1_profile = user_profiles_df[user_profiles_df['user_id'] == user1].iloc[0]
    user2_profile = user_profiles_df[user_profiles_df['user_id'] == user2].iloc[0]

    # Get numeric features; unsqueeze(0) adds the batch dimension
    numeric_cols = ['age_norm', 'extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness']
    u_numeric = torch.tensor(user1_profile[numeric_cols].values.astype(np.float32)).unsqueeze(0)
    v_numeric = torch.tensor(user2_profile[numeric_cols].values.astype(np.float32)).unsqueeze(0)

    # Get categorical features
    u_major = torch.tensor([major_mapping[user1_profile['major']]])
    u_comm = torch.tensor([communication_mapping[user1_profile['communication_style']]])
    u_goal = torch.tensor([goal_mapping[user1_profile['goal']]])

    v_major = torch.tensor([major_mapping[user2_profile['major']]])
    v_comm = torch.tensor([communication_mapping[user2_profile['communication_style']]])
    v_goal = torch.tensor([goal_mapping[user2_profile['goal']]])

    with torch.no_grad():
        score = model(
            u_idx, v_idx,
            u_numeric, v_numeric,
            u_major, u_comm, u_goal,
            v_major, v_comm, v_goal
        ).item()

    return score


if __name__ == "__main__":
    # Load the data
    train_interactions_df = pd.read_csv('train_interactions.csv')
    val_interactions_df = pd.read_csv('val_interactions.csv')

    # Load feature mappings
    # NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only
    # load mapping files that this notebook itself wrote.
    major_mapping = np.load('major_mapping.npy', allow_pickle=True).item()
    communication_mapping = np.load('communication_mapping.npy', allow_pickle=True).item()
    goal_mapping = np.load('goal_mapping.npy', allow_pickle=True).item()

    # Encode users to indices (order of first appearance across both splits)
    all_interactions = pd.concat([train_interactions_df, val_interactions_df])
    unique_users = pd.unique(all_interactions[['user_id', 'partner_id']].values.ravel())
    user2idx = {uid: idx for idx, uid in enumerate(unique_users)}
    num_users = len(user2idx)

    # Create datasets and dataloaders
    train_dataset = ReciprocalDataset(
        train_interactions_df,
        user_profiles_df,
        user2idx,
        major_mapping,
        communication_mapping,
        goal_mapping
    )

    val_dataset = ReciprocalDataset(
        val_interactions_df,
        user_profiles_df,
        user2idx,
        major_mapping,
        communication_mapping,
        goal_mapping
    )

    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    # Create and train the model
    model = NRCF(
        num_users=num_users,
        num_majors=len(majors),
        num_comm_styles=len(communication_styles),
        num_goals=len(goals),
        numeric_dim=6,
        embed_dim=32,
        hidden_dims=(64, 32)
    )

    print(f"Training NRCF model with {num_users} users")
    print(f"Number of majors: {len(majors)}")
    print(f"Number of communication styles: {len(communication_styles)}")
    print(f"Number of goals: {len(goals)}")

    train_losses, val_accs, val_aucs = train_model(
        model,
        train_dataloader,
        val_dataloader,
        epochs=15,
        lr=1e-3
    )

    # Final evaluation
    print("\nFinal Model Evaluation:")
    results, preds, labels = evaluate_model(model, val_dataloader)
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")

    # Example of making predictions for a few user pairs
    print("\nSample Predictions:")
    sample_pairs = [
        (unique_users[0], unique_users[10]),
        (unique_users[20], unique_users[30]),
        (unique_users[40], unique_users[50])
    ]

    for user1, user2 in sample_pairs:
        score = predict_match(
            model, user1, user2, user_profiles_df, user2idx,
            major_mapping, communication_mapping, goal_mapping
        )

        # Test reciprocal score (same pair with the roles swapped)
        score_reverse = predict_match(
            model, user2, user1, user_profiles_df, user2idx,
            major_mapping, communication_mapping, goal_mapping
        )

        # Display profile details
        user1_profile = user_profiles_df[user_profiles_df['user_id'] == user1].iloc[0]
        user2_profile = user_profiles_df[user_profiles_df['user_id'] == user2].iloc[0]

        print(f"\nUser1: {user1}")
        print(f"  Major: {user1_profile['major']}")
        print(f"  Communication: {user1_profile['communication_style']}")
        print(f"  Goal: {user1_profile['goal']}")

        print(f"\nUser2: {user2}")
        print(f"  Major: {user2_profile['major']}")
        print(f"  Communication: {user2_profile['communication_style']}")
        print(f"  Goal: {user2_profile['goal']}")

        print(f"\n{user1} ↔ {user2}: {score:.4f} (match: {score > 0.5})")
        print(f"{user2} ↔ {user1}: {score_reverse:.4f} (match: {score_reverse > 0.5})")
        print(f"Difference in reciprocal scores: {abs(score - score_reverse):.4f}")

Training NRCF model with 1000 users
Number of majors: 24
Number of communication styles: 3
Number of goals: 6
Epoch 1 - Loss: 0.5027
Validation Accuracy: 0.8200, AUC: 0.7619
Epoch 2 - Loss: 0.3881
Validation Accuracy: 0.8260, AUC: 0.7597
Epoch 3 - Loss: 0.3610
Validation Accuracy: 0.8240, AUC: 0.7638
Epoch 4 - Loss: 0.3303
Validation Accuracy: 0.8190, AUC: 0.7656
Epoch 5 - Loss: 0.3017
Validation Accuracy: 0.8170, AUC: 0.7475
Epoch 6 - Loss: 0.2705
Validation Accuracy: 0.8260, AUC: 0.7560
Epoch 7 - Loss: 0.2327
Validation Accuracy: 0.8200, AUC: 0.7494
Epoch 8 - Loss: 0.1890
Validation Accuracy: 0.8140, AUC: 0.7480
Epoch 9 - Loss: 0.1498
Validation Accuracy: 0.8110, AUC: 0.7380
Epoch 10 - Loss: 0.1119
Validation Accuracy: 0.7920, AUC: 0.7296
Epoch 11 - Loss: 0.0753
Validation Accuracy: 0.8000, AUC: 0.7333
Epoch 12 - Loss: 0.0476
Validation Accuracy: 0.7980, AUC: 0.7259
Epoch 13 - Loss: 0.0324
Validation Accuracy: 0.7930, AUC: 0.7262
Epoch 14 - Loss: 0.0231
Validation Accuracy: 0.7950, A

In [None]:
def recommend_partners(model, user_id, user_profiles_df, user2idx,
                      major_mapping, communication_mapping, goal_mapping,
                      top_k=5, use_clusters=True):
    """Recommend the top-k partners for a given user.

    Args:
        model: Trained scorer passed through to ``predict_match``.
        user_id: Id of the user to recommend partners for.
        user_profiles_df: DataFrame with 'user_id' and 'cluster' columns plus
            the feature columns ``predict_match`` reads.
        user2idx: Mapping from user id to integer index.
        major_mapping: Mapping from major label to integer index.
        communication_mapping: Mapping from communication-style label to
            integer index.
        goal_mapping: Mapping from goal label to integer index.
        top_k: Maximum number of recommendations to return.
        use_clusters: When True (the default), only users in the same cluster
            as ``user_id`` are candidates. When False, every user in
            ``user_profiles_df`` is a candidate.

    Returns:
        A list of at most ``top_k`` ``(partner_id, score)`` tuples sorted by
        score in descending order. ``user_id`` itself is never included.
    """
    model.eval()

    if use_clusters:
        user_cluster = user_profiles_df[user_profiles_df['user_id'] == user_id]['cluster'].values[0]
        potential_partners = user_profiles_df[user_profiles_df['cluster'] == user_cluster]['user_id'].values
        print(f"Matching {user_id} with {len(potential_partners)} users in cluster {user_cluster}")
    else:
        # Previously this flag was accepted but ignored; honour it by
        # scoring against every profile.
        potential_partners = user_profiles_df['user_id'].values
        print(f"Matching {user_id} with {len(potential_partners)} users across all clusters")

    all_scores = []
    for other_id in potential_partners:
        if other_id == user_id:
            continue

        score = predict_match(
            model, user_id, other_id, user_profiles_df, user2idx,
            major_mapping, communication_mapping, goal_mapping
        )
        all_scores.append((other_id, score))

    # Sort by score in descending order
    all_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top-k recommendations
    return all_scores[:top_k]


# Show cluster-restricted recommendations for three sample users. After the
# loop, sample_user and cluster_recommendations hold the values from the last
# iteration.
print("\nSample Recommendations:")
for i in range(3):
    # Positions 0, 10 and 20 of unique_users
    sample_user = unique_users[i * 10]
    user_profile = user_profiles_df.loc[user_profiles_df['user_id'] == sample_user].iloc[0]

    print(f"\nRecommendations for {sample_user}:")
    print(f"  Major: {user_profile['major']}")
    print(f"  Communication: {user_profile['communication_style']}")
    print(f"  Goal: {user_profile['goal']}")
    print(f"  Cluster: {user_profile['cluster']}")
    print(
        f"  Study Preferences: {user_profile['preferredStudyLength']}min study, "
        f"{user_profile['preferredBreakLength']}min break, "
        f"{user_profile['todayStudyLength']}min today"
    )

    # Get cluster-based recommendations
    cluster_recommendations = recommend_partners(
        model, sample_user, user_profiles_df, user2idx,
        major_mapping, communication_mapping, goal_mapping,
        use_clusters=True
    )

    print("\nTop 5 recommended partners from same cluster:")
    for j, (partner_id, score) in enumerate(cluster_recommendations):
        partner_profile = user_profiles_df.loc[user_profiles_df['user_id'] == partner_id].iloc[0]
        print(f"{j+1}. {partner_id} (Score: {score:.4f})")
        print(f"   Major: {partner_profile['major']}")
        print(f"   Communication: {partner_profile['communication_style']}")
        print(f"   Goal: {partner_profile['goal']}")
        print(
            f"   Study Preferences: {partner_profile['preferredStudyLength']}min study, "
            f"{partner_profile['preferredBreakLength']}min break, "
            f"{partner_profile['todayStudyLength']}min today"
        )


Sample Recommendations:

Recommendations for user_166:
  Major: History
  Communication: Quiet Focus
  Goal: General
  Cluster: 2
  Study Preferences: 45min study, 23min break, 68min today
Matching user_166 with 246 users in cluster 2

Top 5 recommended partners from same cluster:
1. user_126 (Score: 1.0000)
   Major: History
   Communication: Interactive Conversation
   Goal: General
   Study Preferences: 112min study, 29min break, 11min today
2. user_439 (Score: 1.0000)
   Major: History
   Communication: Quiet Focus
   Goal: General
   Study Preferences: 55min study, 29min break, 214min today
3. user_647 (Score: 1.0000)
   Major: Theater
   Communication: Interactive Conversation
   Goal: General
   Study Preferences: 119min study, 21min break, 200min today
4. user_537 (Score: 1.0000)
   Major: Theater
   Communication: Interactive Conversation
   Goal: General
   Study Preferences: 81min study, 20min break, 89min today
5. user_476 (Score: 1.0000)
   Major: Chemistry
   Communicati

## Tier 3:

### Reranking

In [None]:
def rerank_recommendations(recommendations, user_id, user_profiles_df,
                          diversity_weight=0.2, similarity_weight=0.5, productivity_weight=0.3,
                          top_k=5):
    """Rerank recommended partners by a productivity score.

    The productivity score is 0.7 * goal completion rate
    (totalTickedGoals / totalGoals) plus 0.3 * session engagement
    (log-scaled totalSessions, capped at 1.0). It is the only ranking factor;
    the original model score is carried along but does not affect the order.

    Args:
        recommendations: Iterable of ``(partner_id, original_score)`` pairs.
        user_id: Id of the user being served. Not used by the current
            implementation.
        user_profiles_df: DataFrame with 'user_id', 'totalTickedGoals',
            'totalGoals' and 'totalSessions' columns.
        diversity_weight: Accepted for interface compatibility; not used by
            the current implementation.
        similarity_weight: Accepted for interface compatibility; not used by
            the current implementation.
        productivity_weight: Accepted for interface compatibility; not used
            by the current implementation.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` tuples
        ``(partner_id, productivity_score, original_score, productivity_score)``
        sorted by productivity score in descending order.
    """
    import numpy as np

    # Weights for productivity metrics
    goal_completion_weight = 0.7
    sessions_weight = 0.3

    # Normaliser for session engagement; constant, so computed once up front.
    max_sessions = 50
    log_max_sessions = np.log1p(max_sessions)

    # Restrict the profile table to the recommended partners
    partner_ids = [rec[0] for rec in recommendations]
    partner_profiles = user_profiles_df[user_profiles_df['user_id'].isin(partner_ids)]

    # Calculate reranked scores
    reranked_recommendations = []
    for partner_id, original_score in recommendations:
        partner_profile = partner_profiles[partner_profiles['user_id'] == partner_id].iloc[0]

        # Calculate goal completion rate (0 when the partner has no goals)
        goal_completion_rate = partner_profile['totalTickedGoals'] / partner_profile['totalGoals'] if partner_profile['totalGoals'] > 0 else 0

        # Calculate session engagement (log-scaled, capped at 1.0)
        session_engagement = min(np.log1p(partner_profile['totalSessions']) / log_max_sessions, 1.0)

        # Combine into a productivity score (the only factor for ranking now)
        productivity_score = (
            goal_completion_weight * goal_completion_rate +
            sessions_weight * session_engagement
        )

        # The productivity score fills both the "final" and "productivity" slots
        reranked_recommendations.append((partner_id, productivity_score, original_score, productivity_score))

    # Sort by productivity score in descending order
    reranked_recommendations.sort(key=lambda x: x[1], reverse=True)

    # Return top_k recommendations
    return reranked_recommendations[:top_k]


def display_reranked_recommendations(reranked_recommendations, user_profiles_df):
    """Print the reranked recommendations as a fixed-width table.

    Args:
        reranked_recommendations: Iterable of
            ``(partner_id, final_score, original_score, productivity_score)``
            tuples, already in display order.
        user_profiles_df: DataFrame with 'user_id', 'major',
            'communication_style', 'goal', 'totalSessions',
            'totalTickedGoals' and 'totalGoals' columns.
    """
    rule = "-" * 110
    print("\nReranked recommendations (optimized for productivity):")
    print(rule)
    print(f"{'Rank':<5} {'Partner ID':<10} {'Final':<10} {'Original':<10} {'Productivity':<12} "
          f"{'Major':<20} {'Communication':<15} {'Goal':<8} {'Sessions':<8} {'Goals %':<8}")
    print(rule)

    rank = 0
    for partner_id, new_score, original_score, productivity_score in reranked_recommendations:
        rank += 1
        matching_rows = user_profiles_df[user_profiles_df['user_id'] == partner_id]
        partner_profile = matching_rows.iloc[0]

        # Goal completion as a percentage; 0 when the partner has no goals
        total_goals = partner_profile['totalGoals']
        if total_goals > 0:
            goal_completion = partner_profile['totalTickedGoals'] / total_goals * 100
        else:
            goal_completion = 0

        # Major is cut to 19 characters so it fits its 20-wide column
        score_part = f"{rank:<5} {partner_id:<10} {new_score:.4f}   {original_score:.4f}   {productivity_score:.4f}     "
        profile_part = f"{partner_profile['major'][:19]:<20} {partner_profile['communication_style']:<15} "
        stats_part = f"{partner_profile['goal']:<8} {partner_profile['totalSessions']:<8} {goal_completion:.1f}%"
        print(score_part + profile_part + stats_part)


# Rerank and display the recommendations currently held in
# cluster_recommendations for sample_user (both are assigned earlier in the
# notebook).
reranked_recs = rerank_recommendations(cluster_recommendations, sample_user, user_profiles_df)
display_reranked_recommendations(reranked_recs, user_profiles_df)


Reranked recommendations (optimized for productivity):
--------------------------------------------------------------------------------------------------------------
Rank  Partner ID Final      Original   Productivity Major                Communication   Goal     Sessions Goals % 
--------------------------------------------------------------------------------------------------------------
1     user_736   0.8315   1.0000   0.8315     Theater              Quiet Focus     Brainstorming 22       84.6%
2     user_868   0.7803   1.0000   0.7803     Sociology            Quiet Focus     Brainstorming 21       77.8%
3     user_392   0.6421   1.0000   0.6421     Engineering          Quiet Focus     Brainstorming 45       50.0%
4     user_571   0.6213   1.0000   0.6213     Business Administra  Quiet Focus     Brainstorming 5        69.2%
5     user_788   0.1957   1.0000   0.1957     Philosophy           Quiet Focus     Brainstorming 12       0.0%
