In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'FINAL jester 2006-15.csv'  # Adjusted to correct file naming
jokes_path = 'Dataset3JokeSet.csv'
data = pd.read_csv(file_path)
jokes = pd.read_csv(jokes_path, header=None, names=["Joke_Text"])

# Replace 99 with NaN for unrated jokes
data = data.replace(99, np.nan)

# Separate user IDs and ratings
data_users = data.iloc[:, 0]  # User joke count column
data_ratings = data.iloc[:, 1:]



In [None]:
# Filter active users (threshold = 80 rated jokes)
threshold = 80
active_users_mask = data_ratings.notna().sum(axis=1) >= threshold
active_data = data_ratings[active_users_mask]
active_users = data_users[active_users_mask]

# Train-test split
train_data, test_data = train_test_split(active_data, test_size=0.2, random_state=42)

# Replace NaN with user-mean for better similarity computation
user_means = train_data.mean(axis=1)
train_filled = train_data.T.fillna(user_means).T

# Compute cosine similarity
similarity_matrix = cosine_similarity(train_filled)



In [None]:
# Function to predict ratings based on K most similar users
def predict_ratings(user_index, joke_index, k):
    # Get similarity scores for the user
    user_similarity = similarity_matrix[user_index]

    # Sort and pick top K similar users (excluding the user itself)
    similar_users = np.argsort(-user_similarity)[1 : k + 1]

    # Gather ratings from similar users for the specific joke
    ratings = train_data.iloc[similar_users, joke_index]
    similarities = user_similarity[similar_users]

    # Exclude NaN ratings
    mask = ~ratings.isna()
    ratings = ratings[mask]
    similarities = similarities[mask]

    if len(ratings) == 0:
        return user_means.iloc[user_index]  # Default to user's mean rating

    # Weighted average of ratings
    prediction = np.dot(ratings, similarities) / np.sum(similarities)
    return prediction



In [None]:
# Evaluate the model on the test set
def evaluate_model(k):
    predictions, actuals = [], []
    test_instances = []  # For storing test instances with original and predicted values

    for user_index in range(test_data.shape[0]):
        for joke_index in range(test_data.shape[1]):
            actual = test_data.iloc[user_index, joke_index]

            if not np.isnan(actual):
                prediction = predict_ratings(user_index, joke_index, k)
                if not np.isnan(prediction):
                    predictions.append(prediction)
                    actuals.append(actual)
                    test_instances.append((user_index, joke_index, actual, prediction))

    mae = mean_absolute_error(actuals, predictions)
    return mae, test_instances



In [None]:

# Baseline random recommender system
def random_recommender(k):
    predictions, actuals = [], []
    for user_index in range(test_data.shape[0]):
        for joke_index in range(test_data.shape[1]):
            actual = test_data.iloc[user_index, joke_index]

            if not np.isnan(actual):
                # Randomly select K users
                random_users = train_data.iloc[:, joke_index].dropna().sample(min(k, len(train_data.iloc[:, joke_index].dropna())), replace=False)

                if len(random_users) > 0:
                    random_prediction = random_users.mean()
                else:
                    random_prediction = np.nan  # If no valid ratings, set to NaN

                if not np.isnan(random_prediction):  # Only include valid predictions
                    predictions.append(random_prediction)
                    actuals.append(actual)

    return mean_absolute_error(actuals, predictions) if predictions else np.nan



In [None]:
# Recommend top N jokes for a user
def recommend_jokes(user_index, k, top_n=5):
    joke_predictions = []

    for joke_index in range(train_data.shape[1]):
        prediction = predict_ratings(user_index, joke_index, k)
        if not np.isnan(prediction):
            joke_predictions.append((joke_index, prediction))

    # Sort by predicted rating and pick top N
    joke_predictions = sorted(joke_predictions, key=lambda x: -x[1])[:top_n]
    return [(jokes.iloc[joke_id].Joke_Text, score) for joke_id, score in joke_predictions]

# Find top 2K users with similarity scores
def get_top_similar_users(k, top_n=2000):
    top_users = []

    for user_index in range(similarity_matrix.shape[0]):
        user_similarity = similarity_matrix[user_index]
        similar_users = np.argsort(-user_similarity)[1 : k + 1]
        scores = user_similarity[similar_users]
        top_users.append((user_index, list(zip(similar_users, scores))))

    return top_users[:top_n]



In [None]:
# Find optimal K
k_values = [5, 10, 20, 50]
model_results, baseline_mae = {}, {}

for k in k_values:
    mae, test_instances = evaluate_model(k)
    model_results[k] = {
        "mae": mae,
        "test_instances": test_instances,
    }
    baseline_mae[k] = random_recommender(k)



In [None]:
# Display results
print("K | Collaborative Filtering MAE | Random Recommender MAE")
for k in k_values:
    print(f"{k} | {model_results[k]['mae']:.4f} | {baseline_mae[k]:.4f}")


K | Collaborative Filtering MAE | Random Recommender MAE
5 | 5.7134 | 4.4255
10 | 5.6879 | 4.2382
20 | 5.6814 | 4.1436
50 | 5.6627 | 4.0863


In [None]:
# Print test instances for the last evaluated K
print("\nTest Data Instances (User Index, Joke Index, Original, Predicted):")
for instance in model_results[k_values[-1]]["test_instances"][:10]:
    print(instance)


Test Data Instances (User Index, Joke Index, Original, Predicted):
(0, 6, -8.34375, -4.154898970131304)
(0, 7, -8.34375, -0.7968891917150512)
(0, 12, -8.34375, -2.9924180330852805)
(0, 14, -6.90625, -0.3690117676660045)
(0, 15, 2.09375, 1.550551670673912)
(0, 16, 1.375, 1.0385968636707699)
(0, 17, -1.3125, 3.4644530107637705)
(0, 18, 3.90625, 3.2710170216507444)
(0, 20, 9.09375, 4.766570768147534)
(0, 21, 6.96875, 4.004287699191459)


In [None]:
# Recommend jokes for the first user in the test set
user_index = 5  # Replace with the desired user index from test_data
top_jokes = recommend_jokes(user_index, k=10, top_n=5)
print("\nTop Recommended Jokes:")
for joke, score in top_jokes:
    print(f"Score: {score:.2f}, Joke: {joke}")


Top Recommended Jokes:
Score: 7.27, Joke: A little boy goes to his dad and asks, "What is politics?"His dad says, "Well son, let me try to explain it this way: I'm the breadwinner of the family, so let's call me capitalism. Your Mom, she's the administrator of the money, so we'll call her the government. We're here to take care of your needs, so we'll call you the people. The nanny, we'll consider her the working class. And your baby brother, we'll call him the future. Now, think about that and see if that makes sense." So the little boy goes off to bed thinking about what dad had said. Later that night, he hears his baby brother crying, so he gets up to check on him. He finds that the baby has severely soiled his diaper. So the little boy goes to his parents' room and finds his mother sound asleep. Not wanting to wake her, he goes to the nanny's room. Finding the door locked, he peeks in the keyhole and sees his father in bed with the nanny. He gives up and goes back to bed. The next

In [None]:

# Display top 2K users with similarity scores
top_users = get_top_similar_users(k=10, top_n=2000)
print("\nTop 2K Users with Similarity Scores:")
for user_index, similarities in top_users[:10]:  # Display first 10 users
    print(f"User {user_index}: {similarities[:5]}")  # Display top 5 similar users for brevity



Top 2K Users with Similarity Scores:
User 0: [(598, 0.7780871229256645), (4440, 0.7332611382135881), (1644, 0.7255292209595938), (4119, 0.7220933686532582), (3109, 0.7207765247415778)]
User 1: [(2055, 0.653968033432101), (4718, 0.6309917564068408), (1644, 0.6301929245939322), (3041, 0.6176487779412523), (4930, 0.6133382549870979)]
User 2: [(2595, 0.3305844958932683), (2626, 0.31985771207293495), (4193, 0.3076461757332851), (1876, 0.292978694554563), (1908, 0.28612309715275197)]
User 3: [(463, 0.6388375473114427), (1683, 0.6348254728787144), (1723, 0.6312273701845401), (3015, 0.6120379244273043), (2164, 0.6116405342951784)]
User 4: [(3737, 0.8395978245391773), (3190, 0.8357058286316273), (2208, 0.8317445090771605), (3915, 0.8311641345712943), (522, 0.8309479832914495)]
User 5: [(1898, 0.8071276236922876), (1668, 0.7805575422355802), (226, 0.7659493582923735), (4850, 0.7649993734598965), (969, 0.7607281597429998)]
User 6: [(1179, 0.7986079435790834), (3197, 0.7628438725663185), (3027, 0