Loading of Datasets

In [None]:
import pandas as pd
import numpy as np

# Load the two input tables from CSV files located in the working directory.
# `courses_data` feeds every later cell shown here (it is expected to carry the
# 'Demeaned Rating', 'Review', 'Reviewer' and 'Course Name' columns);
# `job_data` is loaded but not used by the cells visible in this part of the notebook.
courses_data = pd.read_csv(filepath_or_buffer="final_users_courses.csv")
job_data = pd.read_csv(filepath_or_buffer="final_jobs.csv")


In [None]:
# Rescale the demeaned ratings by their largest absolute value so that the
# resulting 'Normalised Demeaned Rating' column has magnitudes of at most 1.
max_abs_value_courses = courses_data['Demeaned Rating'].abs().max()
courses_data['Normalised Demeaned Rating'] = (
    courses_data['Demeaned Rating'].div(max_abs_value_courses)
)

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon that the analyzer loads when it is instantiated.
nltk.download('vader_lexicon')

# One shared analyzer instance is reused for every review.
sia = SentimentIntensityAnalyzer()

# Score each entry of the 'Review' column with VADER's 'compound' value.
# Every entry is passed through str() first, so non-string cells (e.g. missing
# values) are scored as their string representation instead of raising.
courses_data['Sentiment_Score'] = courses_data['Review'].map(
    lambda review_text: sia.polarity_scores(str(review_text))['compound']
)

In [None]:
# Build a reviewer x course interaction matrix.
# Each review contributes one blended score:
#   0.4 * normalised demeaned rating + 0.6 * sentiment score.
grouped_data_train = courses_data.groupby('Reviewer')
user_course_interaction_train = {}

for reviewer, reviewer_rows in grouped_data_train:
    blended_scores = (
        0.4 * reviewer_rows['Normalised Demeaned Rating']
        + 0.6 * reviewer_rows['Sentiment_Score']
    )
    # dict() keeps the last score when a reviewer has several rows for the
    # same course, matching a row-by-row overwrite.
    user_course_interaction_train[reviewer] = dict(
        zip(reviewer_rows['Course Name'], blended_scores)
    )

# Rows are reviewers, columns are course names; a course the reviewer has no
# score for is filled with 0.
user_course_matrix = pd.DataFrame.from_dict(
    user_course_interaction_train, orient="index"
).fillna(0)

In [None]:
from collections import defaultdict

from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

# Keep a single row per (course, reviewer) pair; drop_duplicates retains the
# first occurrence. Repeated pairs would otherwise be fed to Surprise as
# several ratings of the same item by the same user.
courses_data_no_duplicates = courses_data.drop_duplicates(subset=['Course Name', 'Reviewer'])

# Load the data into Surprise format.
# The rating column was divided by its maximum absolute value, so it lies in [-1, 1].
reader = Reader(rating_scale=(-1, 1))
courses_data_surprise = Dataset.load_from_df(
    # Use the de-duplicated frame (previously it was computed but never used).
    courses_data_no_duplicates[['Reviewer', 'Course Name', 'Normalised Demeaned Rating']],
    reader,
)

# Hold out 10% of the ratings for testing; the seed makes the split repeatable.
train, test = train_test_split(courses_data_surprise, test_size=0.1, random_state=42)



In [None]:
# Build the item-based collaborative filtering model (Pearson similarity
# between courses, up to 10 neighbours per prediction).
model = KNNBasic(sim_options={'user_based': False, 'name': 'pearson'}, k=10)

# Train on the training split and predict every (user, course) pair of the test split.
model.fit(train)
predictions = model.test(test)

# Group the predictions by user as (course, estimated rating) pairs.
user_predictions = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    user_predictions[uid].append((iid, est))

# De-duplicate and sort each user's predictions by estimated rating, highest
# first, and STORE the result back in the mapping. (Rebinding only the loop
# variable would leave `user_predictions` unsorted.) dict.fromkeys removes exact
# duplicates while keeping first-seen order, and sorted() is stable, so ties
# come out in a deterministic order.
for uid in list(user_predictions):
    user_predictions[uid] = sorted(
        dict.fromkeys(user_predictions[uid]),
        key=lambda pred: pred[1],
        reverse=True,
    )

# Number of courses to recommend per user.
top_n = 5

# Top course names per user.
top_k_recommendations = {
    uid: [iid for iid, _ in user_preds[:top_n]]
    for uid, user_preds in user_predictions.items()
}

# The same recommendations in long form: one (user, course, score) row each.
recommendations = [
    (uid, iid, score)
    for uid, user_preds in user_predictions.items()
    for iid, score in user_preds[:top_n]
]

# Convert the list of recommendations to a DataFrame
df_recommendations = pd.DataFrame(recommendations, columns=['Reviewers', 'Course Name', 'Score'])

# Sort by user (ascending), then by score (descending) within each user.
df_recommendations = df_recommendations.sort_values(by=['Reviewers', 'Score'], ascending=[True, False])

df_recommendations

In [None]:
# Nested mapping: user id -> {course name -> estimated rating}.
score_matrix = defaultdict(dict)

# Populate the score matrix with the test-set predictions.
for uid, iid, true_r, est, _ in predictions:
    score_matrix[uid][iid] = est

# Raw course names known to the training set. `train` is a Surprise Trainset,
# which cannot be indexed like a DataFrame; its items are enumerated as inner
# ids and mapped back to the raw ids. Computed once, outside the per-user loop.
train_course_ids = [train.to_raw_iid(inner_iid) for inner_iid in train.all_items()]

# Give every user an entry for every training course; courses without a
# prediction get None as a placeholder.
for course_scores in score_matrix.values():
    for course_id in train_course_ids:
        course_scores.setdefault(course_id, None)

# Convert to a DataFrame (outer keys become columns), transpose so users are
# rows and courses are columns, sort by user id, and replace missing scores
# with 0. The final cast keeps the matrix numeric even if the None placeholders
# caused object-typed columns.
df_score_matrix = (
    pd.DataFrame(score_matrix)
    .T
    .sort_index()
    .fillna(0)
    .astype(float)
)

In [None]:
from surprise.model_selection import KFold
from surprise.model_selection import RandomizedSearchCV
from surprise import KNNBasic

# Candidate hyperparameters. 'user_based': [False] keeps the search on the
# item-based variant of KNNBasic, i.e. the same kind of model that is fitted
# for the recommendations; without it Surprise defaults to user-based
# similarities and the search would tune a different model.
param_grid = {
    'k': [10, 20],
    'sim_options': {'name': ['pearson'], 'user_based': [False]},
}

# Randomized search scored with FCP over 3 folds. Both the fold shuffling and
# the parameter sampling are seeded so the search is repeatable. With two
# candidate settings and n_iter=2, every candidate is evaluated.
grid_search = RandomizedSearchCV(
    KNNBasic,
    param_grid,
    measures=['fcp'],
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    n_iter=2,
    random_state=42,
)

# Search over the same Surprise Dataset that the train/test split was drawn
# from, instead of rebuilding an identical one from the DataFrame.
entire_data = courses_data_surprise

# Run the search
grid_search.fit(entire_data)

# Best hyperparameters according to FCP
best_params = grid_search.best_params['fcp']
print("Best hyperparameters:", best_params)


In [None]:
from surprise import accuracy

# Value of k (Top 5 Recommendations).
# NOTE: none of the metrics below depend on k; they are computed over all
# test-set predictions, so k is kept only as a named constant.
k = 5

# verbose=False: each metric is printed exactly once, by the labelled print below.

# Fraction of Concordant Pairs (a ranking-agreement measure, not precision@k)
fcp = accuracy.fcp(predictions, verbose=False)
print("FCP on test predictions:", fcp)

# Mean Absolute Error
mae = accuracy.mae(predictions, verbose=False)
print("MAE on test predictions:", mae)

# Mean Squared Error
mse = accuracy.mse(predictions, verbose=False)
print("MSE on test predictions:", mse)
