Loading of Datasets

In [68]:
import pandas as pd
import numpy as np

# Catalogue tables: one CSV for courses, one for jobs.
courses_data, job_data = (
    pd.read_csv(csv_path) for csv_path in ("final_courses.csv", "final_jobs.csv")
)

# Review splits, read in the order test -> train -> eval.
# NOTE: the name `eval` shadows Python's built-in eval() for the rest of the
# notebook; it is kept because later cells refer to the frame by this name.
test, train, eval = (
    pd.read_csv(csv_path) for csv_path in ("test.csv", "train.csv", "eval.csv")
)


In [70]:
def scale_by_max_abs(values):
    """Scale a numeric Series by its largest absolute value.

    Parameters
    ----------
    values : pandas.Series
        Numeric values to scale.

    Returns
    -------
    (pandas.Series, scalar)
        The scaled values and the maximum absolute value that was found.
        When that maximum is exactly 0 the values are divided by 1 instead,
        so an all-zero column stays 0 rather than becoming NaN (0 / 0).
    """
    max_abs = values.abs().max()
    # A NaN maximum (empty or all-NaN column) compares unequal to 0 and is
    # passed through unchanged, so the result is NaN exactly as before.
    divisor = max_abs if max_abs != 0 else 1.0
    return values / divisor, max_abs


# Each frame is scaled by its OWN maximum, so the same raw demeaned rating can
# map to different normalised values in different frames.
test['Normalised Demeaned Rating'], max_abs_value_test = scale_by_max_abs(test['Demeaned Rating'])

train['Normalised Demeaned Rating'], max_abs_value_train = scale_by_max_abs(train['Demeaned Rating'])

eval['Normalised Demeaned Rating'], max_abs_value_eval = scale_by_max_abs(eval['Demeaned Rating'])

courses_data['Normalised Demeaned Rating'], max_abs_value_courses = scale_by_max_abs(courses_data['Demeaned Rating'])

In [37]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon before the analyzer is instantiated.
nltk.download('vader_lexicon')

# Single analyzer instance shared by every review split.
sia = SentimentIntensityAnalyzer()


def _compound_score(review):
    """Return VADER's 'compound' polarity for one review, coerced to str first."""
    return sia.polarity_scores(str(review))['compound']


# Add a 'sentiment_score' column to each split from its 'reviews' column.
for reviews_frame in (train, test, eval):
    reviews_frame['sentiment_score'] = reviews_frame['reviews'].apply(_compound_score)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [38]:
def combine_scores_per_user(grouped_reviews, rating_weight=0.4, sentiment_weight=0.6):
    """Blend rating and sentiment into one interaction score per (user, course).

    Parameters
    ----------
    grouped_reviews : pandas GroupBy
        Reviews grouped by user. Each group must provide the columns 'name',
        'Normalised Demeaned Rating' and 'sentiment_score'.
    rating_weight, sentiment_weight : float
        Weights of the two components in the blended score.

    Returns
    -------
    dict
        ``{user: {course_name: score}}``. If a user has several rows for the
        same course, the last row wins.
    """
    interactions_by_user = {}
    for user, user_rows in grouped_reviews:
        # Vectorised per-user blend instead of a row-by-row iterrows() loop.
        blended = (rating_weight * user_rows['Normalised Demeaned Rating']
                   + sentiment_weight * user_rows['sentiment_score'])
        interactions_by_user[user] = dict(zip(user_rows['name'], blended))
    return interactions_by_user


def interactions_to_matrix(interactions_by_user):
    """Turn ``{user: {course: score}}`` into a users x courses DataFrame.

    Courses a user has no score for are filled with 0.
    """
    return pd.DataFrame.from_dict(interactions_by_user, orient="index").fillna(0)


# Create a user-course interaction matrix (Train)
grouped_data_train = train.groupby('reviewers')
user_course_interaction_train = combine_scores_per_user(grouped_data_train)
user_course_matrix = interactions_to_matrix(user_course_interaction_train)

# Create a user-course interaction matrix (Test)
grouped_data_test = test.groupby('reviewers')
user_course_interaction_test = combine_scores_per_user(grouped_data_test)
user_course_matrix_test = interactions_to_matrix(user_course_interaction_test)

# Create a user-course interaction matrix (Eval)
grouped_data_eval = eval.groupby('reviewers')
user_course_interaction_eval = combine_scores_per_user(grouped_data_eval)
user_course_matrix_eval = interactions_to_matrix(user_course_interaction_eval)

In [None]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from collections import defaultdict

# Number of recommendations kept per user.
TOP_N = 5

# Load the training data into Surprise format.
# The normalised ratings were divided by their maximum absolute value, so they
# fall within [-1, 1].
reader = Reader(rating_scale=(-1, 1))
train_data = Dataset.load_from_df(train[['reviewers', 'name', 'Normalised Demeaned Rating']], reader)

# Build the training set
trainset = train_data.build_full_trainset()

# Build the item-based collaborative filtering model
model = KNNBasic(sim_options={'user_based': False})

# Train the model on the training set
model.fit(trainset)

# Convert the test DataFrame into a list of (user, item, rating) tuples
testset = list(test[['reviewers', 'name', 'Normalised Demeaned Rating']].itertuples(index=False, name=None))

# Make predictions on the test set
predictions = model.test(testset)

# Group the predictions by user
user_predictions = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    user_predictions[uid].append((iid, est))

# De-duplicate each user's predictions and order them by estimated rating,
# highest first. The result is written back into user_predictions: rebinding
# only the loop variable would leave the stored lists unsorted, and the
# "top N" below would just be the first N predictions encountered.
# Ties are broken by item id so the ordering is reproducible between runs.
for uid in list(user_predictions):
    user_predictions[uid] = sorted(
        set(user_predictions[uid]),
        key=lambda pred: (-pred[1], str(pred[0])),
    )

# Top-N course names for each user
top_k_recommendations = {
    uid: [iid for iid, _ in user_preds[:TOP_N]]
    for uid, user_preds in user_predictions.items()
}

# The same top-N picks as flat (user, course, score) records
recommendations = [
    (uid, iid, score)
    for uid, user_preds in user_predictions.items()
    for iid, score in user_preds[:TOP_N]
]

# Convert the list of recommendations to a DataFrame
df_recommendations = pd.DataFrame(recommendations, columns=['reviewers', 'name', 'Score'])

# Sort the recommendations by user, then by score in descending order
df_recommendations = df_recommendations.sort_values(by=['reviewers', 'Score'], ascending=[True, False])

df_recommendations




In [109]:
from surprise import accuracy

# Value of k (Top 5 Recommendations).
# NOTE: none of the metrics below depend on k; each is computed over the full
# `predictions` list, so they are reported without an "at k" label.
k = 5

# verbose=False: each metric is printed once, by the labelled print() below,
# instead of once by Surprise and once more here.

# Calculate FCP (Fraction of Concordant Pairs). This is a ranking-agreement
# measure, not precision@k.
fcp = accuracy.fcp(predictions, verbose=False)

print("FCP (fraction of concordant pairs):", fcp)

# Calculate MAE
mae = accuracy.mae(predictions, verbose=False)

print("MAE:", mae)

# Calculate MSE
mse = accuracy.mse(predictions, verbose=False)

print("MSE:", mse)


FCP:  0.5027
Precision Score at k = 5 : 0.5027134100641967
MAE:  0.1304
MAE Score at k = 5 : 0.13043135022567037
MSE: 0.0528
MSE Score at k = 5 : 0.05283062699257813
