### Importing Libraries 

In [1]:
import pandas as pd

from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import BaselineOnly
from surprise import CoClustering
from surprise.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances

import matplotlib.pyplot as plt
import matplotlib
import numpy as np

from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

import squarify
import seaborn as sns


import random
import tiktoken
import openai
import pickle


from typing import List, Dict, Tuple
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
import warnings
warnings.filterwarnings("ignore")


### Accessing the datasets

In [2]:
#accessing the user_id profile datasets
user_profile_dataset = pd.read_csv("./cleanedDatasets/cleaned_user_profile.csv", encoding= 'unicode_escape')
user_profile_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [3]:
#accessing the course dataset
course_dataset = pd.read_csv("./cleanedDatasets/cleaned_course_dataset.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [4]:
#accessing the user_id rating
user_rating_dataset = pd.read_csv("./cleanedDatasets/cleaned_user_ratings.csv", encoding= 'unicode_escape')
user_rating_dataset.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Content Based Filtering

In [5]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cb_course_dataset = pd.DataFrame()
cb_course_dataset['sr_'] = course_dataset['sr_']
cb_course_dataset['campus'] = course_dataset['campus'] 
cb_course_dataset['degree_1'] = course_dataset['degree_1']
cb_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cb_course_dataset['key_skills_str'] = course_dataset['key_skills_str']


#store the career objective and key skills in description
cb_course_dataset['description'] = cb_course_dataset['degree_1'] + cb_course_dataset['degree_1_specializations'] + cb_course_dataset['key_skills_str']
cb_course_dataset.head()

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA


In [6]:
#for user_id
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cb_user_dataset = pd.DataFrame()
cb_user_dataset['userid'] = user_profile_dataset['userid']
cb_user_dataset['degree_1'] = user_profile_dataset['degree_1']
cb_user_dataset['degree_1_specializations'] = user_profile_dataset['degree_1_specializations']
cb_user_dataset['career_objective'] = user_profile_dataset['career_objective']
cb_user_dataset['key_skills_str'] = user_profile_dataset['key_skills_str']


#combine the fields which are essential into one called - description
cb_user_dataset['description'] = cb_user_dataset['degree_1'] + cb_user_dataset['degree_1_specializations'] + cb_user_dataset['career_objective'] + cb_user_dataset['key_skills_str']
cb_user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E.Computer Science EngineeringComputer Engin...
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E.Computer Science EngineeringInterested in ...
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E.Computer Science EngineeringUnknownUnknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E.Computer Science EngineeringCurrently a fi...
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E.Computer Science EngineeringTo have a grow...


In [7]:
#combining the descirption of the user_id and the course in a new pandas dataframe - so we can compare them later on
cbf_combined_data_ds = pd.DataFrame()
cbf_combined_data_ds['description'] = cb_user_dataset['description']+ cb_course_dataset['description']
cbf_combined_data_ds.head()

Unnamed: 0,description
0,B.E.Computer Science EngineeringComputer Engin...
1,B.E.Computer Science EngineeringInterested in ...
2,B.E.Computer Science EngineeringUnknownUnknown...
3,B.E.Computer Science EngineeringCurrently a fi...
4,B.E.Computer Science EngineeringTo have a grow...


In [8]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(cbf_combined_data_ds['description'].values.astype('U'))


In [9]:
#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)


In [10]:
#create indices for the courses is using series
cf_course_dataset = cb_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

In [11]:
#function to get content-filtered recommendations
def get_course_content_based_recommendations(user_id): 
    
    #get index of course
    idx = indices[user_id]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [12]:
#getting the similar course recs for user_id 1001
get_course_content_based_recommendations(1001).head(5)

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
996,996,1997,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E. Electronics Telecommunication Engineerin...
93,93,1094,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E. MechanicalAutoCAD, PROE"
256,256,1257,MITAOE,B.E.,Mechanical,"ProE,CATIA","B.E. MechanicalProE,CATIA"
653,653,1654,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."


In [13]:
#viewing the user_id details 
user_profile_dataset[user_profile_dataset["userid"] == "1001"]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [14]:
#viewing the recommended courses
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,1895,B.E.,Computer Science Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [15]:
#viewing the user_id rating scores for the course 1895
user_rating_dataset[user_rating_dataset["course_id"] == 1895]

Unnamed: 0,course_id,user_id,rating


In [16]:
user_rating_dataset[user_rating_dataset["user_id"] == 1001]

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
20,2002,1001,3
40,2003,1001,4
60,2004,1001,2
80,2005,1001,0
...,...,...,...
8377,2420,1001,0
8397,2421,1001,4
8417,2422,1001,0
8437,2423,1001,2


### Collaborative Filtering 

Using a library surprise that provides us access to these techniques. Each technique is evaluated and the results will determine which technique is best to use. The technique chosen is based on the RMAE and MAE scores. 

In [17]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(user_rating_dataset[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

In [18]:
#create dict for different memory-based  and model based recommendation algorithms
recommendation_algorithms =[KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(), SVD(), BaselineOnly(), CoClustering() ] 
results = {} #to store the scores

In [19]:
for algorithm in recommendation_algorithms:
    #kfold set to 5
    #using three different metrics for evaluation: Mean Absolute Error ,Mean Squared Error, Root Mean Squared Error
    crossval_scores = cross_validate(algorithm, data, measures=["MAE", "MSE", "RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_mse': 'MSE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(algorithm).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE

Unnamed: 0,MAE,MSE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.102855,1.859151,1.363458,0.032239,0.004016
knns.KNNBaseline,1.286029,2.438289,1.561061,0.003763,0.019938
knns.KNNWithMeans,1.285322,2.446193,1.563923,0.003193,0.018829
co_clustering.CoClustering,1.262878,2.459639,1.568049,0.067064,0.003304
knns.KNNWithZScore,1.287788,2.470232,1.571589,0.003634,0.017459
knns.KNNBasic,1.321936,2.546868,1.595771,0.00205,0.011909
baseline_only.BaselineOnly,1.360657,2.62143,1.619057,0.001736,0.002734


#### SVD - for the collaborative filtering recommendation since it obtained the smallest values regarding the MAE and RMSE values. 

In [20]:
#recommends courses to a user_id based on their previous interactions (ratings) with a dataset of courses, using a method called Singular Value Decomposition (SVD) since it was the best technique. Note 
#uses a machine learning model to recommend new courses to a user_id based on their past course ratings, ensuring that only courses the user_id hasn't already completed are recommended.
def collaborative_filter_recommendations(user_id):
    
    # model is designed to learn from the ratings data to make good course recommendations.
    #using the "best" parameters found using the gridsearchcv method from testing collaborative filtering techniques notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset) #It trains (or "fits") the SVD model on a dataset (trainset) that includes user_id-course ratings, helping the model learn about user_id preferences.
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()  #creates a "testset" of all possible user_id-item (course) pairs that are not present in the training
    #it's considering courses the user_id hasn't rated yet. The model then predicts ratings for these courses.
    predictions = model.test(testset)
    
    #predictions made by the model are saved 
    rating_predictions = pd.DataFrame(predictions) # contains the predicted ratings for each course for each user_id.
    #print(rating_predictions)
    
    #Filter predictions for the specified user_id
    user_predictions = rating_predictions[rating_predictions['uid'] == user_id].\
                         sort_values(by="est", ascending = False) # sorts these predictions by the estimated rating (est) in descending order,
        
    #Exclude courses the user_id has already completed (assuming they are rated 1 to 5)  
    final_courserecs = [] #to store the final course recs
    
    #checks to ensure it doesn't recommend courses the user_id has already completed and rated.
    #uid is the user_id and iid is the course id
    for i, row in user_predictions.iterrows():
        #get rating details
        rating_details = user_rating_dataset[(user_rating_dataset['course_id'] == row[1]) & (user_rating_dataset['user_id'] == user_id)]
        
        #specifically looking for courses that the user_id hasn't rated yet (rating is 0).
        if rating_details["rating"].values[0] == 0:
            #print("Not rated!")
            #Compile final recommendations
            final_courserecs.append(row[1])
    
    #Return the recommendations
    return(final_courserecs)

In [21]:
#obtaining the recommendations for the user_id passed as an argument. 
collaborative_filter_recommendations(1001)

[2294,
 2272,
 2134,
 2016,
 2385,
 2357,
 2241,
 2390,
 2370,
 2420,
 2196,
 2316,
 2136,
 2336,
 2407,
 2275,
 2226,
 2302]

In [22]:
#checking if the collaborative_filter_recommendations function works if the user_id was to input a course id 
collaborative_filter_recommendations("2294")# an empty list indicates that it does not work 

[]

In [23]:
#checking if the collaborative_filter_recommendations function works if the user_id was to input a course id 
collaborative_filter_recommendations(2294) # an empty list indicates that it does not work 

[]

In [24]:
#checking the details 
user_rating_dataset[user_rating_dataset['user_id'] == 1001]

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
20,2002,1001,3
40,2003,1001,4
60,2004,1001,2
80,2005,1001,0
...,...,...,...
8377,2420,1001,0
8397,2421,1001,4
8417,2422,1001,0
8437,2423,1001,2


In [25]:
#checking the course rating
user_rating_dataset[user_rating_dataset['course_id'] == 2294] #the value 0 indicates that the user_id was recommended a course which they have yet not rated. 

Unnamed: 0,course_id,user_id,rating
5858,2294,1001,0
5859,2294,1002,5
5860,2294,1003,5
5861,2294,1004,4
5862,2294,1005,3
5863,2294,1006,5
5864,2294,1007,4
5865,2294,1008,3
5866,2294,1009,3
5867,2294,1010,5


### LLM Using GPT4 

To use the LLM, couldn't directly encode with the models provided through the access keys therefore, TfidVectorizer and Cosine Similarity were used. 

In [26]:
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")

#set up OpenAI API credentials
openai.api_key = "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"

In [27]:
#for user_id
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
llm_user_dataset = pd.DataFrame()
llm_user_dataset['userid'] = user_profile_dataset['userid']
llm_user_dataset['degree_1'] = user_profile_dataset['degree_1']
llm_user_dataset['degree_1_specializations'] = user_profile_dataset['degree_1_specializations']
llm_user_dataset['career_objective'] = user_profile_dataset['career_objective']
llm_user_dataset['key_skills_str'] = user_profile_dataset['key_skills_str']
llm_user_dataset.head(5)

#store the career objective and key skills in description
llm_user_dataset['description'] = llm_user_dataset['degree_1'] + " "+ llm_user_dataset['degree_1_specializations'] 
llm_user_dataset.head(5)

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E. Computer Science Engineering
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E. Computer Science Engineering
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E. Computer Science Engineering
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E. Computer Science Engineering
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E. Computer Science Engineering


In [28]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
llm_course_dataset = pd.DataFrame()
llm_course_dataset['sr_'] = course_dataset['sr_']
llm_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
llm_course_dataset['degree_1'] = course_dataset['degree_1']
llm_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
llm_course_dataset['key_skills_str'] = course_dataset['key_skills_str']

#store the career objective and key skills in description
llm_course_dataset['description'] = llm_course_dataset['degree_1'] + llm_course_dataset['degree_1_specializations'] 
llm_course_dataset.head(5)

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E. Mechanical
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical


In [29]:
#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = llm_user_dataset['description'] + llm_course_dataset['description']
comb.head(5)

Unnamed: 0,description
0,B.E. Computer Science EngineeringB.E. Mechanical
1,B.E. Computer Science EngineeringB.E. Mechanical
2,B.E. Computer Science EngineeringB.E. Mechanical
3,B.E. Computer Science EngineeringB.E. Mechanical
4,B.E. Computer Science EngineeringB.E. Mechanical


In [30]:
#load and preprocess data
data = llm_course_dataset

corpus = data["description"].tolist()

#extract features using TF-IDF --- not directly using LLM as with the prompts it did not work well
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X)

In [31]:
#generate recommendations based on user_id input
def llm_recommendations(user_id):
    course_recommendations = []
    
    user_input = llm_user_dataset["description"].loc[llm_user_dataset["userid"] == user_id].values[0] 
    
    user_index = corpus.index(user_input)
    recommendations = similarity_matrix[user_index].argsort()[:-6:-1]
    
    
    #Print top recommendations
    #print("Top 5 Recommendations:")
    for i, index in enumerate(recommendations):
        #print(f"{i+1}. {data.loc[index]['sr_']}: {data.loc[index]['description']}")
        course_recommendations.append(data.loc[index]['sr_'])
    
    #return the course ids
    return course_recommendations

In [32]:
llm_recommendations("1001")

[7408, 2954, 8949, 8948, 8947]

In [33]:

def evaluate_recommendations_success_rate(user_id,recommendations):
    #defining a list for the success_rate - to store the calculated success rate for each recommended course
    success_rate = []
    
    #printing positions of column names
    #print("Degree 1 position", recommendations.columns.get_loc("degree_1"))
    #print("Degree 1 Specialization position", recommendations.columns.get_loc("degree_1_specializations"))
    #print("Key Skills position", recommendations.columns.get_loc("key_skills_str"))
    
    #getting the user_id's required information such as the user_id's degree, specialization and key skills as these are crucial aspects of the user_id's academic and profession profile
    user_degree = user_profile_dataset["degree_1"].loc[user_profile_dataset["userid"] == str(user_id)].values[0] #getting the user_id's degree
    user_specialization = user_profile_dataset["degree_1_specializations"].loc[user_profile_dataset["userid"] == str(user_id)].values[0] #getting the user_id's specialization
    user_key_skills = user_profile_dataset["key_skills_str"].loc[user_profile_dataset["userid"] == str(user_id)].values[0] #getting the user_id's key skills

    #formatting this information into a list to make it easy for comprassion
    user_degree = user_degree.split("-")
    user_specialization = user_specialization.split("-") 
    user_key_skills = user_key_skills.split(",") #key skills are split into individual skills
    user_key_skills = (map(lambda x: x.lower().strip(), user_key_skills)) # unnecessary spaces or case differences are removed to standardize the data
    user_key_skills = list(user_key_skills)

    #iterating through the recommendations and gathers the following info degree, specialization and key skills - to see what the course demands from the students
    #also comparing the user_id details with the recommendations
    for index, row in recommendations.iterrows(): # iterates over each recommended course (contained in the recommendations DataFrame)
        #print("Checking course: ",index)
        #print("Row details: ", row)
        
        #initializing the score components as they are a structured way to quantify how well a user_id's profile matches the requirements of a recommended course
        alignment_score_for_degree = 0 #assesses whether the user_id's educational background (degree) aligns with what the course requires. It's fundamental because 
        #having the appropriate degree often forms the base level of preparedness for a course.
        
        alignment_score_for_specialization = 0 # indicates whether the user_id has the specific knowledge or focus area that the course is designed for
        
        alignment_score_for_key_skills = 0 #relates to the practical abilities and competencies the course aims to build or requires as a prerequisite


        #getting the course's required information such as the degree, specialization and key skills as these are crucial aspects of the course's academic and profession profile
        course_degree = row[1] #getting the course's degree
        course_specialization = row[2]
        course_key_skills = row[4]

        #formatting this information into a list to make it easy for comprassion
        course_degree = course_degree.split("-")
        course_specialization = course_specialization.split("-")
        course_key_skills = course_key_skills.strip().split(",")
        course_key_skills = (map(lambda x: x.lower().strip(), course_key_skills))
        course_key_skills = list(course_key_skills)

        #deteriming the weights/ percentages of the importance of the degree, specialization and key skills
        #weight_degree, weight_specialization, weight_key_skills = determine_weights(len(course_key_skills))

        #weight/percentage depending on the academic success

        #highest percentage because possessing the specific skills required by a course is often the most direct predictor of a student's ability to succeed and engage 
        #with the course material. Skills are practical and applicable, making this component critical for matching users to courses where they can excel and benefit the most.
        percentage_key_skills = 50 

        # It's weighted less than key skills because while specialization relevance is important, the specific skills the user_id possesses are a more direct indicator of their 
        #ability to perform specific tasks or understand certain topics within the course.
        percentage_specialization = 35


        #having a relevant degree provides a good foundation, it's considered less directly predictive of success in a specific course than the user_id's specialized areas of study or their practical skills.
        percentage_degree = 15

        #degree and specialization meatch 
        if(user_degree == course_degree):
            alignment_score_for_degree = percentage_degree
        if(user_specialization == course_specialization):
            alignment_score_for_specialization = percentage_specialization

        #key skills comparision
        percentage_temp = percentage_key_skills/len(course_key_skills) #obtaining the percentage of each individual key skill

        #iterating through each course key skills
        for skill in course_key_skills:
            if skill == "unknown":
                print("There is not enough information")
            if skill == "c cpp":
                skill = "c"
                course_key_skills.append("cpp")

                #adjusting the percentage 
                percentage_temp = percentage_key_skills/len(course_key_skills)

            if skill == "c programming":
                skill = "c"
                
            if skill in user_key_skills:
                alignment_score_for_key_skills += percentage_temp

        #calculating the total score
        total_score = round(alignment_score_for_degree + alignment_score_for_specialization + alignment_score_for_key_skills, 2)
        #print("Course: ",index," Success Rate: ",total_score,"%")
        success_rate.append(total_score)

    return success_rate

In [34]:
#removes courses that the user_id has already taken by checking the ratings in the ratings dataframe
def drop_taken_courses(user_id, cf):
    finalcourserecs = [] #to store the final course recs

    for i, row in cf.iterrows():
        #print(row[1])

        if row[1] in user_rating_dataset["course_id"].values:
            #print("Course is found in the ratings dataframe!")
            #check if rated
            rating_dets = user_rating_dataset[(user_rating_dataset['course_id'] == row[1]) & (user_rating_dataset['user_id'] == user_id)]
            #print(rating_dets)
            if rating_dets["rating"].values[0] != 0:
                cf.drop([i], inplace=True)
                finalcourserecs.append(row[1])
    return cf

In [35]:
def get_course_details(svd_courses):
    # DataFrame for final recommendations 
    finalRecs = pd.DataFrame()
    
    # Iterate over the list of courses
    for i in svd_courses:
        # If course is found in course dataset, then store details in finalRecs DataFrame
        if i in course_dataset["sr_"]:
            course_details = course_dataset[course_dataset["sr_"] == i]
            
            # Use concat
            finalRecs = pd.concat([finalRecs, course_details], ignore_index=True)
    
    # # Drop unnecessary columns --- commented since the data is already cleaned
    # finalRecs.drop(["Unnamed: 0", "key_skills"], axis=1, inplace=True)
    
    # Return useful information
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]


In [36]:
#hybrid recommender was adjusted to the final recommender and all developed algorithms are called here
def final_recommender(user_id):
    
    svd_courses = [] #pd.DataFrame()
    user_recs = pd.DataFrame()
    course_recs = pd.DataFrame()
    
    #get collaborative filtering recommendations
    svd_courses = collaborative_filter_recommendations(user_id)
    
    #get content based filtering recommendations
    cbfilter_recs = get_course_content_based_recommendations(user_id)
    
    #get llm recommendations
    llm_courses = llm_recommendations(str(user_id))
    #llm_recs = pd.DataFrame(llm_recs1, columns=['sr_'])
    llm_recs = get_course_details(llm_courses)
    

    #adjust columns as necessary
    cbfilter_recs = cbfilter_recs.loc[:, ["sr_","degree_1","degree_1_specializations","campus", "key_skills_str", "index","description"]]

    #if there are no similar users with ratings for the svd, recommendations will be made solely on content-based filtering
    if len(svd_courses) == 0:
        #there are no svd recs
        have_svd = False
        print("No Collaborative Filtering Recommednations!")
    else:
        #get course details of svd recs
        have_svd = True
        
        #check and drop courses if taken by user_id 
        cbfilter_recs = drop_taken_courses(user_id, cbfilter_recs)
        llm_recs = drop_taken_courses(user_id, llm_recs)

        svd_recs = get_course_details(svd_courses)
        
    #making a filter column to show where the recommendations were generated from
    cbfilter_recs["Algorithm"] = "Content-Based"
    llm_recs["Algorithm"] = "LLM (GPT-4)"

    
    #calculate the success rates and save in Success Rate column
    cbf_success_rate = evaluate_recommendations_success_rate(str(user_id), cbfilter_recs)     
    cbfilter_recs['Success Rate'] = cbf_success_rate
    
    llm1_success_rate = evaluate_recommendations_success_rate(str(user_id), llm_recs)  
    llm_recs['Success Rate'] = llm1_success_rate
    
    
    #sort to get the courses with the highest success rate
    cbfilter_recs = cbfilter_recs.sort_values(by=['Success Rate'], ascending=False) 
    llm_recs = llm_recs.sort_values(by=['Success Rate'], ascending=False)


    #displaying
    #get final recommendations using concat to get the top 5 recs from both recommenders
# Get final recommendations using concat to get the top 5 recs from both recommenders
    if have_svd == True:
        svd_recs["Algorithm"] = "Collaborative"
        svd_success_rate = evaluate_recommendations_success_rate(str(user_id), svd_recs)
        svd_recs['Success Rate'] = svd_success_rate
        svd_recs = svd_recs.sort_values(by=['Success Rate'], ascending=False)

        final = pd.concat([cbfilter_recs.iloc[:5], svd_recs.iloc[:5]], ignore_index=True, sort=False)
    else:
        final = cbfilter_recs.head(10)

    # Concatenate llm_recs
    final = pd.concat([final, llm_recs.iloc[:2]], ignore_index=True, sort=False)

    # Sort the entire final DataFrame by 'Success Rate' in descending order
    final = final.sort_values(by='Success Rate', ascending=False)

    #arraning the display 
    final = final.rename(columns = {final.columns[0]: "Course Code"})
    final.set_index("Course Code", inplace = True)

    #drop unnecessary colummns
    final.drop(["index", "description"], axis = 1, inplace = True)


    #rename other columns
    final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills", "Filter": "Filter"}, inplace = True)


    final = final.reset_index()
    print("Course Recommendations for user_id", user_id, ":")
    final = final.reset_index()

    
    # Drop the 'index' column from the DataFrame if it exists
    if 'index' in final.columns:
        final = final.drop('index', axis=1)

    #saving to a csv file to pass to Unity 
    #specifying the file path where to save the csv file 
    file_path = "./Personalised_VRLE/Personalised_VRLE/Assets/Resources/"
    #specifying the file name to save 
    file_name = f"final_recommendations_user_{user_id}_course_recommendations.csv"

    full_path = file_path + file_name
    
    #saving to a csv file
    final.to_csv(full_path, index = False) #index = False so the row indices are not included in the CSV file

    
    return final

In [37]:
final_recommender(1002)

Course Recommendations for user_id 1002 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Algorithm,Success Rate
0,1896,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,51.67
1,1428,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,51.67
2,1212,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,51.67
3,2385,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,51.67
4,2215,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,51.67
5,2954,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",LLM (GPT-4),51.67
6,8949,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",LLM (GPT-4),51.67
7,2370,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,JAVA",Collaborative,25.0
8,1285,B.E.,Mechanical,MITAOE,CATIA,Content-Based,0.0
9,1789,B.E.,Mechanical,MITAOE,CATIA,Content-Based,0.0
