### Importing libraries

In [670]:
import pandas as pd

#for collobarative filtering
from surprise import Reader, Dataset, SVD, KNNBaseline, KNNWithMeans, KNNWithZScore, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

#for cross validation
from surprise.model_selection import cross_validate

#for content based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

import matplotlib.pyplot as plt
import matplotlib
import numpy as np

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

#randomisation to have data distributed evenly when splitting and no bias
import random

### Importing the cleaned and pre-processed datasets

In [671]:
#accessing the course dataset
course_dataset = pd.read_csv("./cleanedDatasets/cleaned_course_dataset.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [672]:
#accessing the user profile datasets
user_profile = pd.read_csv("./cleanedDatasets/cleaned_user_profile.csv", encoding= 'unicode_escape')
user_profile.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [673]:
#accessing the user rating
user_rating = pd.read_csv("./cleanedDatasets/cleaned_user_ratings.csv", encoding= 'unicode_escape')
user_rating.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Collaborative Filtering Algorithm

#### Preparing the data in a suitable format.

In [674]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(user_rating[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)


#### Creating the SVD recommendation algorithm

In [675]:
#using the best technique for collaborative filtering when evaluated against various techniques - SVD 

#creating a function for collaborative filtering recommendations and passing the user id as an argument
def collaborative_filtering_recommendations(user_id):

    #using the parameters obtained from the analysis of the SVD when applying the gridsearchmethodcv from the TestingCollaborativeTechniques.ipynb
    model = SVD(n_factors= 90, n_epochs= 20, lr_all= 0.005, reg_all= 0.2) #using 0.2 instead of 0.02
    model.fit(trainset)

    #building the test set then predicting the ratings for the test set
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)

    #saving the results in a pandas dataframe
    results_for_rating_predictions = pd.DataFrame(predictions)

    #obtaining the top 10 recommendations for the user and sorting values
    user_predictions = results_for_rating_predictions[results_for_rating_predictions['uid'] == user_id].\
                         sort_values(by="est", ascending = False).head(10)
  
    #creating a recommendations array and adding the user predictions as recommendations
    recommendations = []
    recommendations.append(user_predictions['iid'])
    recommendations = recommendations[0]

    return recommendations

### Content - Based Filtering Algorithm

#### Creating new dataframes to work with

In [676]:
#for the users 
#create a new dataframe containg the following fields: userid, degree_1, degree_1_specializations, key_skills_str and the career objective
cbf_user_dataset = pd.DataFrame()
cbf_user_dataset['userid'] = user_profile['userid']
cbf_user_dataset['degree_1'] = user_profile['degree_1']
cbf_user_dataset['degree_1_specializations'] = user_profile['degree_1_specializations']
cbf_user_dataset['career_objective'] = user_profile['career_objective']
cbf_user_dataset['key_skills_str'] = user_profile['key_skills_str']

#creating a new field called description containing all the fields expect userid
cbf_user_dataset['description'] = cbf_user_dataset['degree_1'] + ' ' + cbf_user_dataset['degree_1_specializations'] + ' ' + cbf_user_dataset['career_objective'] + ' ' + cbf_user_dataset['key_skills_str']
cbf_user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E. Computer Science Engineering Computer Eng...
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E. Computer Science Engineering Interested i...
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E. Computer Science Engineering Unknown Unknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E. Computer Science Engineering Currently a ...
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E. Computer Science Engineering To have a gr...


In [677]:
# same process but for the courses
#create a new dataset and add the following fields: sr_, campus, degree_1, degree_1_specializations, key_skills_str 
cbf_course_dataset = pd.DataFrame()
cbf_course_dataset['sr_'] = course_dataset['sr_']
cbf_course_dataset['campus'] = course_dataset['campus']
cbf_course_dataset['degree_1'] = course_dataset['degree_1']
cbf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cbf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']

cbf_course_dataset['description'] = cbf_course_dataset['degree_1'] + ' ' + cbf_course_dataset['degree_1_specializations'] + ' ' + cbf_course_dataset['key_skills_str']
cbf_course_dataset.head()

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical CATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical CATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E. Mechanical CATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical CATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical CATIA


In [678]:
#putting the new fields created from the cbf_user_dataset and the cbf_course_dataset into a new dataframe which combiens them
cbf_combined_dataset = pd.DataFrame()
cbf_combined_dataset['description'] = cbf_user_dataset['description']+cbf_course_dataset['description']

#### Preparing the data in a suitable format

In [679]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(cbf_combined_dataset['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cbf_course_dataset = cbf_course_dataset.reset_index()
courses = cbf_course_dataset
indices = pd.Series(cbf_course_dataset.index, index=cbf_course_dataset['sr_'])

#### Creating the content based recommendation algorithm

In [680]:
#function to get content-filtered recommendations
def content_based_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

### Success Rate Predictor

Calculating Success Rate: For each recommended course, the system calculates a "success rate," which estimates how well the course matches the user's profile. This rate is a combination of three factors:

- How well the user's degree matches the course's required degree.
- How closely the user's specialization aligns with the course's specialization.
- The overlap between the user's key skills and those needed for the course.

Each factor is weighted differently, with key skills being the most important. The system adjusts for courses requiring skills the user has, increasing the success rate accordingly.

In [681]:
user_profile

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."
...,...,...,...,...,...,...,...
1092,2042,B.E.,Computer Science Engineering,"['CPP', 'Python']","['java', ' database', ' html', ' OOPs', ' Core...",i have to be carrer in programming and after t...,"java, database, html, OOPs, Core Java, MySQL"
1093,2043,B.E.,Computer Science Engineering,"['CPP', 'C']",['Ability-to-cope-up-with-different-situation....,To work in an Industry with a professional wor...,"Ability-to-cope-up-with-different-situation., ..."
1094,2044,B.E.,Computer Science Engineering,"['HTML', 'CSS']","['C', ' Java', ' cpp', ' HTML', ' Basic-Python...",To pursue a highly challenging and creative ca...,"C, Java, cpp, HTML, Basic-Python, MySQL"
1095,2045,B.E.,Computer Science Engineering,['Html; Wordpress; Css'],"['C', ' Drupal-(CMS)', ' Bootstrap', ' Wordpre...",To prove myself dedicated worthful and energet...,"C, Drupal-(CMS), Bootstrap, Wordpress-(CMS)..."


In [682]:
#create a function designed to evaluate how well a set of recommended courses match a user's academic background and skillset. 
#Note the purpose of this function is to quantify the relevance of each course recommendation to the user, allowing the recommendation system to prioritize and suggest the most suitable courses. 

def evaluate_recommendations_success_rate(user,recommendations):
    #defining a list for the success_rate - to store the calculated success rate for each recommended course
    success_rate = []
    
    #printing positions of column names
    #print("Degree 1 position", recommendations.columns.get_loc("degree_1"))
    #print("Degree 1 Specialization position", recommendations.columns.get_loc("degree_1_specializations"))
    #print("Key Skills position", recommendations.columns.get_loc("key_skills_str"))
    
    #getting the user's required information such as the user's degree, specialization and key skills as these are crucial aspects of the user's academic and profession profile
    user_degree = user_profile["degree_1"].loc[user_profile["userid"] == str(user)].values[0] #getting the user's degree
    user_specialization = user_profile["degree_1_specializations"].loc[user_profile["userid"] == str(user)].values[0] #getting the user's specialization
    user_key_skills = user_profile["key_skills_str"].loc[user_profile["userid"] == str(user)].values[0] #getting the user's key skills

    #formatting this information into a list to make it easy for comprassion
    user_degree = user_degree.split("-")
    user_specialization = user_specialization.split("-") 
    user_key_skills = user_key_skills.split(",") #key skills are split into individual skills
    user_key_skills = (map(lambda x: x.lower().strip(), user_key_skills)) # unnecessary spaces or case differences are removed to standardize the data
    user_key_skills = list(user_key_skills)

    #iterating through the recommendations and gathers the following info degree, specialization and key skills - to see what the course demands from the students
    #also comparing the user details with the recommendations
    for index, row in recommendations.iterrows(): # iterates over each recommended course (contained in the recommendations DataFrame)
        #print("Checking course: ",index)
        #print("Row details: ", row)
        
        #initializing the score components as they are a structured way to quantify how well a user's profile matches the requirements of a recommended course
        alignment_score_for_degree = 0 #assesses whether the user's educational background (degree) aligns with what the course requires. It's fundamental because 
        #having the appropriate degree often forms the base level of preparedness for a course.
        
        alignment_score_for_specialization = 0 # indicates whether the user has the specific knowledge or focus area that the course is designed for
        
        alignment_score_for_key_skills = 0 #relates to the practical abilities and competencies the course aims to build or requires as a prerequisite


        #getting the course's required information such as the degree, specialization and key skills as these are crucial aspects of the course's academic and profession profile
        course_degree = row[1] #getting the course's degree
        course_specialization = row[2]
        course_key_skills = row[4]

        #formatting this information into a list to make it easy for comprassion
        course_degree = course_degree.split("-")
        course_specialization = course_specialization.split("-")
        course_key_skills = course_key_skills.strip().split(",")
        course_key_skills = (map(lambda x: x.lower().strip(), course_key_skills))
        course_key_skills = list(course_key_skills)

        #deteriming the weights/ percentages of the importance of the degree, specialization and key skills
        #weight_degree, weight_specialization, weight_key_skills = determine_weights(len(course_key_skills))

        #weight/percentage depending on the academic success

        #highest percentage because possessing the specific skills required by a course is often the most direct predictor of a student's ability to succeed and engage 
        #with the course material. Skills are practical and applicable, making this component critical for matching users to courses where they can excel and benefit the most.
        percentage_key_skills = 50 

        # It's weighted less than key skills because while specialization relevance is important, the specific skills the user possesses are a more direct indicator of their 
        #ability to perform specific tasks or understand certain topics within the course.
        percentage_specialization = 35


        #having a relevant degree provides a good foundation, it's considered less directly predictive of success in a specific course than the user's specialized areas of study or their practical skills.
        percentage_degree = 15

        #degree and specialization meatch 
        if(user_degree == course_degree):
            alignment_score_for_degree = percentage_degree
        if(user_specialization == course_specialization):
            alignment_score_for_specialization = percentage_specialization

        #key skills comparision
        percentage_temp = percentage_key_skills/len(course_key_skills) #obtaining the percentage of each individual key skill

        #iterating through each course key skills
        for skill in course_key_skills:
            if skill == "unknown":
                print("There is not enough information")
            if skill == "c cpp":
                skill = "c"
                course_key_skills.append("cpp")

                #adjusting the percentage 
                percentage_temp = percentage_key_skills/len(course_key_skills)

            if skill == "c programming":
                skill = "c"
                
            if skill in user_key_skills:
                alignment_score_for_key_skills += percentage_temp

        #calculating the total score
        total_score = round(alignment_score_for_degree + alignment_score_for_specialization + alignment_score_for_key_skills, 2)
        #print("Course: ",index," Success Rate: ",total_score,"%")
        success_rate.append(total_score)

    return success_rate




### Creating Modular Custom Functions

Since some functions for example to remove the courses already done by a user from the list of recommendations one function can be created to used when necessary. This is instead of having it implemented within another function whose purpose is completly different. 


In [683]:
#removes courses that the user has already taken by checking the ratings in the ratings dataframe
def drop_courses_taken(user, dataframe_recommendations):
    finalcourserecs = [] #to store the final course recs after removing the ones the user has already taken
    
    #print("Dataframe Recommendations:",dataframe_recommendations)
    #position of the Course ID in the dataframe_recommendations dataset
    #print("Course ID position: ", dataframe_recommendations.columns.get_loc("sr_"))

    for i, row in dataframe_recommendations.iterrows():
        #print("Drop Method - Deatils of row[0] : ", row[0])

        if row[0] in user_rating["course_id"].values:
            #check if rated
            rating_dets = user_rating[(user_rating['course_id'] == row[0]) & (user_rating['user_id'] == user)]
            if rating_dets["rating"].values[0] != 0:
                #remove course if already taken and updating the recommendations
                dataframe_recommendations.drop([i], inplace=True)
                finalcourserecs.append(row[0])
    
    #returing the updated recommendation list
    return dataframe_recommendations

In [684]:
#gets course details of svd collaborative filtering recommendation technique
def get_course_details(svd_courses_recommended): #takes the list of course recommendations obtained by SVD
    # List to store the details of each recommended course
    courses_details_list = []
    
    #dataframe for final recommendations 
    finalRecs = pd.DataFrame()
    
    #print("Parameter passed: ", svd_courses_recommended)
    
    for i in svd_courses_recommended: #Loop Through Recommended Courses IDs
        #checks if that ID exists in the column sr_ in the course dataset
        if i in course_dataset["sr_"].values:
            #Retrieve and Store Course Details
            course_details = course_dataset[course_dataset["sr_"] == i]
            # Add the DataFrame to the list instead of appending to finalRecs
            courses_details_list.append(course_details)

    # if courses_details_list is not empty, concatenate the dataframes in the list
    if courses_details_list:
        finalRecs = pd.concat(courses_details_list, ignore_index=True)
        
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]


### Hybrid Recommendation Algorithm + Success Rate Predictor


In [685]:
def hybrid_recommendation_algorithm(user_id):

    #defining variables
    user_recommendations = pd.DataFrame()
    course_recommendations = pd.DataFrame()
    svd_recommended_courses = []

    #get the recommendations using the collaborative filtering technique
    svd_recommended_courses = collaborative_filtering_recommendations(user_id)

    #get the recommendations using the content based filtering technique 
    cbf_recommendations = content_based_recommendations(user_id)

    #adjust the columns as needed - to keep only the neccessary ones
    cbf_recommendations = cbf_recommendations.loc[:,["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str","index","description"]]

    #checking for collaborative recommendations
    #if there are no similar users with ratings for the svd, recommendations will be made solely on content-based filtering
    if len(svd_recommended_courses) == 0:
        #using boolean 
        have_svd_recommendations = False
        print("No SVD collaborative recommendations available")
    else:
        have_svd_recommendations = True
        #drop any courses which were already taken by the user from the content-based recommendations
        cbf_recommendations = drop_courses_taken(user_id, cbf_recommendations)
        #get the course details of the svd recommendations
        svd_recommendations = get_course_details(svd_recommended_courses)

    #a filter column is created to show from where the recommendations were obtained - indicating which technique was used 
    cbf_recommendations["filter"] = "content-based"


    #calculating the success rate predictions in a success rate field 
    #calculate and add a success rate prediction for each content-based recommendation.
    cf_success_rate = evaluate_recommendations_success_rate(user_id,cbf_recommendations)
    cbf_recommendations["success_rate"] = cf_success_rate

    #sort depening on the highest success rate
    cbf_recommendations = cbf_recommendations.sort_values(by = ['success_rate'], ascending = False)


    #If there are SVD recommendations
    if have_svd_recommendations == True:
        #setting the label filter
        svd_recommendations["filter"] = "collaborative"

        #calculating the success rate predictions in a success rate field
        svd_success_rate = evaluate_recommendations_success_rate(user_id,svd_recommendations)
        svd_recommendations["success_rate"] = svd_success_rate

        #sort depening on the highest success rate
        svd_recommendations = svd_recommendations.sort_values(by = ['success_rate'], ascending = False)
        
        #get final recommendations using concat to get the top 5 recs from both recommenders
        final = pd.concat([cbf_recommendations.iloc[:5], svd_recommendations.iloc[:5]], ignore_index=True, sort=False)
    else:
        final = cbf_recommendations.head(10)

    #arraning the display 
    final = final.rename(columns = {final.columns[0]: "Course Code"})
    final.set_index("Course Code", inplace = True)

    #drop unnecessary colummns
    final.drop(["index", "description"], axis = 1, inplace = True)

    #rename other columns
    final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills", "Filter": "Filter"}, inplace = True)


    final = final.reset_index()

    print("Course Recommmendations for User", user_id, ":")
    return final


In [686]:
final = hybrid_recommendation_algorithm(1001)
final

Course Recommmendations for User 1001 :


  if row[0] in user_rating["course_id"].values:
  course_degree = row[1] #getting the course's degree
  course_specialization = row[2]
  course_key_skills = row[4]
  course_degree = row[1] #getting the course's degree
  course_specialization = row[2]
  course_key_skills = row[4]


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,filter,success_rate
0,1895,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",content-based,76.67
1,1648,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",content-based,76.67
2,1186,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",content-based,76.67
3,1185,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",content-based,76.67
4,1183,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",content-based,76.67
5,2200,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",collaborative,76.67
6,2153,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",collaborative,76.67
7,2140,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",collaborative,76.67
8,2202,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,JAVA",collaborative,50.0
9,2216,B.E.,Electronics Telecommunication Engineering,MITAOE,"EmbeddedC, MATLAB, C",collaborative,16.67
