### Importing Libraries

In [37]:
import pandas as pd

from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split, GridSearchCV
import random

import numpy as np
from surprise import accuracy
from surprise.model_selection.validation import cross_validate

### Loading the datasets

In [38]:
user_ds = pd.read_csv("./cleanedDatasets/cleaned_user_profile.csv", encoding= 'unicode_escape')
user_ds.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001.0,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002.0,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003.0,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004.0,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005.0,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [39]:
course_ds = pd.read_csv("./cleanedDatasets/cleaned_course_dataset.csv", encoding= 'unicode_escape')
course_ds.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [40]:
ratings_ds = pd.read_csv("./cleanedDatasets/cleaned_user_ratings.csv", encoding= 'unicode_escape')
ratings_ds.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Preparing the data in a suitable format

In [41]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_ds[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)


### Applying the Best Technique 

In TestingCollaborativeTechniques.ipynb various techniques both memory and model based were applied. Through the various techniques we were able to determine from the considered techniques which technique is best to apply for the collaborative analysis. The best technique was the SVD technique. 

In [42]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3524  1.3754  1.3173  1.3665  1.3554  1.3534  0.0198  
MAE (testset)     1.1010  1.1270  1.0621  1.1080  1.1005  1.0997  0.0211  
Fit time          0.04    0.04    0.04    0.05    0.07    0.05    0.01    
Test time         0.00    0.00    0.00    0.01    0.01    0.01    0.00    

Accuracy on the testset:
RMSE: 1.2783
MSE: 1.6341
MAE:  1.0094

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.46   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.53   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.53   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.455646777427926, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.816788270028572, details={'was_impossible': False})]

### Applying Grid Search

In [43]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4865433754936859
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2529326439546455


### Applying the best parameters

In [44]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.2984
MSE: 1.6858
MAE:  1.0842


1.0842221727880341

#### Checking the user and course details

In [45]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_ds[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

In [46]:
#recommends courses to a user based on their previous interactions (ratings) with a dataset of courses, using a method called Singular Value Decomposition (SVD) since it was the best technique. Note 
#uses a machine learning model to recommend new courses to a user based on their past course ratings, ensuring that only courses the user hasn't already completed are recommended.
def cf_recommendations(user):
    
    # model is designed to learn from the ratings data to make good course recommendations.
    #using the "best" parameters found using the gridsearchcv method from testing collaborative filtering techniques notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset) #It trains (or "fits") the SVD model on a dataset (trainset) that includes user-course ratings, helping the model learn about user preferences.
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()  #creates a "testset" of all possible user-item (course) pairs that are not present in the training
    #it's considering courses the user hasn't rated yet. The model then predicts ratings for these courses.
    predictions = model.test(testset)
    
    #predictions made by the model are saved 
    rating_predictions = pd.DataFrame(predictions) # contains the predicted ratings for each course for each user.
    #print(rating_predictions)
    
    #Filter predictions for the specified user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False) # sorts these predictions by the estimated rating (est) in descending order,
        
    #Exclude courses the user has already completed (assuming they are rated 1 to 5)  
    final_courserecs = [] #to store the final course recs
    
    #checks to ensure it doesn't recommend courses the user has already completed and rated.
    #uid is the user and iid is the course id
    for i, row in user_predictions.iterrows():
        #get rating details
        rating_details = ratings_ds[(ratings_ds['course_id'] == row[1]) & (ratings_ds['user_id'] == user)]
        
        #specifically looking for courses that the user hasn't rated yet (rating is 0).
        if rating_details["rating"].values[0] == 0:
            #print("Not rated!")
            #Compile final recommendations
            final_courserecs.append(row[1])
    
    #Return the recommendations
    return(final_courserecs)

In [51]:
testing_user_id = 1002
recommended_courses = cf_recommendations(testing_user_id) #input is a user and output is a course!svdrecs
#1001 is a user (shown above no course is 1001)
#2082 is course since no user rating is found (check above)

In [52]:
testing_user_id_str = str(testing_user_id)+".0"
user_ds[user_ds["userid"] == testing_user_id_str]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
1,1002.0,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."


In [53]:
recommended_course_details = course_ds[course_ds["sr_"].isin(recommended_courses)]

In [54]:
recommended_course_details.style.background_gradient(cmap='Blues') #For a gradient, the darkest shade usually represents the highest value in the column, 
#while the lightest shade represents the lowest value. Intermediate values get a proportional shade between these two extremes. This kind of visualization makes 
#it easy to spot higher and lower values at a glance.


Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1033,2034,B.E.,Mechanical,MITCOE,['AUTOCAD'],AUTOCAD
1054,2055,B.E.,Mechanical,MITCOE,"['AutoCAD, PROE']","AutoCAD, PROE"
1138,2139,B.E.,Electronics Telecommunication Engineering,MITAOE,"['EmbeddedC, MATLAB, C ']","EmbeddedC, MATLAB, C"
1160,2161,B.E.,Electronics Telecommunication Engineering,MITCOE,"['EmbeddedC, MATLAB, C ']","EmbeddedC, MATLAB, C"
1185,2186,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['C,JAVA']","C,JAVA"
1208,2209,B.E.,Computer Science Engineering,MITCOE,"['CPPProgramming, Core JAVA, CProgramming, MySql']","CPPProgramming, Core JAVA, CProgramming, MySql"
1224,2225,B.E.,Computer Science Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"
1242,2243,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['LoRateChnology, C, MathematiCaltoolMATLAB, basiCsofEmbeddedCPython, VHDL, platformWindowsandUNIX, EDATOOLMentorGraphiCs, PspiCe, ARM7, Python, Windows']","LoRateChnology, C, MathematiCaltoolMATLAB, basiCsofEmbeddedCPython, VHDL, platformWindowsandUNIX, EDATOOLMentorGraphiCs, PspiCe, ARM7, Python, Windows"
1255,2256,B.E.,Electronics Telecommunication Engineering,MITCOE,"['EAGLE, MiCrosoftoffiCe, ProgrammingLanguageCCPP, SimulationSoftwareProteusMultisem, Goodorganizingskills, MATLAB, MySQL']","EAGLE, MiCrosoftoffiCe, ProgrammingLanguageCCPP, SimulationSoftwareProteusMultisem, Goodorganizingskills, MATLAB, MySQL"
1321,2322,B.E.,Mechanical,MITCOE,['AUTOCAD'],AUTOCAD
