### Importing Libraries

In [23]:
import pandas as pd

from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split, GridSearchCV
import random

import numpy as np
from surprise import accuracy
from surprise.model_selection.validation import cross_validate

### Loading the datasets

In [24]:
user_ds = pd.read_csv("./cleanedDatasets/cleaned_user_profile.csv", encoding= 'unicode_escape')
user_ds.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001.0,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002.0,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003.0,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004.0,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005.0,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [25]:
course_ds = pd.read_csv("./cleanedDatasets/cleaned_course_dataset.csv", encoding= 'unicode_escape')
course_ds.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [26]:
ratings_ds = pd.read_csv("./cleanedDatasets/cleaned_user_ratings.csv", encoding= 'unicode_escape')
ratings_ds.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Preparing the data in a suitable format

In [27]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_ds[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)


### Applying the Best Technique 

In TestingCollaborativeTechniques.ipynb various techniques both memory and model based were applied. Through the various techniques we were able to determine from the considered techniques which technique is best to apply for the collaborative analysis. The best technique was the SVD technique. 

In [28]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3415  1.3452  1.3750  1.3442  1.3609  1.3534  0.0128  
MAE (testset)     1.0878  1.0948  1.1182  1.0850  1.1080  1.0988  0.0125  
Fit time          0.03    0.03    0.04    0.03    0.04    0.03    0.00    
Test time         0.00    0.01    0.00    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 1.3066
MSE: 1.7071
MAE:  1.0376

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.29   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.46   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.46   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.292627100315375, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.745508540968983, details={'was_impossible': False})]

### Applying Grid Search

In [29]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4871428019420767
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.251143110627003


### Applying the best parameters

In [30]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.2933
MSE: 1.6727
MAE:  1.0819


1.0819395726684395

#### Checking the user and course details

In [31]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_ds[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

In [32]:
#Since the svd recommender is making recommmendations from the ratings dataset, 
#its recommemnding courses the users have already done
def svd_cf_recommendations(user):
    
    #using the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    #print(rating_predictions)
    
    #find the predictions for the given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False)
        
    #removing courses already done by user (assuming they are rated 1 to 5)  
    finalcourserecs = [] #to store the final course recs
    
    #uid is the user and iid is the course id
    for i, row in user_predictions.iterrows():
        #get rating details
        rating_dets = ratings_ds[(ratings_ds['course_id'] == row[1]) & (ratings_ds['user_id'] == user)]
        
        if rating_dets["rating"].values[0] == 0:
            #print("Not rated!")
            finalcourserecs.append(row[1])
    
    return(finalcourserecs)

In [33]:
svd_cf_recommendations(1001) #input is a user and output is a course!svdrecs
#1001 is a user (shown above no course is 1001)
#2082 is course since no user rating is found (check above)

[2294,
 2076,
 2348,
 2223,
 2396,
 2340,
 2186,
 2005,
 2405,
 2394,
 2378,
 2061,
 2312,
 2355,
 2259,
 2196,
 2281,
 2199,
 2286,
 2420,
 2302,
 2380,
 2243,
 2270,
 2253,
 2172,
 2256,
 2034]

In [38]:
user_ds[user_ds["userid"] == "1001.0"]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001.0,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [48]:
course_ds[course_ds["sr_"] == 2294]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1293,2294,B.E.,Mechanical,MITCOE,"['CAD,CAM']","CAD,CAM"


In [49]:
course_ds[course_ds["sr_"] == 2076]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1075,2076,B.E.,Mechanical,MITAOE,"['SOLIDWORKS, AUTOCAD, CREO']","SOLIDWORKS, AUTOCAD, CREO"


In [50]:
course_ds[course_ds["sr_"] == 2348]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1347,2348,B.E.,Electronics Telecommunication Engineering,MITCOE,"['C,PCB design']","C,PCB design"


In [51]:
course_ds[course_ds["sr_"] == 2223]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1222,2223,B.E.,Computer Science Engineering,MITAOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [52]:
course_ds[course_ds["sr_"] == 2396]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1395,2396,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","['C,PCB design']","C,PCB design"


In [53]:
course_ds[course_ds["sr_"] == 2340]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1339,2340,B.E.,Mechanical,MITAOE,"['SOLIDWORKS, AUTOCAD, CREO']","SOLIDWORKS, AUTOCAD, CREO"


In [54]:
course_ds[course_ds["sr_"] == 2186]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1185,2186,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['C,JAVA']","C,JAVA"


In [55]:
course_ds[course_ds["sr_"] == 2005]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1004,2005,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['EAGLE, MiCrosoftoffiCe, ProgrammingLanguageC...","EAGLE, MiCrosoftoffiCe, ProgrammingLanguageCCP..."


In [56]:
course_ds[course_ds["sr_"] == 2405]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1404,2405,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['EmbeddedC, MATLAB, C ']","EmbeddedC, MATLAB, C"


In [57]:
course_ds[course_ds["sr_"] == 2394]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1393,2394,B.E.,Mechanical,MITAOE,"['SOLIDWORKS, AUTOCAD, CREO']","SOLIDWORKS, AUTOCAD, CREO"
