### Importing libraries

In [1]:
#to access datasets
import pandas as pd
import numpy as np



#to combine features 
from scipy.sparse import hstack, csr_matrix
#for feature extraction and similarity calculation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import normalize
#import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import base64


### Accessing the datasets

In [2]:
#accessing the user profile dataset
user_profile_encoded = pd.read_csv('cleanedDatasets/cleaned_encoded_user_profile_dataset.csv')
user_profile_encoded.head()

Unnamed: 0,userid,degree_1,key_skills,career_objective,key_skills_str,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical,known_languages_[' Android-Studio; Blockchain'],known_languages_[' Android; Php'],...,known_languages_['enthusiasm-motivation; motivation; Technical-and-problem-solving; communication; commitment; patience; interpersonal; maths; problem-solving; chemistry; physics'],"known_languages_['hindi', 'marathi', 'english languages']","known_languages_['java', 'CPP']","known_languages_['java', 'image processing']",known_languages_['java; NodeJS; HTML; MS-SQL'],known_languages_['java; NodeJS; jav; HTML; MS-SQL'],known_languages_['mysql; Javascript'],known_languages_['no'],"known_languages_['python', 'C']","known_languages_['python', 'Html']"
0,1001,0,"['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1002,0,"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1003,0,['Unknown'],Unknown,Unknown,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1004,0,"['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1005,0,"['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [3]:
#accessing the courses dataset
courses_encoded = pd.read_csv('cleanedDatasets/cleaned_encoded_course_dataset.csv')
courses_encoded.head()

Unnamed: 0,sr_,degree_1,campus,key_skills,key_skills_str,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,3,['CATIA'],CATIA,False,False,False,True
1,1002,0,3,['CATIA'],CATIA,False,False,False,True
2,1003,0,2,['CATIA'],CATIA,False,False,False,True
3,1004,0,3,['CATIA'],CATIA,False,False,False,True
4,1005,0,3,['CATIA'],CATIA,False,False,False,True


In [4]:
#accessing the user rating dataset
user_rating = pd.read_csv('cleanedDatasets/cleaned_user_ratings.csv')
user_rating.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Implementing the Recommendation Algorithm

#### Recommendations based on the Course Dataset

Testing using the course dataset alone - Implies that the recommendations will be driven by course features.

##### Creating a new dataset

In [5]:
#creating a new dataset for the courses using only the features required for the recommendation system
courses_ds_cb = pd.DataFrame()

#using existing features from the courses dataset
courses_ds_cb['sr_'] = courses_encoded['sr_'] # taking the course id
courses_ds_cb['degree_1'] = courses_encoded['degree_1'] # taking the degree_1

#to include those one-hot encoded columns
specialization_columns = [col for col in courses_encoded.columns if 'degree_1_specializations' in col]
for col in specialization_columns:
    courses_ds_cb[col] = courses_encoded[col]

#courses_ds_cb['campus'] = courses['campus'] # taking the campus -----> not that essential
courses_ds_cb['key_skills_str'] = courses_encoded['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

courses_ds_cb.head()
#By combining relevant features, a consolidated representation is created that captures essential information for the recommendation system.

Unnamed: 0,sr_,degree_1,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical,key_skills_str
0,1001,0,False,False,False,True,CATIA
1,1002,0,False,False,False,True,CATIA
2,1003,0,False,False,False,True,CATIA
3,1004,0,False,False,False,True,CATIA
4,1005,0,False,False,False,True,CATIA


##### Transforming the information into a numerical format

In [6]:
# transforms textual information into a numerical format that can be used in similarity calculations.
#sr_ will not be vectorized since it is a unique identifier for each course
#degree_1 is already encoded using label encoding therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF
#degree_1_specializations are already one-hot encoded therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF

#key_skills_str - text data therefore, TF-IDF will be applied -to convert this unstructured text into a structured numeric form that can be used to calculate the similarity between courses based on the key skills they teach
#apply the TF-IDF vectorizer to the key_skills_str column

tfidf_vectorizer = TfidfVectorizer()

#apply the TF-IDF vectorizer to the key_skills_str textual data
tfidf_matrix = tfidf_vectorizer.fit_transform(courses_ds_cb['key_skills_str'])

In [7]:
#combining the encoded features and the TF-IDF 
# Convert the label-encoded 'degree_1' column to a CSR matrix
degree_matrix = csr_matrix(courses_ds_cb[['degree_1']].values)

# Convert the one-hot encoded specialization columns to a CSR matrix
specialization_matrix = csr_matrix(courses_ds_cb[specialization_columns].values)

# Combine the TF-IDF vectors with the label-encoded degree matrix and the one-hot encoded specialization matrix
combined_course_features = hstack([tfidf_matrix, degree_matrix, specialization_matrix])


##### Calculating the similarity 

In [8]:
# Calculate the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(combined_course_features)
cosine_sim_matrix 
# cosine_sim_matrix is a square matrix with courses as both rows and columns, where each element represents the similarity between courses.

array([[1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.44118456,
        0.44118456],
       [0.        , 0.        , 0.        , ..., 0.44118456, 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.44118456, 1.        ,
        1.        ]])

##### Getting recommendations

In [9]:
# Function to get course recommendations based on cosine similarity
def get_course_recommendations(course_id, cosine_sim_matrix, courses_df, top_n=5):
    # Get the index of the course that matches the course_id
    course_idx = courses_df.index[courses_df['sr_'] == course_id].tolist()[0]
    
    # Get the pairwsie similarity scores of all courses with that course
    sim_scores = list(enumerate(cosine_sim_matrix[course_idx]))
    
    # Sort the courses based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top-n most similar courses
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the course indices
    course_indices = [i[0] for i in sim_scores]
    
    # Return the top-n most similar courses
    return courses_df['sr_'].iloc[course_indices]

def get_recommendations(course_id):
    #to make it more flexible in the number of recommendations to get as well as more modular 
    recommendations = get_course_recommendations(course_id, cosine_sim_matrix, courses_ds_cb)
    return recommendations




In [10]:
get_recommendations(1003)

1    1002
2    1003
3    1004
4    1005
5    1006
Name: sr_, dtype: int64

In [11]:
get_recommendations(10996)

172    1173
209    1210
212    1213
213    1214
214    1215
Name: sr_, dtype: int64

#### Recommendations based on the User Profile Dataset

##### Creating a new dataset

In [12]:
user_profile_ds_cb = pd.DataFrame()
#taking the features of the user that are required for the recommendation system and are considered for the courses. 
user_profile_ds_cb['userid'] = user_profile_encoded['userid'] # taking the user id
user_profile_ds_cb['degree_1'] = user_profile_encoded['degree_1'] # taking the degree_1


# For degree_1_specializations, include the one-hot encoded columns if already applied
specialization_columns = [col for col in user_profile_encoded.columns if 'degree_1_specializations' in col]
user_profile_ds_cb = user_profile_ds_cb.join(user_profile_encoded[specialization_columns])


#user_profile_ds_cb['known_languages'] = user_profile['known_languages'] # taking the known_languages ----> not that essential as the courses don't require it 
user_profile_ds_cb['key_skills_str'] = user_profile_encoded['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

user_profile_ds_cb.head()

Unnamed: 0,userid,degree_1,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical,key_skills_str
0,1001,0,True,False,False,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,0,True,False,False,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,0,True,False,False,Unknown
3,1004,0,True,False,False,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,0,True,False,False,"XML, Word, Data Structures, Communication, ..."


##### Transforming the information into a numerical format

In [13]:
# userid will not be vectorized since it is a unique identifier for each course. 
#degree_1 is already encoded using label encoding therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF
#degree_1_specializations are already one-hot encoded therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF

#converting key_skills_str
#apply the TF-IDF vectorizer to the key_skills_str textual data

# Vectorize the 'key_skills_str' using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
key_skills_tfidf = tfidf_vectorizer.fit_transform(user_profile_ds_cb['key_skills_str'])


In [14]:
# Convert the label-encoded 'degree_1' column to a CSR matrix
degree_matrix = csr_matrix(user_profile_ds_cb[['degree_1']].values)

# Assuming 'degree_1_specializations_Computer Science Engineering' is a representative of one-hot encoded columns for specializations
specialization_columns = [col for col in user_profile_ds_cb.columns if 'degree_1_specializations' in col]
specialization_matrix = csr_matrix(user_profile_ds_cb[specialization_columns].values)

# Combine the TF-IDF vectors with the label-encoded degree matrix and the one-hot encoded specialization matrix
combined_features_user = hstack([key_skills_tfidf, degree_matrix, specialization_matrix])

##### Calculating similarity

In [15]:
# Assuming 'combined_features_course' is the feature matrix you created earlier for courses
cosine_sim = cosine_similarity(combined_features_user, combined_features_user)


##### Getting recommendations

Recommend items (courses) to users based on the similarity of their user profiles

In [16]:
cf_dataset = user_profile_ds_cb.reset_index()
users = cf_dataset['userid']
indices = pd.Series(cf_dataset.index, index=cf_dataset['userid']) #creates a series (like a list) named users that contains all user IDs
indices.head(10)

userid
1001    0
1002    1
1003    2
1004    3
1005    4
1006    5
1007    6
1008    7
1009    8
1010    9
dtype: int64

In [17]:
#function to get recommendations
def get_recommendations(user): 
    idx = indices[user]  #uses the user's ID to find the corresponding row number in the DataFrame. This is needed because the similarity calculations were performed on row numbers, not user IDs.
    sim_scores = list(enumerate(cosine_sim[idx])) #retrieves the similarity scores for the specified user against all other users. These scores tell us how similar each user is to the given user.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)# from highest to lowest.
    sim_scores = sim_scores[1:31] # selects the top 30 users who are most similar
    user_indices = [i[0] for i in sim_scores] #creates a list of the indices of these top 30 users
    return users.iloc[user_indices] #returns the IDs of these similar users

In [18]:
get_recommendations(1001).head(10)


842     1847
941     1946
230     1231
926     1931
441     1444
816     1821
915     1920
1034    2039
662     1667
373     1375
Name: userid, dtype: int64

In [19]:
get_recommendations(1847).head(10)

941    1946
926    1931
0      1001
230    1231
373    1375
445    1449
291    1292
662    1667
103    1104
272    1273
Name: userid, dtype: int64

#### Combined Recommendation Algorithm



 Previously there where two recommendation algorithms created one that recommended similar courses to a course and similar users to a similar user. However, to fulfil the purpose of a course recommendation using a content-based filtering algorithm we need to combine the recommendations previously created into one. 

When trying to use the encoding the information to create a content-based filtering algorithm which is capable of recommending course to users. I could not work with the encoding data. Due to 2 problems:

- Computing the user features against the courses features would not be fair as they contrast in their shapes. 

- if we had to add the following information together, into a single column, from the following features 
    For the user features:
        - degree_1 (encoded using label-encoding)
        - degree_1_specialisation (encoded using one-hot encoding)
        - career_objective
        - key_skills_str

    For the course features:
        - degree_1 (encoded using label-encoding)
        - degree_1_specialisation (encoded using one-hot encoding)
        - key_skills_str 

The process of encoding would not be effective as much as that information would have to be converted into a string format to be able to combine the information. Therefore, for the combined recommendation algorithm, the datasets which are cleaned but not encoded will be used throughout. 

##### Accessing the cleaned and not encoded datasets

In [20]:
user_profile = pd.read_csv('cleanedDatasets/cleaned_user_profile.csv')
user_profile.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [21]:
courses = pd.read_csv('cleanedDatasets/cleaned_course_dataset.csv')
courses.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


##### Creating new dataframes to work with

In [22]:
#for the user

cbf_user_ds = pd.DataFrame()
cbf_user_ds['userid'] = user_profile['userid'] # taking the user id
cbf_user_ds['degree_1'] = user_profile['degree_1'] # taking the degree_1
cbf_user_ds['degree_1_specializations'] = user_profile['degree_1_specializations'] # taking the degree_1_specializations
#cbf_user_ds['known_languages'] = user_profile['known_languages'] # taking the known_languages 
cbf_user_ds['career_objective'] = user_profile['career_objective'] # taking the career_objective
cbf_user_ds['key_skills_str'] = user_profile['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

cbf_user_ds.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [23]:
#storing the information of the features together to be able to get the recommendations later on 
cbf_user_ds['description'] = cbf_user_ds['degree_1'] + ' ' + cbf_user_ds['degree_1_specializations'] + ' ' + cbf_user_ds['career_objective'] + ' ' + cbf_user_ds['key_skills_str']
cbf_user_ds.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E. Computer Science Engineering Computer Eng...
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E. Computer Science Engineering Interested i...
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E. Computer Science Engineering Unknown Unknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E. Computer Science Engineering Currently a ...
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E. Computer Science Engineering To have a gr...


In [24]:
#for the courses
cbf_course_ds = pd.DataFrame()
cbf_course_ds['sr_'] = courses['sr_'] # taking the course id
cbf_course_ds['degree_1'] = courses['degree_1'] # taking the degree_1
cbf_course_ds['degree_1_specializations'] = courses['degree_1_specializations'] # taking the degree_1_specializations
cbf_course_ds['campus'] = courses['campus'] # taking the campus
cbf_course_ds['key_skills_str'] = courses['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

cbf_course_ds.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,CATIA
1,1002,B.E.,Mechanical,MITCOE,CATIA
2,1003,B.E.,Mechanical,MITAOE,CATIA
3,1004,B.E.,Mechanical,MITCOE,CATIA
4,1005,B.E.,Mechanical,MITCOE,CATIA


In [25]:
#creating a description column to contain all information about the courses except the sr_ column and campus column (as the goal is to recommend courses based on the academic content thus, the physical location and name of the campus might be less relevant for when calculating similarity. 
#Also if multiple courses which are offered throughout on various campuses but are otherwise identical it could lead to unneccessary duplication in the recommendation process, as the same courses is recommended multiple times simply because it is offered at various locations.)
cbf_course_ds['description'] = cbf_course_ds['degree_1'] + ' ' + cbf_course_ds['degree_1_specializations'] + ' ' + cbf_course_ds['key_skills_str']
cbf_course_ds.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills_str,description
0,1001,B.E.,Mechanical,MITCOE,CATIA,B.E. Mechanical CATIA
1,1002,B.E.,Mechanical,MITCOE,CATIA,B.E. Mechanical CATIA
2,1003,B.E.,Mechanical,MITAOE,CATIA,B.E. Mechanical CATIA
3,1004,B.E.,Mechanical,MITCOE,CATIA,B.E. Mechanical CATIA
4,1005,B.E.,Mechanical,MITCOE,CATIA,B.E. Mechanical CATIA


In [26]:
#combining the descirption of the user and the course in a new pandas dataframe
cbf_combined_data_ds = pd.DataFrame()
cbf_combined_data_ds['description'] = cbf_user_ds['description']+ cbf_course_ds['description']
cbf_combined_data_ds.head()

Unnamed: 0,description
0,B.E. Computer Science Engineering Computer Eng...
1,B.E. Computer Science Engineering Interested i...
2,B.E. Computer Science Engineering Unknown Unkn...
3,B.E. Computer Science Engineering Currently a ...
4,B.E. Computer Science Engineering To have a gr...


In [27]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(cbf_combined_data_ds['description'].values.astype('U'))
print(comb_count_matrix)

  (0, 1237)	4
  (0, 6138)	1
  (0, 2179)	2
  (0, 6830)	1
  (0, 2920)	1
  (0, 7010)	1
  (0, 6363)	1
  (0, 5539)	1
  (0, 6571)	1
  (0, 26)	1
  (0, 533)	1
  (0, 3352)	1
  (0, 7491)	2
  (0, 1745)	2
  (0, 4026)	4
  (0, 4272)	2
  (0, 6016)	1
  (0, 3566)	1
  (0, 3775)	1
  (0, 2718)	1
  (0, 6237)	1
  (0, 1384)	1
  (0, 1947)	1
  (0, 5812)	1
  (0, 3172)	1
  :	:
  (9975, 4673)	1
  (9976, 4673)	1
  (9977, 4673)	1
  (9978, 4673)	1
  (9979, 4673)	1
  (9980, 4673)	1
  (9981, 4673)	1
  (9982, 4673)	1
  (9983, 4673)	1
  (9984, 4673)	1
  (9985, 4673)	1
  (9986, 4673)	1
  (9987, 4673)	1
  (9988, 4673)	1
  (9989, 4673)	1
  (9990, 4673)	1
  (9991, 4673)	1
  (9992, 4673)	1
  (9993, 4673)	1
  (9994, 4673)	1
  (9995, 4673)	1
  (9996, 4673)	1
  (9997, 4673)	1
  (9998, 4673)	1
  (9999, 4673)	1


In [28]:
#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)
print(course_cosine_sim)

[[1.         0.19571329 0.3050957  ... 0.         0.         0.        ]
 [0.19571329 1.         0.33836395 ... 0.         0.         0.        ]
 [0.3050957  0.33836395 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]]


In [29]:
#create indices for the courses is using series
cf_course_dataset = cbf_course_ds.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

In [30]:
#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [31]:
#getting the similar course recs for user 1001
get_course_cf_recommendations(1001).head(10)

Unnamed: 0,index,sr_,degree_1,degree_1_specializations,campus,key_skills_str,description
842,842,1843,B.E.,Mechanical,MITCOE,"AutoCAD, PROE","B.E. Mechanical AutoCAD, PROE"
941,941,1942,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
926,926,1927,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
86,86,1087,B.E.,Mechanical,MITCOE,AUTOCAD,B.E. Mechanical AUTOCAD
886,886,1887,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
143,143,1144,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
117,117,1118,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
188,188,1189,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
909,909,1910,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
927,927,1928,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."


In [32]:
#print the details of user 1001
user_profile[user_profile['userid'] == 1001]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [33]:
#print the details of the returned recommended courses from the function: get_course_cf_recommendations
courses[courses['sr_'].isin(get_course_cf_recommendations(1001)['sr_'])]

Unnamed: 0,index,sr_,degree_1,degree_1_specializations,campus,key_skills_str,description
44,44,1045,B.E.,Mechanical,MITCOE,"AutoCAD, PROE","B.E. Mechanical AutoCAD, PROE"
84,84,1085,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,JAVA",B.E. Electronics Telecommunication Engineeri...
86,86,1087,B.E.,Mechanical,MITCOE,AUTOCAD,B.E. Mechanical AUTOCAD
117,117,1118,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
118,118,1119,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
119,119,1120,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
122,122,1123,B.E.,Computer Science Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
123,123,1124,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
139,139,1140,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
143,143,1144,B.E.,Computer Science Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science Engineering C, Java, CP..."
