### Importing libraries

In [36]:
#to access datasets
import pandas as pd
import numpy as np

#for encoding 
from sklearn.preprocessing import LabelEncoder

#to combine features 
from scipy.sparse import hstack, csr_matrix
#for feature extraction and similarity calculation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel


### Accessing the datasets

In [37]:
#accessing the user profile dataset
user_profile = pd.read_csv('cleanedDatasets/cleaned_user_profile.csv')
user_profile.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001.0,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002.0,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003.0,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004.0,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005.0,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [38]:
#accessing the courses dataset
courses = pd.read_csv('cleanedDatasets/cleaned_course_dataset.csv')
courses.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [39]:
#accessing the user rating dataset
user_rating = pd.read_csv('cleanedDatasets/cleaned_user_ratings.csv')
user_rating.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Encoding Data Filtering

Purpose: Content-based recommender systems focus on item attributes and content. They recommend items similar to those a user has liked in the past based on shared features.

#### Encoding the data as neccessary

Why Encode Categorical Variables?
- Categorical variables (like degree, specialization, and known languages) need to be encoded because most machine learning algorithms work with numerical data.
- Encoding converts categorical attributes into numerical representations, allowing algorithms to process them effectively.

##### User Profile

In [40]:
#encoding the user profile dataset using label encoding for degree_1
le = LabelEncoder()
user_profile['degree_1'] = le.fit_transform(user_profile['degree_1'])
user_profile.head()


Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001.0,0,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002.0,0,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003.0,0,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004.0,0,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005.0,0,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [41]:
#encoding the user profile dataset using one-hot encoding for degree_1_specializations 
user_profile = pd.get_dummies(user_profile, columns=['degree_1_specializations'])
user_profile.head()


Unnamed: 0,userid,degree_1,known_languages,key_skills,career_objective,key_skills_str,degree_1_specializations_Computer Science Engineering
0,1001.0,0,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",True
1,1002.0,0,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",True
2,1003.0,0,['Unknown'],['Unknown'],Unknown,Unknown,True
3,1004.0,0,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",True
4,1005.0,0,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",True


In [42]:
#encoding the user profile dataset using one-hot encoding for known_languages
user_profile = pd.get_dummies(user_profile, columns=['known_languages'])
user_profile.head()

Unnamed: 0,userid,degree_1,key_skills,career_objective,key_skills_str,degree_1_specializations_Computer Science Engineering,known_languages_[' Android-Studio; Blockchain'],known_languages_[' Android; Php'],known_languages_[' java; NodeJS; CS; MS-SQL'],known_languages_[' java; NodeJS; HTML; MS-SQL'],...,known_languages_['enthusiasm-motivation; motivation; Technical-and-problem-solving; communication; commitment; patience; interpersonal; maths; problem-solving; chemistry; physics'],"known_languages_['hindi', 'marathi', 'english languages']","known_languages_['java', 'CPP']","known_languages_['java', 'image processing']",known_languages_['java; NodeJS; HTML; MS-SQL'],known_languages_['java; NodeJS; jav; HTML; MS-SQL'],known_languages_['mysql; Javascript'],known_languages_['no'],"known_languages_['python', 'C']","known_languages_['python', 'Html']"
0,1001.0,0,"['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1002.0,0,"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1003.0,0,['Unknown'],Unknown,Unknown,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1004.0,0,"['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1005.0,0,"['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


##### Course Datset

In [43]:
#encoding the course dataset using  label encoding for degree_1
courses['degree_1'] = le.fit_transform(courses['degree_1'])
courses.head()


Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,0,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,0,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,0,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,0,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,0,Mechanical,MITCOE,['CATIA'],CATIA


In [44]:
#encoding the course dataset using one-hot encoding for degree_1_specializations
courses = pd.get_dummies(courses, columns=['degree_1_specializations'])
courses.head()

Unnamed: 0,sr_,degree_1,campus,key_skills,key_skills_str,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,MITCOE,['CATIA'],CATIA,False,False,False,True
1,1002,0,MITCOE,['CATIA'],CATIA,False,False,False,True
2,1003,0,MITAOE,['CATIA'],CATIA,False,False,False,True
3,1004,0,MITCOE,['CATIA'],CATIA,False,False,False,True
4,1005,0,MITCOE,['CATIA'],CATIA,False,False,False,True


In [45]:
#encoding the course dataset using  label encoding for campus
courses['campus'] = le.fit_transform(courses['campus'])
courses.head()

Unnamed: 0,sr_,degree_1,campus,key_skills,key_skills_str,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,3,['CATIA'],CATIA,False,False,False,True
1,1002,0,3,['CATIA'],CATIA,False,False,False,True
2,1003,0,2,['CATIA'],CATIA,False,False,False,True
3,1004,0,3,['CATIA'],CATIA,False,False,False,True
4,1005,0,3,['CATIA'],CATIA,False,False,False,True


In [46]:
courses.to_csv('cleanedDatasets/cleaned_encoded_course_dataset.csv', index=False)
user_profile.to_csv('cleanedDatasets/cleaned_encoded_user_profile_dataset.csv', index=False)

### Implementing the Recommendation Algorithm

#### Recommendations based on the Course Dataset

Testing using the course dataset alone - Implies that the recommendations will be driven by course features.

##### Creating a new dataset

In [47]:
#creating a new dataset for the courses using only the features required for the recommendation system
courses_ds_cb = pd.DataFrame()

#using existing features from the courses dataset
courses_ds_cb['sr_'] = courses['sr_'] # taking the course id
courses_ds_cb['degree_1'] = courses['degree_1'] # taking the degree_1

#to include those one-hot encoded columns
specialization_columns = [col for col in courses.columns if 'degree_1_specializations' in col]
for col in specialization_columns:
    courses_ds_cb[col] = courses[col]

#courses_ds_cb['campus'] = courses['campus'] # taking the campus -----> not that essential
courses_ds_cb['key_skills_str'] = courses['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

courses_ds_cb.head()
#By combining relevant features, a consolidated representation is created that captures essential information for the recommendation system.

Unnamed: 0,sr_,degree_1,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical,key_skills_str
0,1001,0,False,False,False,True,CATIA
1,1002,0,False,False,False,True,CATIA
2,1003,0,False,False,False,True,CATIA
3,1004,0,False,False,False,True,CATIA
4,1005,0,False,False,False,True,CATIA


##### Transforming the information into a numerical format

In [48]:
# transforms textual information into a numerical format that can be used in similarity calculations.
#sr_ will not be vectorized since it is a unique identifier for each course
#degree_1 is already encoded using label encoding therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF
#degree_1_specializations are already one-hot encoded therefore, the categorical data is already converted into a numeric format. Thefore, no need to apply the TF-IDF

#key_skills_str - text data therefore, TF-IDF will be applied -to convert this unstructured text into a structured numeric form that can be used to calculate the similarity between courses based on the key skills they teach
#apply the TF-IDF vectorizer to the key_skills_str column

tfidf_vectorizer = TfidfVectorizer()

#apply the TF-IDF vectorizer to the key_skills_str textual data
tfidf_matrix = tfidf_vectorizer.fit_transform(courses_ds_cb['key_skills_str'])

In [49]:
#combining the encoded features and the TF-IDF 
# Convert the label-encoded 'degree_1' column to a CSR matrix
degree_matrix = csr_matrix(courses_ds_cb[['degree_1']].values)

# Convert the one-hot encoded specialization columns to a CSR matrix
specialization_matrix = csr_matrix(courses_ds_cb[specialization_columns].values)

# Combine the TF-IDF vectors with the label-encoded degree matrix and the one-hot encoded specialization matrix
combined_features = hstack([tfidf_matrix, degree_matrix, specialization_matrix])


##### Calculating the similarity 

In [50]:
# Calculate the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(combined_features)
cosine_sim_matrix 
# cosine_sim_matrix is a square matrix with courses as both rows and columns, where each element represents the similarity between courses.

array([[1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.44118456,
        0.44118456],
       [0.        , 0.        , 0.        , ..., 0.44118456, 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.44118456, 1.        ,
        1.        ]])

##### Getting recommendations

In [51]:
# Function to get course recommendations based on cosine similarity
def get_course_recommendations(course_id, cosine_sim_matrix, courses_df, top_n=5):
    # Get the index of the course that matches the course_id
    course_idx = courses_df.index[courses_df['sr_'] == course_id].tolist()[0]
    
    # Get the pairwsie similarity scores of all courses with that course
    sim_scores = list(enumerate(cosine_sim_matrix[course_idx]))
    
    # Sort the courses based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top-n most similar courses
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the course indices
    course_indices = [i[0] for i in sim_scores]
    
    # Return the top-n most similar courses
    return courses_df['sr_'].iloc[course_indices]

def get_recommendations(course_id):
    #to make it more flexible in the number of recommendations to get as well as more modular 
    recommendations = get_course_recommendations(course_id, cosine_sim_matrix, courses_ds_cb)
    return recommendations




In [52]:
get_recommendations(1003)

1    1002
2    1003
3    1004
4    1005
5    1006
Name: sr_, dtype: int64

In [53]:
get_recommendations(10996)

172    1173
209    1210
212    1213
213    1214
214    1215
Name: sr_, dtype: int64

#### Recommendations based on the User Profile Dataset

##### Creating a new dataset

In [55]:
user_profile_ds_cb = pd.DataFrame()
#taking the features of the user that are required for the recommendation system and are considered for the courses. 
user_profile_ds_cb['userid'] = user_profile['userid'] # taking the user id
user_profile_ds_cb['degree_1'] = user_profile['degree_1'] # taking the degree_1


# For degree_1_specializations, include the one-hot encoded columns if already applied
specialization_columns = [col for col in user_profile.columns if 'degree_1_specializations' in col]
user_profile_ds_cb = user_profile_ds_cb.join(user_profile[specialization_columns])


#user_profile_ds_cb['known_languages'] = user_profile['known_languages'] # taking the known_languages ----> not that essential as the courses don't require it 
user_profile_ds_cb['key_skills_str'] = user_profile['key_skills_str'] # taking the key_skills_str and avoiding to take the list since there is already a column containing strings

user_profile_ds_cb.head()

Unnamed: 0,userid,degree_1,degree_1_specializations_Computer Science Engineering,key_skills_str
0,1001.0,0,True,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002.0,0,True,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003.0,0,True,Unknown
3,1004.0,0,True,"XML, C, Java, Data Structures, Python, Mo..."
4,1005.0,0,True,"XML, Word, Data Structures, Communication, ..."
