### Importing Libraries

In [1]:
#to access datasets
import pandas as pd
import numpy as np


#for encoding 
from sklearn.preprocessing import LabelEncoder

### Accessing Data

In [2]:
#accessing the user profile dataset
user_profile = pd.read_csv('cleanedDatasets/cleaned_user_profile.csv')
user_profile.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [3]:
#accessing the courses dataset
courses = pd.read_csv('cleanedDatasets/cleaned_course_dataset.csv')
courses.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [4]:
#accessing the user rating dataset
user_rating = pd.read_csv('cleanedDatasets/cleaned_user_ratings.csv')
user_rating.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Encoding Data Filtering

Purpose: Content-based recommender systems focus on item attributes and content. They recommend items similar to those a user has liked in the past based on shared features. 

#### Encoding the data as neccessary

Why Encode Categorical Variables?
- Categorical variables (like degree, specialization, and known languages) need to be encoded because most machine learning algorithms work with numerical data.
- Encoding converts categorical attributes into numerical representations, allowing algorithms to process them effectively.

##### User Profile

In [5]:
#encoding the user profile dataset using label encoding for degree_1
le = LabelEncoder()
user_profile['degree_1'] = le.fit_transform(user_profile['degree_1'])
user_profile.head()


Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,0,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,0,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,0,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,0,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,0,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [6]:
#encoding the user profile dataset using one-hot encoding for degree_1_specializations 
user_profile = pd.get_dummies(user_profile, columns=['degree_1_specializations'])
user_profile.head()


Unnamed: 0,userid,degree_1,known_languages,key_skills,career_objective,key_skills_str,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",True,False,False
1,1002,0,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",True,False,False
2,1003,0,['Unknown'],['Unknown'],Unknown,Unknown,True,False,False
3,1004,0,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",True,False,False
4,1005,0,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",True,False,False


In [7]:
#encoding the user profile dataset using one-hot encoding for known_languages
user_profile = pd.get_dummies(user_profile, columns=['known_languages'])
user_profile.head()

Unnamed: 0,userid,degree_1,key_skills,career_objective,key_skills_str,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical,known_languages_[' Android-Studio; Blockchain'],known_languages_[' Android; Php'],...,known_languages_['enthusiasm-motivation; motivation; Technical-and-problem-solving; communication; commitment; patience; interpersonal; maths; problem-solving; chemistry; physics'],"known_languages_['hindi', 'marathi', 'english languages']","known_languages_['java', 'CPP']","known_languages_['java', 'image processing']",known_languages_['java; NodeJS; HTML; MS-SQL'],known_languages_['java; NodeJS; jav; HTML; MS-SQL'],known_languages_['mysql; Javascript'],known_languages_['no'],"known_languages_['python', 'C']","known_languages_['python', 'Html']"
0,1001,0,"['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1002,0,"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1003,0,['Unknown'],Unknown,Unknown,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1004,0,"['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1005,0,"['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


##### Course Datset

In [8]:
#encoding the course dataset using  label encoding for degree_1
courses['degree_1'] = le.fit_transform(courses['degree_1'])
courses.head()


Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,0,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,0,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,0,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,0,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,0,Mechanical,MITCOE,['CATIA'],CATIA


In [9]:
#encoding the course dataset using one-hot encoding for degree_1_specializations
courses = pd.get_dummies(courses, columns=['degree_1_specializations'])
courses.head()

Unnamed: 0,sr_,degree_1,campus,key_skills,key_skills_str,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,MITCOE,['CATIA'],CATIA,False,False,False,True
1,1002,0,MITCOE,['CATIA'],CATIA,False,False,False,True
2,1003,0,MITAOE,['CATIA'],CATIA,False,False,False,True
3,1004,0,MITCOE,['CATIA'],CATIA,False,False,False,True
4,1005,0,MITCOE,['CATIA'],CATIA,False,False,False,True


In [10]:
#encoding the course dataset using  label encoding for campus
courses['campus'] = le.fit_transform(courses['campus'])
courses.head()

Unnamed: 0,sr_,degree_1,campus,key_skills,key_skills_str,degree_1_specializations_Civil Engineering,degree_1_specializations_Computer Science Engineering,degree_1_specializations_Electronics Telecommunication Engineering,degree_1_specializations_Mechanical
0,1001,0,3,['CATIA'],CATIA,False,False,False,True
1,1002,0,3,['CATIA'],CATIA,False,False,False,True
2,1003,0,2,['CATIA'],CATIA,False,False,False,True
3,1004,0,3,['CATIA'],CATIA,False,False,False,True
4,1005,0,3,['CATIA'],CATIA,False,False,False,True


### Saving the datasets

In [11]:
courses.to_csv('cleanedDatasets/cleaned_encoded_course_dataset.csv', index=False)
user_profile.to_csv('cleanedDatasets/cleaned_encoded_user_profile_dataset.csv', index=False)