### Importing libraries

In [1]:
import pandas as pd

#to be able to implement a content-based recommendation system
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

#the test train split to measure the accuracy of the model
from sklearn.model_selection import train_test_split


### Accessing the datasets

In [7]:
#accessing the user profile dataset from the cleanedDatasets folder
user_profile_ds = pd.read_csv('cleanedDatasets/cleaned_user_profile.csv')
user_profile_ds.head()


Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [8]:
#accessing the courses dataset from the cleanedDatasets folder
courses_ds = pd.read_csv('cleanedDatasets/cleaned_course_dataset.csv')
courses_ds.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [10]:
user_rating_ds = pd.read_csv('cleanedDatasets/cleaned_user_ratings.csv')
user_rating_ds.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Creating New Dataframes

In [13]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cb_user_dataset = pd.DataFrame()
cb_user_dataset['userid'] = user_profile_ds['userid']
cb_user_dataset['degree_1'] = user_profile_ds['degree_1']
cb_user_dataset['degree_1_specializations'] = user_profile_ds['degree_1_specializations']
cb_user_dataset['career_objective'] = user_profile_ds['career_objective']
cb_user_dataset['key_skills_str'] = user_profile_ds['key_skills_str']
cb_user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [14]:
#store the career objective and key skills in description
cb_user_dataset['description'] = cb_user_dataset['degree_1'] + cb_user_dataset['degree_1_specializations'] + cb_user_dataset['career_objective'] + cb_user_dataset['key_skills_str']
cb_user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E.Computer Science EngineeringComputer Engin...
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E.Computer Science EngineeringInterested in ...
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E.Computer Science EngineeringUnknownUnknown
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E.Computer Science EngineeringCurrently a fi...
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E.Computer Science EngineeringTo have a grow...


In [15]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cb_course_dataset = pd.DataFrame()
cb_course_dataset['sr_'] = courses_ds['sr_']
cb_course_dataset['campus'] = courses_ds['campus'] 
cb_course_dataset['degree_1'] = courses_ds['degree_1']
cb_course_dataset['degree_1_specializations'] = courses_ds['degree_1_specializations']
cb_course_dataset['key_skills_str'] = courses_ds['key_skills_str']
cb_course_dataset.head()

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str
0,1001,MITCOE,B.E.,Mechanical,CATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA


In [16]:
#store the career objective and key skills in description
cb_course_dataset['description'] = cb_course_dataset['degree_1'] + cb_course_dataset['degree_1_specializations'] + cb_course_dataset['key_skills_str']
cb_course_dataset.head()

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E. MechanicalCATIA


### Content-Based Recommendations

In [29]:
#combining the descirption of the user and the course in a new pandas dataframe
cbf_combined_data_ds = pd.DataFrame()
cbf_combined_data_ds['description'] = cb_user_dataset['description']+ cb_course_dataset['description']
cbf_combined_data_ds.head()

Unnamed: 0,description
0,B.E.Computer Science EngineeringComputer Engin...
1,B.E.Computer Science EngineeringInterested in ...
2,B.E.Computer Science EngineeringUnknownUnknown...
3,B.E.Computer Science EngineeringCurrently a fi...
4,B.E.Computer Science EngineeringTo have a grow...


In [31]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(cbf_combined_data_ds['description'].values.astype('U'))


In [32]:
#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)


In [33]:
#create indices for the courses is using series
cf_course_dataset = cb_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

In [34]:
#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [35]:
#getting the similar course recs for user 1001
get_course_cf_recommendations(1001).head(10)

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
842,842,1843,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E. MechanicalAutoCAD, PROE"
941,941,1942,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
86,86,1087,MITCOE,B.E.,Mechanical,AUTOCAD,B.E. MechanicalAUTOCAD
886,886,1887,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
230,230,1231,MITCOE,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E. Electronics Telecommunication Engineerin...
143,143,1144,MITAOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
926,926,1927,MITAOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
188,188,1189,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
117,117,1118,MITCOE,B.E.,Computer Science Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E. Computer Science EngineeringC, Java, CPP,..."
84,84,1085,MITAOE,B.E.,Electronics Telecommunication Engineering,"C,JAVA",B.E. Electronics Telecommunication Engineerin...


### Evaluating performance 

In [17]:
#combining the two dataframes
combined_dataframes = pd.DataFrame()
combined_dataframes['description'] = cb_user_dataset['description'] + cb_course_dataset['description'] 

#Including specific details like degree, specializations, and key skills separately allows for a more structured comparison between users and courses. 
#Having these details explicitly listed makes it easier to interpret the recommendation logic and evaluate the system's performance. For instance, when assessing the effectiveness of the recommendations, it's straightforward to analyze how well the system matched users and courses based on specific criteria like degree compatibility or skill alignment.
combined_dataframes["sr_"] = cb_course_dataset['sr_']
combined_dataframes["degree_1"] = cb_course_dataset['degree_1']
combined_dataframes["degree_1_specializations"] = cb_course_dataset['degree_1_specializations']
combined_dataframes["key_skills_str"] = cb_course_dataset['key_skills_str']

#listing the head
combined_dataframes.head()

Unnamed: 0,description,sr_,degree_1,degree_1_specializations,key_skills_str
0,B.E.Computer Science EngineeringComputer Engin...,1001,B.E.,Mechanical,CATIA
1,B.E.Computer Science EngineeringInterested in ...,1002,B.E.,Mechanical,CATIA
2,B.E.Computer Science EngineeringUnknownUnknown...,1003,B.E.,Mechanical,CATIA
3,B.E.Computer Science EngineeringCurrently a fi...,1004,B.E.,Mechanical,CATIA
4,B.E.Computer Science EngineeringTo have a grow...,1005,B.E.,Mechanical,CATIA


#### Loading and preprocessing the data

In [19]:
#assigns the combined dataset created from both the user and course datasets to a new variable
data = combined_dataframes

#the description column from the combined dataframes is convered into a list. 
#This list is essentially a collection of strings, where each string is a concatenated description containing user details 
#(degree, specialization, career objective, key skills) and course details (degree, specialization, key skills). 
#This corpus is used for text analysis, likely to vectorize the descriptions for similarity measurements as part of the recommendation process.
#Converting the descriptions into a list format simplifies further text processing tasks, such as vectorization or feature extraction.
corpus = combined_dataframes["description"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)

#### Training and Testing the model

In [20]:
X_train, X_test = train_test_split(data, test_size=0.2)

In [21]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(X_train["description"].values.astype('U'))

In [22]:
#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

In [24]:
#generate recommendations for each test user
recommendations = []

#user_dataset = llm_user_dataset.dropna()

for i in range(len(cb_user_dataset)):
    #user_input = cb_user_dataset.iloc[i]["description"]
    user_index = cb_user_dataset.iloc[i]["description"] #.index(user_input)
    recommended_items = course_cosine_sim[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

In [26]:
#Compute precision and recall for key skills
relevant_items = []
for i in range(len(cb_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["key_skills_str"] == X_test.iloc[i]["key_skills_str"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Key Skills: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")


Evaluation for Key Skills: 
Accuracy: 0.002
Precision: 0.108
Recall: 0.001
F1 Score: 0.002


In [27]:
#Compute precision and recall for degree spec
relevant_items = []
for i in range(len(cb_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1_specializations"] == X_test.iloc[i]["degree_1_specializations"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Degree Specializations: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")


Evaluation for Degree Specializations: 
Accuracy: 0.001
Precision: 0.320
Recall: 0.001
F1 Score: 0.001


In [28]:
#Compute precision and recall for degree
relevant_items = []
for i in range(len(cb_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1"] == X_test.iloc[i]["degree_1"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print("Evaluation for Degree: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Evaluation for Degree: 
Accuracy: 0.001
Precision: 0.908
Recall: 0.001
F1 Score: 0.001
